[RFC PATCH v5 8/9] sched/fair: Steal work from an overloaded CPU when CPU goes idle

Chen Jinghuang posted 9 patches 2 weeks, 3 days ago
[RFC PATCH v5 8/9] sched/fair: Steal work from an overloaded CPU when CPU goes idle
Posted by Chen Jinghuang 2 weeks, 3 days ago
From: Steve Sistare <steven.sistare@oracle.com>

When a CPU has no more CFS tasks to run, and idle_balance() fails to find a
task, then attempt to steal a task from an overloaded CPU in the same LLC,
using the cfs_overload_cpus bitmap to efficiently identify candidates.  To
minimize search time, steal the first migratable task that is found when
the bitmap is traversed.  For fairness, search for migratable tasks on an
overloaded CPU in order of next to run.

This simple stealing yields a higher CPU utilization than idle_balance()
alone, because the search is cheap, so it may be called every time the CPU
is about to go idle.  idle_balance() does more work because it searches
widely for the busiest queue, so to limit its CPU consumption, it declines
to search if the system is too busy.  Simple stealing does not offload the
globally busiest queue, but it is much better than running nothing at all.

Stealing is controlled by the sched feature SCHED_STEAL, which is enabled
by default. Note that all test results presented below are based on the 
NO_DELAY_DEQUEUE implementation.

Stealing improves utilization with only a modest CPU overhead in scheduler
code.  In the following experiment, hackbench is run with varying numbers
of groups (40 tasks per group), and the delta in /proc/schedstat is shown
for each run, averaged per CPU, augmented with these non-standard stats:

  steal - number of times a task is stolen from another CPU.

X6-2: 2 socket * 40 cores * 2 hyperthreads = 160 CPUs
Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz
hackbench <grps> process 100000

  baseline
  grps  time   %busy  sched   idle    wake   steal
  1     2.182  20.00  35876   17905   17958  0
  2     2.391  39.00  67753   33808   33921  0
  3     2.871  47.00  100944  48966   51538  0
  4     2.928  62.00  114489  55171   59059  0
  8     4.852  83.00  219907  92961   121703 0

  new
  grps  time   %busy  sched   idle    wake   steal   %speedup
  1     2.229  18.00  45450   22691   22751  52      -2.1
  2     2.123  40.00  49975   24977   24990  6       12.6
  3     2.690  61.00  56118   22641   32780  9073    6.7
  4     2.828  80.00  37927   12828   24165  8442    3.5
  8     4.120  95.00  85929   8613    57858  11098   17.8

Elapsed time improves by up to 17.8%, and CPU busy utilization is up
by 1 to 18 percentage points, hitting 95% at peak load.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
---
 kernel/sched/fair.c     | 174 ++++++++++++++++++++++++++++++++++++++--
 kernel/sched/features.h |   6 ++
 2 files changed, 174 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0bf6d18dac05..500215a57392 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5092,6 +5092,9 @@ static void overload_clear(struct rq *rq)
 {
 	struct sparsemask *overload_cpus;
 
+	if (!sched_feat(STEAL))
+		return;
+
 	rcu_read_lock();
 	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
 	if (overload_cpus)
@@ -5103,17 +5106,29 @@ static void overload_set(struct rq *rq)
 {
 	struct sparsemask *overload_cpus;
 
+	if (!sched_feat(STEAL))
+		return;
+
 	rcu_read_lock();
 	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
 	if (overload_cpus)
 		sparsemask_set_elem(overload_cpus, rq->cpu);
 	rcu_read_unlock();
 }
+
+static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+
 #else /* CONFIG_SMP */
 static inline void rq_idle_stamp_update(struct rq *rq) {}
 static inline void rq_idle_stamp_clear(struct rq *rq) {}
 static inline void overload_clear(struct rq *rq) {}
 static inline void overload_set(struct rq *rq) {}
+
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf)
+{
+	return 0;
+}
+
 #endif
 
 void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
@@ -9024,21 +9039,24 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 idle:
 	if (rf) {
 		/*
-		 * We must set idle_stamp _before_ calling idle_balance(), such that we
-		 * measure the duration of idle_balance() as idle time.
+		 * We must set idle_stamp _before_ calling try_steal() or
+		 * sched_balance_newidle(), such that we measure the duration
+		 * as idle time.
 		 */
 		rq_idle_stamp_update(rq);
 
 		new_tasks = sched_balance_newidle(rq, rf);
+		if (new_tasks == 0)
+			new_tasks = try_steal(rq, rf);
 
 		if (new_tasks)
 			rq_idle_stamp_clear(rq);
 
 		/*
-		 * Because sched_balance_newidle() releases (and re-acquires)
-		 * rq->lock, it is possible for any higher priority task to
-		 * appear. In that case we must re-start the pick_next_entity()
-		 * loop.
+		 * Because try_steal() and sched_balance_newidle() release
+		 * (and re-acquire) rq->lock, it is possible for any higher priority
+		 * task to appear. In that case we must re-start the
+		 * pick_next_entity() loop.
 		 */
 		if (new_tasks < 0)
 			return RETRY_TASK;
@@ -13133,6 +13151,150 @@ void sched_balance_trigger(struct rq *rq)
 	nohz_balancer_kick(rq);
 }
 
+/*
+ * Search the runnable tasks in @cfs_rq in order of next to run, and find
+ * the first one that can be migrated to @dst_rq.  @cfs_rq is locked on entry.
+ * On success, dequeue the task from @cfs_rq and return it, else return NULL.
+ */
+static struct task_struct *
+detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq)
+{
+	int dst_cpu = dst_rq->cpu;
+	struct task_struct *p;
+	struct rq *rq = rq_of(cfs_rq);
+
+	lockdep_assert_rq_held(rq);
+
+	list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) {
+		if (can_migrate_task_llc(p, rq, dst_rq)) {
+			detach_task_steal(p, rq, dst_cpu);
+			return p;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Attempt to migrate a CFS task from @src_cpu to @dst_rq.  @locked indicates
+ * whether @dst_rq is already locked on entry.  This function may lock or
+ * unlock @dst_rq, and updates @locked to indicate the locked state on return.
+ * The locking protocol is based on idle_balance().
+ * Returns 1 on success and 0 on failure.
+ */
+static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
+		      int src_cpu)
+{
+	struct task_struct *p;
+	struct rq_flags rf;
+	int stolen = 0;
+	int dst_cpu = dst_rq->cpu;
+	struct rq *src_rq = cpu_rq(src_cpu);
+
+	if (dst_cpu == src_cpu || src_rq->cfs.h_nr_runnable < 2)
+		return 0;
+
+	if (*locked) {
+		rq_unpin_lock(dst_rq, dst_rf);
+		raw_spin_rq_unlock(dst_rq);
+		*locked = false;
+	}
+	rq_lock_irqsave(src_rq, &rf);
+	update_rq_clock(src_rq);
+
+	if (src_rq->cfs.h_nr_runnable < 2 || !cpu_active(src_cpu))
+		p = NULL;
+	else
+		p = detach_next_task(&src_rq->cfs, dst_rq);
+
+	rq_unlock(src_rq, &rf);
+
+	if (p) {
+		raw_spin_rq_lock(dst_rq);
+		rq_repin_lock(dst_rq, dst_rf);
+		*locked = true;
+		update_rq_clock(dst_rq);
+		attach_task(dst_rq, p);
+		stolen = 1;
+	}
+	local_irq_restore(rf.flags);
+
+	return stolen;
+}
+
+/*
+ * Conservative upper bound on the max cost of a steal, in nsecs (the typical
+ * cost is 1-2 microsec).  Do not steal if average idle time is less.
+ */
+#define SCHED_STEAL_COST 10000
+
+/*
+ * Try to steal a runnable CFS task from a CPU in the same LLC as @dst_rq,
+ * and migrate it to @dst_rq.  rq_lock is held on entry and return, but
+ * may be dropped in between.  Return 1 on success, 0 on failure, and -1
+ * if a task in a different scheduling class has become runnable on @dst_rq.
+ */
+static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
+{
+	int src_cpu;
+	int dst_cpu = dst_rq->cpu;
+	bool locked = true;
+	int stolen = 0;
+	struct sparsemask *overload_cpus;
+
+	if (!sched_feat(STEAL))
+		return 0;
+
+	if (!cpu_active(dst_cpu))
+		return 0;
+
+	if (dst_rq->avg_idle < SCHED_STEAL_COST)
+		return 0;
+
+	/* Get bitmap of overloaded CPUs in the same LLC as @dst_rq */
+
+	rcu_read_lock();
+	overload_cpus = rcu_dereference(dst_rq->cfs_overload_cpus);
+	if (!overload_cpus) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+#ifdef CONFIG_SCHED_SMT
+	/*
+	 * First try overloaded CPUs on the same core to preserve cache warmth.
+	 */
+	if (static_branch_likely(&sched_smt_present)) {
+		for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) {
+			if (sparsemask_test_elem(overload_cpus, src_cpu) &&
+			    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
+				stolen = 1;
+				goto out;
+			}
+		}
+	}
+#endif	/* CONFIG_SCHED_SMT */
+
+	/* Accept any suitable task in the LLC */
+
+	sparsemask_for_each(overload_cpus, dst_cpu, src_cpu) {
+		if (steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
+			stolen = 1;
+			goto out;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+	if (!locked) {
+		raw_spin_rq_lock(dst_rq);
+		rq_repin_lock(dst_rq, dst_rf);
+	}
+	stolen |= (dst_rq->cfs.h_nr_runnable > 0);
+	if (dst_rq->nr_running != dst_rq->cfs.h_nr_runnable)
+		stolen = -1;
+	return stolen;
+}
+
 static void rq_online_fair(struct rq *rq)
 {
 	update_sysctl();
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 136a6584be79..e8c3e19bf585 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -87,6 +87,12 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_UTIL, true)
 
+/*
+ * Steal a CFS task from another CPU when going idle.
+ * Improves CPU utilization.
+ */
+SCHED_FEAT(STEAL, true)
+
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
  * in a single rq->lock section. Default disabled because the
-- 
2.34.1