[RFC][PATCH 7/8] sched: Remove sched_class::pick_next_task()

The reason pick_next_task_fair() exists is the put/set optimization
that avoids touching the common ancestors of prev and next in the
cgroup hierarchy. However, it is possible to implement this
optimization in the put_prev_task() and set_next_task() methods
themselves, as used by put_prev_set_next_task().

Notably, put_prev_set_next_task() (sketched below) is the only site that:

 - calls put_prev_task() with a .next argument;
 - calls set_next_task() with .first = true.
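
As a reference point, that helper reads roughly as follows -- an
illustrative sketch of the pre-existing kernel/sched/sched.h helper,
with the dl_server and debug details elided:

	static inline void
	put_prev_set_next_task(struct rq *rq, struct task_struct *prev,
			       struct task_struct *next)
	{
		if (next == prev)
			return;

		/* the only put_prev_task() call with a non-NULL @next */
		prev->sched_class->put_prev_task(rq, prev, next);

		/* the only set_next_task() call with .first == true */
		next->sched_class->set_next_task(rq, next, true);
	}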

This means that put_prev_task() can determine the common part of the
hierarchy and stop there, after which set_next_task() can terminate
where put_prev_task() stopped.
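
For illustration, consider a hypothetical two-level hierarchy where
@prev runs in cgroup /A/B and the picked @next sits in /A/C:

	        root cfs_rq        curr == A.se, never put
	             |
	         A cfs_rq          curr: B.se -> C.se
	         /       \
	    B cfs_rq    C cfs_rq
	        |          |
	      prev       next

Per the new put_prev_task_fair() / set_next_task_fair() below,
put_prev_task_fair() puts prev.se on B and B.se on A, then stops
because is_same_group(B.se, C.se) holds; set_next_task_fair() sets
next.se on C and C.se on A, then stops because root's ->curr is still
set. A.se and everything above it is never touched.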

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   27 +++------
 kernel/sched/fair.c  |  153 ++++++++++++++-------------------------------------
 kernel/sched/sched.h |   14 ----
 3 files changed, 52 insertions(+), 142 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5924,16 +5924,15 @@ __pick_next_task(struct rq *rq, struct t
 	if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
 		   rq->nr_running == rq->cfs.h_nr_queued)) {
 
-		p = pick_next_task_fair(rq, prev, rf);
+		p = pick_task_fair(rq, rf);
 		if (unlikely(p == RETRY_TASK))
 			goto restart;
 
 		/* Assume the next prioritized class is idle_sched_class */
-		if (!p) {
+		if (!p)
 			p = pick_task_idle(rq, rf);
-			put_prev_set_next_task(rq, prev, p);
-		}
 
+		put_prev_set_next_task(rq, prev, p);
 		return p;
 	}
 
@@ -5941,20 +5940,12 @@ __pick_next_task(struct rq *rq, struct t
 	prev_balance(rq, prev, rf);
 
 	for_each_active_class(class) {
-		if (class->pick_next_task) {
-			p = class->pick_next_task(rq, prev, rf);
-			if (unlikely(p == RETRY_TASK))
-				goto restart;
-			if (p)
-				return p;
-		} else {
-			p = class->pick_task(rq, rf);
-			if (unlikely(p == RETRY_TASK))
-				goto restart;
-			if (p) {
-				put_prev_set_next_task(rq, prev, p);
-				return p;
-			}
+		p = class->pick_task(rq, rf);
+		if (unlikely(p == RETRY_TASK))
+			goto restart;
+		if (p) {
+			put_prev_set_next_task(rq, prev, p);
+			return p;
 		}
 	}
 
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8891,7 +8891,7 @@ static void wakeup_preempt_fair(struct r
 	resched_curr_lazy(rq);
 }
 
-static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
+struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
 	__must_hold(__rq_lockp(rq))
 {
 	struct sched_entity *se;
@@ -8934,91 +8934,6 @@ static struct task_struct *pick_task_fai
 	return NULL;
 }
 
-static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
-
-struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
-	__must_hold(__rq_lockp(rq))
-{
-	struct sched_entity *se;
-	struct task_struct *p;
-	int new_tasks;
-
-again:
-	p = pick_task_fair(rq, rf);
-	if (!p)
-		goto idle;
-	se = &p->se;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (prev->sched_class != &fair_sched_class)
-		goto simple;
-
-	__put_prev_set_next_dl_server(rq, prev, p);
-
-	/*
-	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
-	 * likely that a next task is from the same cgroup as the current.
-	 *
-	 * Therefore attempt to avoid putting and setting the entire cgroup
-	 * hierarchy, only change the part that actually changes.
-	 *
-	 * Since we haven't yet done put_prev_entity and if the selected task
-	 * is a different task than we started out with, try and touch the
-	 * least amount of cfs_rqs.
-	 */
-	if (prev != p) {
-		struct sched_entity *pse = &prev->se;
-		struct cfs_rq *cfs_rq;
-
-		while (!(cfs_rq = is_same_group(se, pse))) {
-			int se_depth = se->depth;
-			int pse_depth = pse->depth;
-
-			if (se_depth <= pse_depth) {
-				put_prev_entity(cfs_rq_of(pse), pse);
-				pse = parent_entity(pse);
-			}
-			if (se_depth >= pse_depth) {
-				set_next_entity(cfs_rq_of(se), se, true);
-				se = parent_entity(se);
-			}
-		}
-
-		put_prev_entity(cfs_rq, pse);
-		set_next_entity(cfs_rq, se, true);
-
-		__set_next_task_fair(rq, p, true);
-	}
-
-	return p;
-
-simple:
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-	put_prev_set_next_task(rq, prev, p);
-	return p;
-
-idle:
-	if (rf) {
-		new_tasks = sched_balance_newidle(rq, rf);
-
-		/*
-		 * Because sched_balance_newidle() releases (and re-acquires)
-		 * rq->lock, it is possible for any higher priority task to
-		 * appear. In that case we must re-start the pick_next_entity()
-		 * loop.
-		 */
-		if (new_tasks < 0)
-			return RETRY_TASK;
-
-		if (new_tasks > 0)
-			goto again;
-	}
-
-	return NULL;
-}
-
 static struct task_struct *
 fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
 	__must_hold(__rq_lockp(dl_se->rq))
@@ -9042,10 +8957,28 @@ static void put_prev_task_fair(struct rq
 {
 	struct sched_entity *se = &prev->se;
 	struct cfs_rq *cfs_rq;
+	struct sched_entity *nse = NULL;
 
-	for_each_sched_entity(se) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (next && next->sched_class == &fair_sched_class)
+		nse = &next->se;
+#endif
+
+	while (se) {
 		cfs_rq = cfs_rq_of(se);
-		put_prev_entity(cfs_rq, se);
+		if (!nse || cfs_rq->curr)
+			put_prev_entity(cfs_rq, se);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		if (nse) {
+			if (is_same_group(se, nse))
+				break;
+			if (nse->depth >= se->depth)
+				nse = parent_entity(nse);
+			if (nse->depth > se->depth)
+				continue;
+		}
+#endif
+		se = parent_entity(se);
 	}
 }
 
@@ -13566,10 +13499,30 @@ static void switched_to_fair(struct rq *
 	}
 }
 
-static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+/*
+ * Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 {
 	struct sched_entity *se = &p->se;
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) &&
+		    first && cfs_rq->curr)
+			break;
+
+		set_next_entity(cfs_rq, se, true);
+		/* ensure bandwidth has been allocated on our new cfs_rq */
+		account_cfs_rq_runtime(cfs_rq, 0);
+	}
+
+	se = &p->se;
+
 	if (task_on_rq_queued(p)) {
 		/*
 		 * Move the next running task to the front of the list, so our
@@ -13589,27 +13542,6 @@ static void __set_next_task_fair(struct
 	sched_fair_update_stop_tick(rq, p);
 }
 
-/*
- * Account for a task changing its policy or group.
- *
- * This routine is mostly called to set cfs_rq->curr field when a task
- * migrates between groups/classes.
- */
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
-{
-	struct sched_entity *se = &p->se;
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-		set_next_entity(cfs_rq, se, first);
-		/* ensure bandwidth has been allocated on our new cfs_rq */
-		account_cfs_rq_runtime(cfs_rq, 0);
-	}
-
-	__set_next_task_fair(rq, p, first);
-}
-
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
@@ -13921,7 +13853,6 @@ DEFINE_SCHED_CLASS(fair) = {
 	.wakeup_preempt		= wakeup_preempt_fair,
 
 	.pick_task		= pick_task_fair,
-	.pick_next_task		= pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
 	.set_next_task          = set_next_task_fair,
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2538,17 +2538,6 @@ struct sched_class {
 	 * schedule/pick_next_task: rq->lock
 	 */
 	struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
-	/*
-	 * Optional! When implemented pick_next_task() should be equivalent to:
-	 *
-	 *   next = pick_task();
-	 *   if (next) {
-	 *       put_prev_task(prev);
-	 *       set_next_task_first(next);
-	 *   }
-	 */
-	struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
-					      struct rq_flags *rf);
 
 	/*
 	 * sched_change:
@@ -2761,8 +2750,7 @@ static inline bool sched_fair_runnable(s
 	return rq->cfs.nr_queued > 0;
 }
 
-extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
-					       struct rq_flags *rf);
+extern struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf);
 extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
 
 #define SCA_CHECK		0x01