[RFC][PATCH 6/6] sched: Cleanup sched_delayed handling for class switches

Peter Zijlstra posted 6 patches 3 weeks, 4 days ago
[RFC][PATCH 6/6] sched: Cleanup sched_delayed handling for class switches
Posted by Peter Zijlstra 3 weeks, 4 days ago
Use the new sched_class::switching_from() method to dequeue delayed
tasks before switching to another class.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   12 ++++++++----
 kernel/sched/ext.c      |   12 ++++--------
 kernel/sched/fair.c     |    7 +++++++
 kernel/sched/syscalls.c |    3 ---
 4 files changed, 19 insertions(+), 15 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7139,9 +7139,6 @@ void rt_mutex_setprio(struct task_struct
 	if (prev_class != next_class)
 		queue_flag |= DEQUEUE_CLASS;
 
-	if (prev_class != next_class && p->se.sched_delayed)
-		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
 	scoped_guard (sched_change, p, queue_flag) {
 		/*
 		 * Boosting condition are:
@@ -10530,8 +10527,15 @@ struct sched_change_ctx sched_change_beg
 
 	lockdep_assert_rq_held(rq);
 
-	if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+	if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from) {
+		/*
+		 * switching_from_fair() assumes CLASS implies NOCLOCK; fixing
+		 * this assumption would mean switching_from() would need to be
+		 * able to change flags.
+		 */
+		SCHED_WARN_ON(!(flags & DEQUEUE_NOCLOCK));
 		p->sched_class->switching_from(rq, p, flags);
+	}
 
 	struct sched_change_ctx ctx = {
 		.p = p,
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4485,7 +4485,7 @@ static void scx_ops_disable_workfn(struc
 
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
-		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
@@ -4493,9 +4493,7 @@ static void scx_ops_disable_workfn(struc
 		if (old_class != new_class)
 			queue_flags |= DEQUEUE_CLASS;
 
-		if (old_class != new_class && p->se.sched_delayed)
-			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
+		update_rq_clock(task_rq(p));
 		scoped_guard (sched_change, p, queue_flags) {
 			p->sched_class = new_class;
 		}
@@ -5202,7 +5200,7 @@ static int scx_ops_enable(struct sched_e
 	percpu_down_write(&scx_fork_rwsem);
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
-		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
@@ -5210,9 +5208,7 @@ static int scx_ops_enable(struct sched_e
 		if (old_class != new_class)
 			queue_flags |= DEQUEUE_CLASS;
 
-		if (old_class != new_class && p->se.sched_delayed)
-			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
+		update_rq_clock(task_rq(p));
 		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
 			p->scx.slice = SCX_SLICE_DFL;
 			p->sched_class = new_class;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13175,6 +13175,12 @@ static void attach_task_cfs_rq(struct ta
 	attach_entity_cfs_rq(se);
 }
 
+static void switching_from_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+	if ((flags & DEQUEUE_CLASS) && p->se.sched_delayed)
+		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+}
+
 static void switched_from_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	detach_task_cfs_rq(p);
@@ -13592,6 +13598,7 @@ DEFINE_SCHED_CLASS(fair) = {
 
 	.reweight_task		= reweight_task_fair,
 	.prio_changed		= prio_changed_fair,
+	.switching_from		= switching_from_fair,
 	.switched_from		= switched_from_fair,
 	.switched_to		= switched_to_fair,
 
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -701,9 +701,6 @@ int __sched_setscheduler(struct task_str
 	if (prev_class != next_class)
 		queue_flags |= DEQUEUE_CLASS;
 
-	if (prev_class != next_class && p->se.sched_delayed)
-		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
 	scoped_guard (sched_change, p, queue_flags) {
 
 		if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
Re: [RFC][PATCH 6/6] sched: Cleanup sched_delayed handling for class switches
Posted by Peter Zijlstra 3 weeks, 4 days ago
On Wed, Oct 30, 2024 at 04:13:01PM +0100, Peter Zijlstra wrote:

> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c

> @@ -5202,7 +5200,7 @@ static int scx_ops_enable(struct sched_e
>  	percpu_down_write(&scx_fork_rwsem);
>  	scx_task_iter_start(&sti);
>  	while ((p = scx_task_iter_next_locked(&sti))) {
> -		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
> +		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
>  		const struct sched_class *old_class = p->sched_class;
>  		const struct sched_class *new_class =
>  			__setscheduler_class(p->policy, p->prio);
> @@ -5210,9 +5208,7 @@ static int scx_ops_enable(struct sched_e
>  		if (old_class != new_class)
>  			queue_flags |= DEQUEUE_CLASS;
>  
> -		if (old_class != new_class && p->se.sched_delayed)
> -			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> -
> +		update_rq_clock(task_rq(p));
>  		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
>  			p->scx.slice = SCX_SLICE_DFL;
>  			p->sched_class = new_class;

TJ, strictly speaking you should probably do a __balance_callbacks() in
__scx_task_irq_rq_unlock(). The various switched_from() methods like to
queue_balance_callback().

I think you're currently good by the fact that only RT/DL do this, and
you're not touching those in these loops.
Re: [RFC][PATCH 6/6] sched: Cleanup sched_delayed handling for class switches
Posted by Tejun Heo 3 weeks, 4 days ago
On Wed, Oct 30, 2024 at 04:45:51PM +0100, Peter Zijlstra wrote:
> TJ, strictly speaking you should probably do a __balance_callbacks() in
> __scx_task_irq_rq_unlock(). The various switched_from() methods like to
> queue_balance_callback().
> 
> I think you're currently good by the fact that only RT/DL do this, and
> you're not touching those in these loops.

Ah, will send a patch soon.

Thanks.

-- 
tejun