[v1] sched: Pass correct scheduling policy to __setscheduler_class

[PATCH] sched: Pass correct scheduling policy to __setscheduler_class

Posted by Aboorva Devarajan 1 month ago

The function __setscheduler_class determines the appropriate
sched_class based on the scheduling policy and priority. Previously,
the function used the task's pointer to retrieve the scheduling policy,
which could lead to incorrect decisions if the task's struct had an
outdated policy. This behaviour where the task pointer may reference an
outdated policy when __setscheduler_class is called, was introduced in
commit 98442f0ccd82 ("sched: Fix delayed_dequeue vs switched_from_fair()")

To resolve this, corresponding scheduling policy is passed directly
to __setscheduler_class instead of relying on the task pointer's cached
policy. This ensures that the correct policy is always used when
determining the scheduling class.

-------------------------------------------------------
Before Patch:
-------------------------------------------------------

```
sched_ext # ./runner -t init_enable_count
===== START =====
TEST: init_enable_count
DESCRIPTION: Verify we do the correct amount of counting of init,
             enable, etc callbacks.
OUTPUT:
ERR: init_enable_count.c:132
Expected skel->bss->enable_cnt == num_children (3 == 5)
not ok 1 init_enable_count #
=====  END  =====

=============================

RESULTS:

PASSED:  0
SKIPPED: 0
FAILED:  1
```
-------------------------------------------------------
After Patch:
-------------------------------------------------------

```
sched-ext # ./runner -t init_enable_count
===== START =====
TEST: init_enable_count
DESCRIPTION: Verify we do the correct amount of counting of init,
             enable, etc callbacks.
OUTPUT:
ok 1 init_enable_count #
=====  END  =====

=============================

RESULTS:

PASSED:  1
SKIPPED: 0
FAILED:  0
```

Fixes: 98442f0ccd82 ("sched: Fix delayed_dequeue vs switched_from_fair()")
Signed-off-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
---
 kernel/sched/core.c     | 8 ++++----
 kernel/sched/ext.c      | 8 ++++----
 kernel/sched/ext.h      | 2 +-
 kernel/sched/sched.h    | 2 +-
 kernel/sched/syscalls.c | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dbfb5717d6af..719e0ed1e976 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4711,7 +4711,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	if (rt_prio(p->prio)) {
 		p->sched_class = &rt_sched_class;
 #ifdef CONFIG_SCHED_CLASS_EXT
-	} else if (task_should_scx(p)) {
+	} else if (task_should_scx(p->policy)) {
 		p->sched_class = &ext_sched_class;
 #endif
 	} else {
@@ -7025,7 +7025,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
 }
 EXPORT_SYMBOL(default_wake_function);
 
-const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(int policy, int prio)
 {
 	if (dl_prio(prio))
 		return &dl_sched_class;
@@ -7034,7 +7034,7 @@ const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
 		return &rt_sched_class;
 
 #ifdef CONFIG_SCHED_CLASS_EXT
-	if (task_should_scx(p))
+	if (task_should_scx(policy))
 		return &ext_sched_class;
 #endif
 
@@ -7142,7 +7142,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		queue_flag &= ~DEQUEUE_MOVE;
 
 	prev_class = p->sched_class;
-	next_class = __setscheduler_class(p, prio);
+	next_class = __setscheduler_class(p->policy, prio);
 
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5900b06fd036..40bdfe84e4f0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4256,14 +4256,14 @@ static const struct kset_uevent_ops scx_uevent_ops = {
  * Used by sched_fork() and __setscheduler_prio() to pick the matching
  * sched_class. dl/rt are already handled.
  */
-bool task_should_scx(struct task_struct *p)
+bool task_should_scx(int policy)
 {
 	if (!scx_enabled() ||
 	    unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
 		return false;
 	if (READ_ONCE(scx_switching_all))
 		return true;
-	return p->policy == SCHED_EXT;
+	return policy == SCHED_EXT;
 }
 
 /**
@@ -4493,7 +4493,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
-		p->sched_class = __setscheduler_class(p, p->prio);
+		p->sched_class = __setscheduler_class(p->policy, p->prio);
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
@@ -5204,7 +5204,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
 		p->scx.slice = SCX_SLICE_DFL;
-		p->sched_class = __setscheduler_class(p, p->prio);
+		p->sched_class = __setscheduler_class(p->policy, p->prio);
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 246019519231..b1675bb59fc4 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq);
 void scx_rq_activate(struct rq *rq);
 void scx_rq_deactivate(struct rq *rq);
 int scx_check_setscheduler(struct task_struct *p, int policy);
-bool task_should_scx(struct task_struct *p);
+bool task_should_scx(int policy);
 void init_sched_ext_class(void);
 
 static inline u32 scx_cpuperf_target(s32 cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 081519ffab46..29180ae51e54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3800,7 +3800,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
 
 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
+extern const struct sched_class *__setscheduler_class(int policy, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 0470bcc3d204..24f9f90b6574 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -707,7 +707,7 @@ int __sched_setscheduler(struct task_struct *p,
 	}
 
 	prev_class = p->sched_class;
-	next_class = __setscheduler_class(p, newprio);
+	next_class = __setscheduler_class(policy, newprio);
 
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-- 
2.25.1

Re: [PATCH] sched: Pass correct scheduling policy to __setscheduler_class

Posted by Tejun Heo 1 month ago

On Sat, Oct 26, 2024 at 12:20:20AM +0530, Aboorva Devarajan wrote:
> The function __setscheduler_class determines the appropriate
> sched_class based on the scheduling policy and priority. Previously,
> the function used the task's pointer to retrieve the scheduling policy,
> which could lead to incorrect decisions if the task's struct had an
> outdated policy. This behaviour where the task pointer may reference an
> outdated policy when __setscheduler_class is called, was introduced in
> commit 98442f0ccd82 ("sched: Fix delayed_dequeue vs switched_from_fair()")
> 
> To resolve this, corresponding scheduling policy is passed directly
> to __setscheduler_class instead of relying on the task pointer's cached
> policy. This ensures that the correct policy is always used when
> determining the scheduling class.
> 
> -------------------------------------------------------
> Before Patch:
> -------------------------------------------------------
> 
> ```
> sched_ext # ./runner -t init_enable_count
> ===== START =====
> TEST: init_enable_count
> DESCRIPTION: Verify we do the correct amount of counting of init,
>              enable, etc callbacks.
> OUTPUT:
> ERR: init_enable_count.c:132
> Expected skel->bss->enable_cnt == num_children (3 == 5)
> not ok 1 init_enable_count #
> =====  END  =====
> 
> =============================
> 
> RESULTS:
> 
> PASSED:  0
> SKIPPED: 0
> FAILED:  1
> ```
> -------------------------------------------------------
> After Patch:
> -------------------------------------------------------
> 
> ```
> sched-ext # ./runner -t init_enable_count
> ===== START =====
> TEST: init_enable_count
> DESCRIPTION: Verify we do the correct amount of counting of init,
>              enable, etc callbacks.
> OUTPUT:
> ok 1 init_enable_count #
> =====  END  =====
> 
> =============================
> 
> RESULTS:
> 
> PASSED:  1
> SKIPPED: 0
> FAILED:  0
> ```
> 
> Fixes: 98442f0ccd82 ("sched: Fix delayed_dequeue vs switched_from_fair()")
> Signed-off-by: Aboorva Devarajan <aboorvad@linux.ibm.com>

Acked-by: Tejun Heo <tj@kernel.org>

Peter, do you want me to route this patch or would tip be better?

Thanks.

-- 
tejun

Re: [PATCH] sched: Pass correct scheduling policy to __setscheduler_class

Posted by Peter Zijlstra 1 month ago

On Fri, Oct 25, 2024 at 09:28:07AM -1000, Tejun Heo wrote:
> On Sat, Oct 26, 2024 at 12:20:20AM +0530, Aboorva Devarajan wrote:
> > The function __setscheduler_class determines the appropriate
> > sched_class based on the scheduling policy and priority. Previously,
> > the function used the task's pointer to retrieve the scheduling policy,
> > which could lead to incorrect decisions if the task's struct had an
> > outdated policy. This behaviour where the task pointer may reference an
> > outdated policy when __setscheduler_class is called, was introduced in
> > commit 98442f0ccd82 ("sched: Fix delayed_dequeue vs switched_from_fair()")
> > 
> > To resolve this, corresponding scheduling policy is passed directly
> > to __setscheduler_class instead of relying on the task pointer's cached
> > policy. This ensures that the correct policy is always used when
> > determining the scheduling class.
> > 
> > -------------------------------------------------------
> > Before Patch:
> > -------------------------------------------------------
> > 
> > ```
> > sched_ext # ./runner -t init_enable_count
> > ===== START =====
> > TEST: init_enable_count
> > DESCRIPTION: Verify we do the correct amount of counting of init,
> >              enable, etc callbacks.
> > OUTPUT:
> > ERR: init_enable_count.c:132
> > Expected skel->bss->enable_cnt == num_children (3 == 5)
> > not ok 1 init_enable_count #
> > =====  END  =====
> > 
> > =============================
> > 
> > RESULTS:
> > 
> > PASSED:  0
> > SKIPPED: 0
> > FAILED:  1
> > ```
> > -------------------------------------------------------
> > After Patch:
> > -------------------------------------------------------
> > 
> > ```
> > sched-ext # ./runner -t init_enable_count
> > ===== START =====
> > TEST: init_enable_count
> > DESCRIPTION: Verify we do the correct amount of counting of init,
> >              enable, etc callbacks.
> > OUTPUT:
> > ok 1 init_enable_count #
> > =====  END  =====
> > 
> > =============================
> > 
> > RESULTS:
> > 
> > PASSED:  1
> > SKIPPED: 0
> > FAILED:  0
> > ```
> > 
> > Fixes: 98442f0ccd82 ("sched: Fix delayed_dequeue vs switched_from_fair()")
> > Signed-off-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
> 
> Acked-by: Tejun Heo <tj@kernel.org>
> 
> Peter, do you want me to route this patch or would tip be better?

Once I've figured out what the word soup means and reverse engineerd
what it actually does I'll probably take it :/

That Changelog is horrible

Re: [PATCH] sched: Pass correct scheduling policy to __setscheduler_class

Posted by Peter Zijlstra 1 month ago

On Sat, Oct 26, 2024 at 08:52:48AM +0200, Peter Zijlstra wrote:

> > Peter, do you want me to route this patch or would tip be better?
> 
> Once I've figured out what the word soup means and reverse engineerd
> what it actually does I'll probably take it :/
> 
> That Changelog is horrible

I've rewritten the Changelog and stuck the patch in queue/sched/urgent
and will move it to tip/sched/urgent on monday or so -- it's too late to
make Sunday's -rc.