'Document' the locking context the various sched_class methods are
called under.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 6 +-
kernel/sched/sched.h | 106 ++++++++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 103 insertions(+), 9 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -583,8 +583,8 @@ EXPORT_SYMBOL(__trace_set_current_state)
*
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
*
- * is set by activate_task() and cleared by deactivate_task(), under
- * rq->lock. Non-zero indicates the task is runnable, the special
+ * is set by activate_task() and cleared by deactivate_task()/block_task(),
+ * under rq->lock. Non-zero indicates the task is runnable, the special
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
@@ -4162,7 +4162,7 @@ int try_to_wake_up(struct task_struct *p
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * schedule()'s block_task() has 'happened' and p will no longer
* care about it's own p->state. See the comment in __schedule().
*/
smp_acquire__after_ctrl_dep();
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2345,8 +2345,7 @@ extern const u32 sched_prio_to_wmult[40
/*
* {de,en}queue flags:
*
- * DEQUEUE_SLEEP - task is no longer runnable
- * ENQUEUE_WAKEUP - task just became runnable
+ * SLEEP/WAKEUP - task is no-longer/just-became runnable
*
* SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
* are in a known state which allows modification. Such pairs
@@ -2359,6 +2358,11 @@ extern const u32 sched_prio_to_wmult[40
*
* MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
*
+ * DELAYED - de/re-queue a sched_delayed task
+ *
+ * CLASS - going to update p->sched_class; makes sched_change call the
+ * various switch methods.
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
@@ -2409,14 +2413,50 @@ struct sched_class {
int uclamp_enabled;
#endif
+ /*
+ * move_queued_task/activate_task/enqueue_task: rq->lock
+ * ttwu_do_activate/activate_task/enqueue_task: rq->lock
+ * wake_up_new_task/activate_task/enqueue_task: task_rq_lock
+ * ttwu_runnable/enqueue_task: task_rq_lock
+ * proxy_task_current: rq->lock
+ * sched_change_end
+ */
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * move_queued_task/deactivate_task/dequeue_task: rq->lock
+ * __schedule/block_task/dequeue_task: rq->lock
+ * proxy_task_current: rq->lock
+ * wait_task_inactive: task_rq_lock
+ * sched_change_begin
+ */
bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * do_sched_yield: rq->lock
+ */
void (*yield_task) (struct rq *rq);
+ /*
+ * yield_to: rq->lock (double)
+ */
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
+ /*
+ * move_queued_task: rq->lock
+ * __migrate_swap_task: rq->lock
+ * ttwu_do_activate: rq->lock
+ * ttwu_runnable: task_rq_lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * schedule/pick_next_task/prev_balance: rq->lock
+ */
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+
+ /*
+ * schedule/pick_next_task: rq->lock
+ */
struct task_struct *(*pick_task)(struct rq *rq);
/*
* Optional! When implemented pick_next_task() should be equivalent to:
@@ -2429,48 +2469,102 @@ struct sched_class {
*/
struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
+ /*
+ * sched_change:
+ * __schedule: rq->lock
+ */
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
+ /*
+ * select_task_rq: p->pi_lock
+ * sched_exec: p->pi_lock
+ */
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+ /*
+ * set_task_cpu: p->pi_lock || rq->lock (ttwu like)
+ */
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
+ /*
+ * ttwu_do_activate: rq->lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
+ /*
+ * do_set_cpus_allowed: task_rq_lock + sched_change
+ */
void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
+ /*
+ * sched_set_rq_{on,off}line: rq->lock
+ */
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+ /*
+ * push_cpu_stop: p->pi_lock && rq->lock
+ */
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
+ /*
+ * hrtick: rq->lock
+ * sched_tick: rq->lock
+ * sched_tick_remote: rq->lock
+ */
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+ /*
+ * sched_cgroup_fork: p->pi_lock
+ */
void (*task_fork)(struct task_struct *p);
+ /*
+ * finish_task_switch: no locks
+ */
void (*task_dead)(struct task_struct *p);
+ /*
+ * sched_change
+ */
void (*switching_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switching_to) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
-
- void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
- const struct load_weight *lw);
-
u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
u64 oldprio);
+ /*
+ * set_load_weight: task_rq_lock + sched_change
+ * __setscheduler_parms: task_rq_lock + sched_change
+ */
+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
+ const struct load_weight *lw);
+
+ /*
+ * sched_rr_get_interval: task_rq_lock
+ */
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
+ /*
+ * task_sched_runtime: task_rq_lock
+ */
void (*update_curr)(struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * sched_change_group: task_rq_lock + sched_change
+ */
void (*task_change_group)(struct task_struct *p);
#endif
#ifdef CONFIG_SCHED_CORE
+ /*
+ * pick_next_task: rq->lock
+ * try_steal_cookie: rq->lock (double)
+ */
int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};
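
A note on the vocabulary these annotations use: "rq->lock" means the
runqueue lock alone, "p->pi_lock" the task's PI lock alone, and
"task_rq_lock" the nested pair, taken pi_lock first. A simplified sketch
of the real task_rq_lock() from kernel/sched/core.c (rq_flags pinning
elided, helper renamed to mark it as a sketch) also shows why
TASK_ON_RQ_MIGRATING makes task_cpu() unstable:

/*
 * Simplified sketch of task_rq_lock(): p->pi_lock pins the task, then
 * the rq lock is taken and we re-check that the task did not migrate
 * in between. Mid-migration tasks (p->on_rq == TASK_ON_RQ_MIGRATING)
 * are waited out, since their task_cpu() is about to change.
 */
static struct rq *task_rq_lock_sketch(struct task_struct *p, unsigned long *flags)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, *flags);
		rq = task_rq(p);
		raw_spin_rq_lock(rq);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
			return rq;	/* both locks held, task is stable */
		raw_spin_rq_unlock(rq);
		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

		while (task_on_rq_migrating(p))
			cpu_relax();	/* let the migration finish */
	}
}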
Hi Peter,
On 06/10/25 12:44, Peter Zijlstra wrote:
> 'Document' the locking context the various sched_class methods are
> called under.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
...
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2345,8 +2345,7 @@ extern const u32 sched_prio_to_wmult[40
> /*
> * {de,en}queue flags:
> *
> - * DEQUEUE_SLEEP - task is no longer runnable
> - * ENQUEUE_WAKEUP - task just became runnable
> + * SLEEP/WAKEUP - task is no-longer/just-became runnable
> *
> * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
> * are in a known state which allows modification. Such pairs
> @@ -2359,6 +2358,11 @@ extern const u32 sched_prio_to_wmult[40
> *
> * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
> *
> + * DELAYED - de/re-queue a sched_delayed task
> + *
> + * CLASS - going to update p->sched_class; makes sched_change call the
> + * various switch methods.
> + *
> * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
> * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
> * ENQUEUE_MIGRATED - the task was migrated during wakeup
Not for this patch, but I wondered if, while we are at it, we wanted to
complete documentation of these flags. My new AI friend is suggesting
the following, is it very much garbage? :)
Thanks,
Juri
---
From: Claude <claude-sonnet-4-5@anthropic.com>
Date: Mon, 7 Oct 2025 12:44:13 +0200
Subject: sched: Document remaining DEQUEUE/ENQUEUE flags
Complete the flag documentation by adding descriptions for the three
previously undocumented flags: DEQUEUE_SPECIAL, DEQUEUE_THROTTLE, and
ENQUEUE_INITIAL.
DEQUEUE_SPECIAL is used when dequeuing tasks in special states (stopped,
traced, parked, dead, or frozen) that don't use the normal wait-loop
pattern and must not use delayed dequeue.
DEQUEUE_THROTTLE is used when removing tasks from the runqueue due to
CFS bandwidth throttling, preventing delayed dequeue to ensure proper
throttling behavior.
ENQUEUE_INITIAL is used when enqueueing newly created tasks in
wake_up_new_task(), allowing the fair scheduler to give them preferential
initial placement (half vslice when PLACE_DEADLINE_INITIAL is enabled).
Signed-off-by: Claude <claude-sonnet-4-5@anthropic.com>
Not-so-sure-yet: Juri Lelli <juri.lelli@redhat.com>
---
kernel/sched/sched.h | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4c222fa8f908..1a2b3c8d9e4f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2364,10 +2364,20 @@ extern const u32 sched_prio_to_wmult[40];
* CLASS - going to update p->sched_class; makes sched_change call the
* various switch methods.
*
+ * DEQUEUE_SPECIAL - task is in a special state (STOPPED, TRACED, PARKED,
+ * DEAD, FROZEN) that doesn't use the normal wait-loop;
+ * disables delayed dequeue.
+ *
+ * DEQUEUE_THROTTLE - dequeuing due to CFS bandwidth throttling; disables
+ * delayed dequeue to ensure proper throttling.
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
+ * ENQUEUE_INITIAL - enqueuing a newly created task in wake_up_new_task();
+ * fair scheduler may give preferential initial placement
+ * (e.g., half vslice with PLACE_DEADLINE_INITIAL).
* ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
*
*/
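
For reference, the matching DEQUEUE_*/ENQUEUE_* flags share bit values
(DEQUEUE_SLEEP and ENQUEUE_WAKEUP are both 0x0001), so flags saved at
dequeue time can be handed straight back to the enqueue side. A rough
sketch, as a hypothetical helper rather than actual kernel code, of the
delayed-dequeue rule the descriptions above imply:

/*
 * Hypothetical helper capturing the semantics described above: only a
 * plain sleep is eligible for delayed dequeue; SAVE/RESTORE pairs and
 * special-state dequeues must leave the runqueue immediately.
 */
static bool dequeue_may_be_delayed(int flags)
{
	if (!(flags & DEQUEUE_SLEEP))
		return false;	/* not a sleep: nothing to delay */
	if (flags & (DEQUEUE_SAVE | DEQUEUE_SPECIAL))
		return false;	/* must really come off the runqueue now */
	return true;
}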
On Tue, Oct 07, 2025 at 11:54:18AM +0200, Juri Lelli wrote:

> Not for this patch, but I wondered if, while we are at it, we wanted to
> complete documentation of these flags. My new AI friend is suggesting
> the following, is it very much garbage? :)

Heh; it's not terrible. I've been playing with local LLMs, but mostly
I've found they struggle with getting enough context to not be utterly
demented. And when you up the context window, they get unusably slow :/

Setting up and configuring the whole pile of subtly interlocking stacks
of software to get anything useful out of this stuff is non-trivial (it
reminds me of the sendmail m4 days).

> ---
>
> From: Claude <claude-sonnet-4-5@anthropic.com>
> Date: Mon, 7 Oct 2025 12:44:13 +0200
> Subject: sched: Document remaining DEQUEUE/ENQUEUE flags
>
> ...
>
> Signed-off-by: Claude <claude-sonnet-4-5@anthropic.com>
> Not-so-sure-yet: Juri Lelli <juri.lelli@redhat.com>

Is this the generally acceptable form of attribution for these things?
I'm not sure what the official guidance is on using these AI tools.

Greg, you have any insights here?
On Wed, Oct 08, 2025 at 09:04:19AM +0200, Peter Zijlstra wrote:

> On Tue, Oct 07, 2025 at 11:54:18AM +0200, Juri Lelli wrote:
>
> > ...
> >
> > Signed-off-by: Claude <claude-sonnet-4-5@anthropic.com>
> > Not-so-sure-yet: Juri Lelli <juri.lelli@redhat.com>
>
> Is this the generally acceptable form of attribution for these things?
> I'm not sure what the official guidance is on using these AI tools.
>
> Greg, you have any insights here?

First off, Claude can NOT sign off on anything, so that's a non-starter.
All Red Hat people should know that :)

Otherwise, there is a draft of something that was going to address stuff
like this floating around by Dave Hansen, I'll go poke him to see what
the status of that is.

thanks,

greg k-h
On 08/10/25 09:33, Greg Kroah-Hartman wrote:

> On Wed, Oct 08, 2025 at 09:04:19AM +0200, Peter Zijlstra wrote:
>
> > ...
> >
> > Is this the generally acceptable form of attribution for these things?
> > I'm not sure what the official guidance is on using these AI tools.
> >
> > Greg, you have any insights here?
>
> First off, Claude can NOT sign off on anything, so that's a non-starter.
> All Red Hat people should know that :)

Yep, knew that. But I felt guilty nonetheless as I didn't touch the
change at all. Current SoB was kind of a (silly) joke. :)

> Otherwise, there is a draft of something that was going to address stuff
> like this floating around by Dave Hansen, I'll go poke him to see what
> the status of that is.

I believe it was suggested something like Co-developed-by: <model> and
then Signed-off-by: <human>, but indeed curious to know how that
discussion ended.

Thanks!
Juri
On Wed, Oct 08, 2025 at 11:43:21AM +0200, Juri Lelli wrote:

> On 08/10/25 09:33, Greg Kroah-Hartman wrote:
>
> > ...
>
> Yep, knew that. But I felt guilty nonetheless as I didn't touch the
> change at all. Current SoB was kind of a (silly) joke. :)
>
> I believe it was suggested something like Co-developed-by: <model> and
> then Signed-off-by: <human>, but indeed curious to know how that
> discussion ended.

The general answer is "you better know the copyright ownership
information of the output of the tool you use" before you do anything
with any of these tools. Be careful about this, because adding your
signed-off-by to a patch like this makes it your responsibility :)

After that, treat it like any other tool that you use to generate a
patch, document what you used and why/how, and you should be fine. You
have to do this today if you were to use any type of tool, so in that
way, "AI" is no different, with the exception of the ownership of the
output result (again, consult the terms of the tool used.)

Hopefully documentation updates to our process documents will reflect
this in the near future.

thanks,

greg k-h
On Wed, 8 Oct 2025 12:06:56 +0200
Greg Kroah-Hartman <gregkh@linuxfoundation.org> wrote:

> The general answer is "you better know the copyright ownership
> information of the output of the tool you use" before you do anything
> with any of these tools. Be careful about this, because adding your
> signed-off-by to a patch like this makes it your responsibility :)

And there are a lot of copyright battles going on in courts wrt AI
right now. It's best to see how that plays out too.

> After that, treat it like any other tool that you use to generate a
> patch, document what you used and why/how, and you should be fine. You
> have to do this today if you were to use any type of tool, so in that
> way, "AI" is no different, with the exception of the ownership of the
> output result (again, consult the terms of the tool used.)
>
> Hopefully documentation updates to our process documents will reflect
> this in the near future.

Yeah, I need to help Dave on that too.

Thanks for the reminder,

-- Steve
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 46a177fb01e52ec0e3f9eab9b217a0f7c8909eeb
Gitweb: https://git.kernel.org/tip/46a177fb01e52ec0e3f9eab9b217a0f7c8909eeb
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 09 Sep 2025 11:58:02 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:53 +02:00
sched: Add locking comments to sched_class methods
'Document' the locking context the various sched_class methods are
called under.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/core.c | 6 +-
kernel/sched/sched.h | 108 +++++++++++++++++++++++++++++++++++++++---
2 files changed, 105 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e932439..8c55740 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -583,8 +583,8 @@ EXPORT_SYMBOL(__trace_set_current_state);
*
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
*
- * is set by activate_task() and cleared by deactivate_task(), under
- * rq->lock. Non-zero indicates the task is runnable, the special
+ * is set by activate_task() and cleared by deactivate_task()/block_task(),
+ * under rq->lock. Non-zero indicates the task is runnable, the special
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
@@ -4162,7 +4162,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * schedule()'s block_task() has 'happened' and p will no longer
* care about it's own p->state. See the comment in __schedule().
*/
smp_acquire__after_ctrl_dep();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ea2ea8f..3462145 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2345,8 +2345,7 @@ extern const u32 sched_prio_to_wmult[40];
/*
* {de,en}queue flags:
*
- * DEQUEUE_SLEEP - task is no longer runnable
- * ENQUEUE_WAKEUP - task just became runnable
+ * SLEEP/WAKEUP - task is no-longer/just-became runnable
*
* SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
* are in a known state which allows modification. Such pairs
@@ -2359,11 +2358,18 @@ extern const u32 sched_prio_to_wmult[40];
*
* MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
*
+ * DELAYED - de/re-queue a sched_delayed task
+ *
+ * CLASS - going to update p->sched_class; makes sched_change call the
+ * various switch methods.
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
* ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
*
+ * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but
+ * SCHED_DEADLINE seems to rely on this for now.
*/
#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */
@@ -2409,14 +2415,50 @@ struct sched_class {
int uclamp_enabled;
#endif
+ /*
+ * move_queued_task/activate_task/enqueue_task: rq->lock
+ * ttwu_do_activate/activate_task/enqueue_task: rq->lock
+ * wake_up_new_task/activate_task/enqueue_task: task_rq_lock
+ * ttwu_runnable/enqueue_task: task_rq_lock
+ * proxy_task_current: rq->lock
+ * sched_change_end
+ */
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * move_queued_task/deactivate_task/dequeue_task: rq->lock
+ * __schedule/block_task/dequeue_task: rq->lock
+ * proxy_task_current: rq->lock
+ * wait_task_inactive: task_rq_lock
+ * sched_change_begin
+ */
bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * do_sched_yield: rq->lock
+ */
void (*yield_task) (struct rq *rq);
+ /*
+ * yield_to: rq->lock (double)
+ */
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
+ /*
+ * move_queued_task: rq->lock
+ * __migrate_swap_task: rq->lock
+ * ttwu_do_activate: rq->lock
+ * ttwu_runnable: task_rq_lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * schedule/pick_next_task/prev_balance: rq->lock
+ */
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+
+ /*
+ * schedule/pick_next_task: rq->lock
+ */
struct task_struct *(*pick_task)(struct rq *rq);
/*
* Optional! When implemented pick_next_task() should be equivalent to:
@@ -2429,48 +2471,102 @@ struct sched_class {
*/
struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
+ /*
+ * sched_change:
+ * __schedule: rq->lock
+ */
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
+ /*
+ * select_task_rq: p->pi_lock
+ * sched_exec: p->pi_lock
+ */
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+ /*
+ * set_task_cpu: p->pi_lock || rq->lock (ttwu like)
+ */
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
+ /*
+ * ttwu_do_activate: rq->lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
+ /*
+ * do_set_cpus_allowed: task_rq_lock + sched_change
+ */
void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
+ /*
+ * sched_set_rq_{on,off}line: rq->lock
+ */
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+ /*
+ * push_cpu_stop: p->pi_lock && rq->lock
+ */
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
+ /*
+ * hrtick: rq->lock
+ * sched_tick: rq->lock
+ * sched_tick_remote: rq->lock
+ */
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+ /*
+ * sched_cgroup_fork: p->pi_lock
+ */
void (*task_fork)(struct task_struct *p);
+ /*
+ * finish_task_switch: no locks
+ */
void (*task_dead)(struct task_struct *p);
+ /*
+ * sched_change
+ */
void (*switching_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switching_to) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
-
- void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
- const struct load_weight *lw);
-
u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
u64 oldprio);
+ /*
+ * set_load_weight: task_rq_lock + sched_change
+ * __setscheduler_parms: task_rq_lock + sched_change
+ */
+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
+ const struct load_weight *lw);
+
+ /*
+ * sched_rr_get_interval: task_rq_lock
+ */
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
+ /*
+ * task_sched_runtime: task_rq_lock
+ */
void (*update_curr)(struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * sched_change_group: task_rq_lock + sched_change
+ */
void (*task_change_group)(struct task_struct *p);
#endif
#ifdef CONFIG_SCHED_CORE
+ /*
+ * pick_next_task: rq->lock
+ * try_steal_cookie: rq->lock (double)
+ */
int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};
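
With the calling contexts written down, implementations can also assert
them at runtime. A hedged sketch with hypothetical method bodies (the
lockdep_assert_rq_held()/lockdep_assert_held() helpers themselves are
real) of checking two of the documented contexts:

/*
 * Sketch: asserting the documented calling contexts with lockdep.
 * task_tick is annotated "rq->lock"; select_task_rq "p->pi_lock".
 */
static void task_tick_example(struct rq *rq, struct task_struct *p, int queued)
{
	lockdep_assert_rq_held(rq);	/* hrtick/sched_tick/sched_tick_remote */

	/* class-specific tick work goes here */
}

static int select_task_rq_example(struct task_struct *p, int cpu, int flags)
{
	lockdep_assert_held(&p->pi_lock);	/* select_task_rq/sched_exec */

	return cpu;
}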