One of the things lost with the introduction of DELAY_DEQUEUE is the
ability of TTWU to move those tasks around on wakeup: since they're
on_rq, they have to be woken in-place.
Doing the in-place thing adds quite a bit of cross-cpu latency; add a
little something that gets remote CPUs to do their own in-place
wakeups, significantly reducing the rq->lock contention.
Reported-by: Chris Mason <clm@meta.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 74 ++++++++++++++++++++++++++++++++++++++++++------
kernel/sched/fair.c | 5 ++-
kernel/sched/features.h | 1
kernel/sched/sched.h | 1
4 files changed, 72 insertions(+), 9 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3784,6 +3784,8 @@ static int __ttwu_runnable(struct rq *rq
return 1;
}
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags);
+
/*
* Consider @p being inside a wait loop:
*
@@ -3811,6 +3813,33 @@ static int __ttwu_runnable(struct rq *rq
*/
static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
+#ifdef CONFIG_SMP
+ if (sched_feat(TTWU_QUEUE_DELAYED) && READ_ONCE(p->se.sched_delayed)) {
+ /*
+ * Similar to try_to_block_task():
+ *
+ *        __schedule()                        ttwu()
+ *    prev_state = prev->state            if (p->sched_delayed)
+ *    if (prev_state)                       smp_acquire__after_ctrl_dep()
+ *      try_to_block_task()                 p->state = TASK_WAKING
+ *        ... set_delayed()
+ *              RELEASE p->sched_delayed = 1
+ *
+ * __schedule() and ttwu() have matching control dependencies.
+ *
+ * Notably, once we observe sched_delayed we know the task has
+ * passed try_to_block_task() and p->state is ours to modify.
+ *
+ * TASK_WAKING controls ttwu() concurrency.
+ */
+ smp_acquire__after_ctrl_dep();
+ WRITE_ONCE(p->__state, TASK_WAKING);
+
+ if (ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_DELAYED))
+ return 1;
+ }
+#endif
+
CLASS(__task_rq_lock, guard)(p);
return __ttwu_runnable(guard.rq, p, wake_flags);
}
@@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
update_rq_clock(rq);
llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
+ struct rq *p_rq = task_rq(p);
+ int ret;
+
+ /*
+ * This is the ttwu_runnable() case. Notably it is possible for
+ * on-rq entities to get migrated -- even sched_delayed ones.
+ */
+ if (unlikely(p_rq != rq)) {
+ rq_unlock(rq, &rf);
+ p_rq = __task_rq_lock(p, &rf);
+ }
+
+ ret = __ttwu_runnable(p_rq, p, WF_TTWU);
+
+ if (unlikely(p_rq != rq)) {
+ if (!ret)
+ set_task_cpu(p, cpu_of(rq));
+
+ __task_rq_unlock(p_rq, &rf);
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
+ }
+
+ if (ret) {
+ // XXX ttwu_stat()
+ continue;
+ }
+
+ /*
+ * This is the 'normal' case where the task is blocked.
+ */
+
if (WARN_ON_ONCE(p->on_cpu))
smp_cond_load_acquire(&p->on_cpu, !VAL);
- if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
- set_task_cpu(p, cpu_of(rq));
-
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
}
@@ -3974,7 +4032,7 @@ static inline bool ttwu_queue_cond(struc
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
- bool def = sched_feat(TTWU_QUEUE_DEFAULT);
+ bool def = sched_feat(TTWU_QUEUE_DEFAULT) || (wake_flags & WF_DELAYED);
if (!ttwu_queue_cond(p, cpu, def))
return false;
@@ -4269,8 +4327,8 @@ int try_to_wake_up(struct task_struct *p
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
- * care about it's own p->state. See the comment in __schedule().
+ * schedule()'s try_to_block_task() has 'happened' and p will no longer
+ * care about it's own p->state. See the comment in try_to_block_task().
*/
smp_acquire__after_ctrl_dep();
@@ -6712,8 +6770,8 @@ static void __sched notrace __schedule(i
preempt = sched_mode == SM_PREEMPT;
/*
- * We must load prev->state once (task_struct::state is volatile), such
- * that we form a control dependency vs deactivate_task() below.
+ * We must load prev->state once, such that we form a control
+ * dependency vs try_to_block_task() below.
*/
prev_state = READ_ONCE(prev->__state);
if (sched_mode == SM_IDLE) {
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5395,7 +5395,10 @@ static __always_inline void return_cfs_r
static void set_delayed(struct sched_entity *se)
{
- se->sched_delayed = 1;
+ /*
+ * See TTWU_QUEUE_DELAYED in ttwu_runnable().
+ */
+ smp_store_release(&se->sched_delayed, 1);
/*
* Delayed se of cfs_rq have no tasks queued on them.
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -82,6 +82,7 @@ SCHED_FEAT(TTWU_QUEUE, false)
SCHED_FEAT(TTWU_QUEUE, true)
#endif
SCHED_FEAT(TTWU_QUEUE_ON_CPU, true)
+SCHED_FEAT(TTWU_QUEUE_DELAYED, false)
SCHED_FEAT(TTWU_QUEUE_DEFAULT, false)
/*
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2313,6 +2313,7 @@ static inline int task_on_rq_migrating(s
#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
#define WF_ON_CPU 0x0100
+#define WF_DELAYED 0x0200
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
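For reference, the ordering argument in the ttwu_runnable() comment above
can be modelled in userspace with C11 atomics and pthreads. The sketch
below is not kernel code: the names only loosely mirror the kernel ones,
and the spin loop exists purely to make the toy deterministic.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { TASK_RUNNING, TASK_SLEEPING, TASK_WAKING };

static _Atomic int task_state = TASK_SLEEPING;  /* models p->__state          */
static _Atomic int sched_delayed;               /* models p->se.sched_delayed */

/* "__schedule()" side: sample the state, then publish the delayed dequeue. */
static void *schedule_side(void *arg)
{
        int prev_state = atomic_load_explicit(&task_state, memory_order_relaxed);

        if (prev_state == TASK_SLEEPING) {
                /*
                 * try_to_block_task() keeps the task on the rq but delayed;
                 * set_delayed() publishes that with RELEASE semantics, so
                 * everything above it -- notably the read of task_state --
                 * is ordered before the flag becomes visible.
                 */
                atomic_store_explicit(&sched_delayed, 1, memory_order_release);
        }
        return NULL;
}

/* "ttwu()" side: once sched_delayed is observed, task_state is ours to write. */
static void *ttwu_side(void *arg)
{
        while (!atomic_load_explicit(&sched_delayed, memory_order_relaxed))
                ;       /* spin only to make this toy deterministic */

        /*
         * Analog of smp_acquire__after_ctrl_dep(): upgrade the control
         * dependency on the load above to acquire ordering.
         */
        atomic_thread_fence(memory_order_acquire);

        atomic_store_explicit(&task_state, TASK_WAKING, memory_order_relaxed);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, schedule_side, NULL);
        pthread_create(&b, NULL, ttwu_side, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        printf("final task_state = %d (TASK_WAKING = %d)\n",
               atomic_load(&task_state), TASK_WAKING);
        return 0;
}

The point it models is the one the comment makes: once the ttwu() side
observes sched_delayed == 1, the __schedule() side has already sampled
prev->state behind the release, so writing TASK_WAKING afterwards cannot
race with that read.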
On 20/05/2025 11:45, Peter Zijlstra wrote:
[...]
> @@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
> update_rq_clock(rq);
>
> llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
> + struct rq *p_rq = task_rq(p);
> + int ret;
> +
> + /*
> + * This is the ttwu_runnable() case. Notably it is possible for
> + * on-rq entities to get migrated -- even sched_delayed ones.
> + */
> + if (unlikely(p_rq != rq)) {
> + rq_unlock(rq, &rf);
> + p_rq = __task_rq_lock(p, &rf);
I always get this fairly early with TTWU_QUEUE_DELAYED enabled, related
to p->pi_lock not being held in a wakeup from interrupt.
[ 36.175285] WARNING: CPU: 0 PID: 162 at kernel/sched/core.c:679 __task_rq_lock+0xf8/0x128
[ 36.176021] Modules linked in:
[ 36.176187] CPU: 0 UID: 0 PID: 162 Comm: (udev-worker) Tainted: G W 6.15.0-00005-gcacccfab15bd-dirty #59 PREEMPT
[ 36.176587] Tainted: [W]=WARN
[ 36.176727] Hardware name: linux,dummy-virt (DT)
[ 36.176964] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 36.177301] pc : __task_rq_lock+0xf8/0x128
[ 36.177576] lr : __task_rq_lock+0xf4/0x128
...
[ 36.181314] Call trace:
[ 36.181510] __task_rq_lock+0xf8/0x128 (P)
[ 36.181824] sched_ttwu_pending+0x2d8/0x378
[ 36.182020] __flush_smp_call_function_queue+0x138/0x37c
[ 36.182222] generic_smp_call_function_single_interrupt+0x14/0x20
[ 36.182440] ipi_handler+0x254/0x2bc
[ 36.182585] handle_percpu_devid_irq+0xa8/0x2d4
[ 36.182780] handle_irq_desc+0x34/0x58
[ 36.182942] generic_handle_domain_irq+0x1c/0x28
[ 36.183109] gic_handle_irq+0x40/0xe0
[ 36.183289] call_on_irq_stack+0x24/0x64
[ 36.183441] do_interrupt_handler+0x80/0x84
[ 36.183647] el1_interrupt+0x34/0x70
[ 36.183795] el1h_64_irq_handler+0x18/0x24
[ 36.184002] el1h_64_irq+0x6c/0x70
[...]
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2313,6 +2313,7 @@ static inline int task_on_rq_migrating(s
> #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
>
> #define WF_ON_CPU 0x0100
Looks like there is no specific handling for WF_ON_CPU yet?
> +#define WF_DELAYED 0x0200
[...]
On Fri, Jun 13, 2025 at 09:34:22AM +0200, Dietmar Eggemann wrote:
> On 20/05/2025 11:45, Peter Zijlstra wrote:
>
> [...]
>
> > @@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
> > update_rq_clock(rq);
> >
> > llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
> > + struct rq *p_rq = task_rq(p);
> > + int ret;
> > +
> > + /*
> > + * This is the ttwu_runnable() case. Notably it is possible for
> > + * on-rq entities to get migrated -- even sched_delayed ones.
> > + */
> > + if (unlikely(p_rq != rq)) {
> > + rq_unlock(rq, &rf);
> > + p_rq = __task_rq_lock(p, &rf);
>
> I always get this fairly early with TTWU_QUEUE_DELAYED enabled, related
> to p->pi_lock not held in wakeup from interrupt.
>
> [ 36.175285] WARNING: CPU: 0 PID: 162 at kernel/sched/core.c:679 __task_rq_lock+0xf8/0x128
Thanks, let me go have a look.
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -2313,6 +2313,7 @@ static inline int task_on_rq_migrating(s
> > #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
> >
> > #define WF_ON_CPU 0x0100
>
> Looks like there is no specific handling for WF_ON_CPU yet?
Oh, indeed. That didn't survive the tinkering and then I forgot to clean
it up here. Let me go find a broom and sweep these few bits under the
carpet then :-)
On Fri, Jun 13, 2025 at 11:51:19AM +0200, Peter Zijlstra wrote:
> On Fri, Jun 13, 2025 at 09:34:22AM +0200, Dietmar Eggemann wrote:
> > On 20/05/2025 11:45, Peter Zijlstra wrote:
> >
> > [...]
> >
> > > @@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
> > > update_rq_clock(rq);
> > >
> > > llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
> > > + struct rq *p_rq = task_rq(p);
> > > + int ret;
> > > +
> > > + /*
> > > + * This is the ttwu_runnable() case. Notably it is possible for
> > > + * on-rq entities to get migrated -- even sched_delayed ones.
> > > + */
> > > + if (unlikely(p_rq != rq)) {
> > > + rq_unlock(rq, &rf);
> > > + p_rq = __task_rq_lock(p, &rf);
> >
> > I always get this fairly early with TTWU_QUEUE_DELAYED enabled, related
> > to p->pi_lock not held in wakeup from interrupt.
> >
> > [ 36.175285] WARNING: CPU: 0 PID: 162 at kernel/sched/core.c:679 __task_rq_lock+0xf8/0x128
>
> Thanks, let me go have a look.
I'm thinking this should cure things.
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -677,7 +677,12 @@ struct rq *__task_rq_lock(struct task_st
{
struct rq *rq;
- lockdep_assert_held(&p->pi_lock);
+ /*
+ * TASK_WAKING is used to serialize the remote end of wakeup, rather
+ * than p->pi_lock.
+ */
+ lockdep_assert(p->__state == TASK_WAKING ||
+ lockdep_is_held(&p->pi_lock) != LOCK_STATE_NOT_HELD);
for (;;) {
rq = task_rq(p);
On 13/06/2025 12:46, Peter Zijlstra wrote:
> On Fri, Jun 13, 2025 at 11:51:19AM +0200, Peter Zijlstra wrote:
>> On Fri, Jun 13, 2025 at 09:34:22AM +0200, Dietmar Eggemann wrote:
>>> On 20/05/2025 11:45, Peter Zijlstra wrote:
[...]
>>> I always get this fairly early with TTWU_QUEUE_DELAYED enabled, related
>>> to p->pi_lock not held in wakeup from interrupt.
>>>
>>> [ 36.175285] WARNING: CPU: 0 PID: 162 at kernel/sched/core.c:679 __task_rq_lock+0xf8/0x128
>>
>> Thanks, let me go have a look.
>
> I'm thinking this should cure things.
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -677,7 +677,12 @@ struct rq *__task_rq_lock(struct task_st
> {
> struct rq *rq;
>
> - lockdep_assert_held(&p->pi_lock);
> + /*
> + * TASK_WAKING is used to serialize the remote end of wakeup, rather
> + * than p->pi_lock.
> + */
> + lockdep_assert(p->__state == TASK_WAKING ||
> + lockdep_is_held(&p->pi_lock) != LOCK_STATE_NOT_HELD);
>
> for (;;) {
> rq = task_rq(p);
Yes, it does. I assume we can only end up in sched_ttwu_pending()'s 'if
(unlikely(p_rq != rq))' when ttwu_queue_wakelist() is called from
ttwu_runnable(), i.e. for sched_delayed tasks.
On Tue, 20 May 2025 at 12:18, Peter Zijlstra <peterz@infradead.org> wrote:
>
> One of the things lost with introduction of DELAY_DEQUEUE is the
> ability of TTWU to move those tasks around on wakeup, since they're
> on_rq, and as such, need to be woken in-place.
I was thinking that you would call select_task_rq() somewhere in the
wake-up path of a delayed entity to get a chance to migrate it, which was
one reason for the perf regression (and which would also have been
useful for the EAS case). But IIUC, the task is still enqueued on the same
CPU; the target CPU just does the enqueue itself instead of the local
(waking) CPU doing it. Or am I missing something?
>
> Doing the in-place thing adds quite a bit of cross-cpu latency, add a
> little something that gets remote CPUs to do their own in-place
> wakeups, significantly reducing the rq->lock contention.
>
> Reported-by: Chris Mason <clm@meta.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/sched/core.c | 74 ++++++++++++++++++++++++++++++++++++++++++------
> kernel/sched/fair.c | 5 ++-
> kernel/sched/features.h | 1
> kernel/sched/sched.h | 1
> 4 files changed, 72 insertions(+), 9 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3784,6 +3784,8 @@ static int __ttwu_runnable(struct rq *rq
> return 1;
> }
>
> +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags);
> +
> /*
> * Consider @p being inside a wait loop:
> *
> @@ -3811,6 +3813,33 @@ static int __ttwu_runnable(struct rq *rq
> */
> static int ttwu_runnable(struct task_struct *p, int wake_flags)
> {
> +#ifdef CONFIG_SMP
> + if (sched_feat(TTWU_QUEUE_DELAYED) && READ_ONCE(p->se.sched_delayed)) {
> + /*
> + * Similar to try_to_block_task():
> + *
> + * __schedule() ttwu()
> + * prev_state = prev->state if (p->sched_delayed)
> + * if (prev_state) smp_acquire__after_ctrl_dep()
> + * try_to_block_task() p->state = TASK_WAKING
> + * ... set_delayed()
> + * RELEASE p->sched_delayed = 1
> + *
> + * __schedule() and ttwu() have matching control dependencies.
> + *
> + * Notably, once we observe sched_delayed we know the task has
> + * passed try_to_block_task() and p->state is ours to modify.
> + *
> + * TASK_WAKING controls ttwu() concurrency.
> + */
> + smp_acquire__after_ctrl_dep();
> + WRITE_ONCE(p->__state, TASK_WAKING);
> +
> + if (ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_DELAYED))
> + return 1;
> + }
> +#endif
> +
> CLASS(__task_rq_lock, guard)(p);
> return __ttwu_runnable(guard.rq, p, wake_flags);
> }
> @@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
> update_rq_clock(rq);
>
> llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
> + struct rq *p_rq = task_rq(p);
> + int ret;
> +
> + /*
> + * This is the ttwu_runnable() case. Notably it is possible for
> + * on-rq entities to get migrated -- even sched_delayed ones.
I haven't found where the sched_delayed task could migrate to another CPU.
> + */
> + if (unlikely(p_rq != rq)) {
> + rq_unlock(rq, &rf);
> + p_rq = __task_rq_lock(p, &rf);
> + }
> +
> + ret = __ttwu_runnable(p_rq, p, WF_TTWU);
> +
> + if (unlikely(p_rq != rq)) {
> + if (!ret)
> + set_task_cpu(p, cpu_of(rq));
> +
> + __task_rq_unlock(p_rq, &rf);
> + rq_lock(rq, &rf);
> + update_rq_clock(rq);
> + }
> +
> + if (ret) {
> + // XXX ttwu_stat()
> + continue;
> + }
> +
> + /*
> + * This is the 'normal' case where the task is blocked.
> + */
> +
> if (WARN_ON_ONCE(p->on_cpu))
> smp_cond_load_acquire(&p->on_cpu, !VAL);
>
> - if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
> - set_task_cpu(p, cpu_of(rq));
> -
> ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
> }
>
> @@ -3974,7 +4032,7 @@ static inline bool ttwu_queue_cond(struc
>
> static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
> {
> - bool def = sched_feat(TTWU_QUEUE_DEFAULT);
> + bool def = sched_feat(TTWU_QUEUE_DEFAULT) || (wake_flags & WF_DELAYED);
>
> if (!ttwu_queue_cond(p, cpu, def))
> return false;
> @@ -4269,8 +4327,8 @@ int try_to_wake_up(struct task_struct *p
> * __schedule(). See the comment for smp_mb__after_spinlock().
> *
> * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
> - * schedule()'s deactivate_task() has 'happened' and p will no longer
> - * care about it's own p->state. See the comment in __schedule().
> + * schedule()'s try_to_block_task() has 'happened' and p will no longer
> + * care about it's own p->state. See the comment in try_to_block_task().
> */
> smp_acquire__after_ctrl_dep();
>
> @@ -6712,8 +6770,8 @@ static void __sched notrace __schedule(i
> preempt = sched_mode == SM_PREEMPT;
>
> /*
> - * We must load prev->state once (task_struct::state is volatile), such
> - * that we form a control dependency vs deactivate_task() below.
> + * We must load prev->state once, such that we form a control
> + * dependency vs try_to_block_task() below.
> */
> prev_state = READ_ONCE(prev->__state);
> if (sched_mode == SM_IDLE) {
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5395,7 +5395,10 @@ static __always_inline void return_cfs_r
>
> static void set_delayed(struct sched_entity *se)
> {
> - se->sched_delayed = 1;
> + /*
> + * See TTWU_QUEUE_DELAYED in ttwu_runnable().
> + */
> + smp_store_release(&se->sched_delayed, 1);
>
> /*
> * Delayed se of cfs_rq have no tasks queued on them.
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -82,6 +82,7 @@ SCHED_FEAT(TTWU_QUEUE, false)
> SCHED_FEAT(TTWU_QUEUE, true)
> #endif
> SCHED_FEAT(TTWU_QUEUE_ON_CPU, true)
> +SCHED_FEAT(TTWU_QUEUE_DELAYED, false)
I'm not sure that the feature will be tested, as people mainly test the
default config.
> SCHED_FEAT(TTWU_QUEUE_DEFAULT, false)
>
> /*
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2313,6 +2313,7 @@ static inline int task_on_rq_migrating(s
> #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
>
> #define WF_ON_CPU 0x0100
> +#define WF_DELAYED 0x0200
>
> #ifdef CONFIG_SMP
> static_assert(WF_EXEC == SD_BALANCE_EXEC);
>
>
On Fri, Jun 06, 2025 at 05:03:36PM +0200, Vincent Guittot wrote:
> On Tue, 20 May 2025 at 12:18, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > One of the things lost with introduction of DELAY_DEQUEUE is the
> > ability of TTWU to move those tasks around on wakeup, since they're
> > on_rq, and as such, need to be woken in-place.
>
> I was thinking that you would call select_task_rq() somewhere in the
> wake up path of delayed entity to get a chance to migrate it which was
> one reason for the perf regression (and which would have also been
> useful for EAS case) but IIUC,
FWIW, the trivial form of all this is something like the below. The
problem is that performance sucks :/ For me it is worse than not doing
it. But perhaps it is the right thing for the more complicated cases?
On my SPR:
schbench-6.9.0-1.txt:average rps: 2975450.75
schbench-6.9.0-2.txt:average rps: 2975464.38
schbench-6.9.0-3.txt:average rps: 2974881.02
(these patches)
schbench-6.15.0-dirty-1.txt:average rps: 3029984.58
schbench-6.15.0-dirty-2.txt:average rps: 3034723.10
schbench-6.15.0-dirty-3.txt:average rps: 3033893.33
TTWU_QUEUE_DELAYED
schbench-6.15.0-dirty-delayed-1.txt:average rps: 3048778.58
schbench-6.15.0-dirty-delayed-2.txt:average rps: 3049587.90
schbench-6.15.0-dirty-delayed-3.txt:average rps: 3045826.95
NO_DELAY_DEQUEUE
schbench-6.15.0-dirty-no_delay-1.txt:average rps: 3043629.03
schbench-6.15.0-dirty-no_delay-2.txt:average rps: 3046054.47
schbench-6.15.0-dirty-no_delay-3.txt:average rps: 3044736.37
TTWU_DEQUEUE
schbench-6.15.0-dirty-dequeue-1.txt:average rps: 3008790.80
schbench-6.15.0-dirty-dequeue-2.txt:average rps: 3017497.33
schbench-6.15.0-dirty-dequeue-3.txt:average rps: 3005858.57
Index: linux-2.6/kernel/sched/core.c
===================================================================
--- linux-2.6.orig/kernel/sched/core.c
+++ linux-2.6/kernel/sched/core.c
@@ -3770,8 +3770,13 @@ static int __ttwu_runnable(struct rq *rq
return 0;
update_rq_clock(rq);
- if (p->se.sched_delayed)
+ if (p->se.sched_delayed) {
+ if (sched_feat(TTWU_DEQUEUE)) {
+ dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_DELAYED | DEQUEUE_SLEEP);
+ return 0;
+ }
enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
+ }
if (!task_on_cpu(rq, p)) {
/*
* When on_rq && !on_cpu the task is preempted, see if
Index: linux-2.6/kernel/sched/features.h
===================================================================
--- linux-2.6.orig/kernel/sched/features.h
+++ linux-2.6/kernel/sched/features.h
@@ -84,6 +84,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(TTWU_QUEUE_ON_CPU, true)
SCHED_FEAT(TTWU_QUEUE_DELAYED, false)
SCHED_FEAT(TTWU_QUEUE_DEFAULT, false)
+SCHED_FEAT(TTWU_DEQUEUE, false)
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
On Mon, Jun 16, 2025 at 02:01:25PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 06, 2025 at 05:03:36PM +0200, Vincent Guittot wrote:
> > On Tue, 20 May 2025 at 12:18, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > One of the things lost with introduction of DELAY_DEQUEUE is the
> > > ability of TTWU to move those tasks around on wakeup, since they're
> > > on_rq, and as such, need to be woken in-place.
> >
> > I was thinking that you would call select_task_rq() somewhere in the
> > wake up path of delayed entity to get a chance to migrate it which was
> > one reason for the perf regression (and which would have also been
> > useful for EAS case) but IIUC,
>
> FWIW, the trivial form of all this is something like the below. The
> problem is that performance sucks :/ For me it is worse than not doing
> it.
And because I was poking at the thing, I had to try the complicated
version again... This seems to survive long enough for a few benchmark
runs, and it's not bad.
It very much burns after a while though :-( So I'll have to poke more at
this. Clearly I'm missing something (again!).
---
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -994,6 +994,7 @@ struct task_struct {
* ->sched_remote_wakeup gets used, so it can be in this word.
*/
unsigned sched_remote_wakeup:1;
+ unsigned sched_remote_delayed:1;
#ifdef CONFIG_RT_MUTEXES
unsigned sched_rt_mutex:1;
#endif
Index: linux-2.6/kernel/sched/core.c
===================================================================
--- linux-2.6.orig/kernel/sched/core.c
+++ linux-2.6/kernel/sched/core.c
@@ -3844,6 +3849,50 @@ static int ttwu_runnable(struct task_str
}
#ifdef CONFIG_SMP
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags);
+
+static inline bool ttwu_do_migrate(struct task_struct *p, int cpu)
+{
+ if (task_cpu(p) == cpu)
+ return false;
+
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ psi_ttwu_dequeue(p);
+ set_task_cpu(p, cpu);
+ return true;
+}
+
+static int ttwu_delayed(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ int cpu = task_cpu(p);
+
+ /*
+ * Notably it is possible for on-rq entities to get migrated -- even
+ * sched_delayed ones.
+ */
+ if (unlikely(cpu_of(rq) != cpu)) {
+ /* chase after it */
+ __ttwu_queue_wakelist(p, cpu, wake_flags | WF_DELAYED);
+ return 1;
+ }
+
+ if (task_on_rq_queued(p))
+ dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
+ if (!ttwu_do_migrate(p, cpu))
+ return 0;
+
+ wake_flags |= WF_MIGRATED;
+ /* shoot it to the other CPU */
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
+ return 1;
+}
+
void sched_ttwu_pending(void *arg)
{
struct llist_node *llist = arg;
@@ -3857,39 +3906,12 @@ void sched_ttwu_pending(void *arg)
update_rq_clock(rq);
llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
- struct rq *p_rq = task_rq(p);
- int ret;
-
- /*
- * This is the ttwu_runnable() case. Notably it is possible for
- * on-rq entities to get migrated -- even sched_delayed ones.
- */
- if (unlikely(p_rq != rq)) {
- rq_unlock(rq, &guard.rf);
- p_rq = __task_rq_lock(p, &guard.rf);
- }
-
- ret = __ttwu_runnable(p_rq, p, WF_TTWU);
-
- if (unlikely(p_rq != rq)) {
- if (!ret)
- set_task_cpu(p, cpu_of(rq));
-
- __task_rq_unlock(p_rq, &guard.rf);
- rq_lock(rq, &guard.rf);
- update_rq_clock(rq);
- }
-
- if (ret)
- continue;
-
- /*
- * This is the 'normal' case where the task is blocked.
- */
-
if (WARN_ON_ONCE(p->on_cpu))
smp_cond_load_acquire(&p->on_cpu, !VAL);
+ if (p->sched_remote_delayed && ttwu_delayed(rq, p, WF_TTWU))
+ continue;
+
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &guard.rf);
}
@@ -3933,6 +3955,7 @@ static void __ttwu_queue_wakelist(struct
struct rq *rq = cpu_rq(cpu);
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+ p->sched_remote_delayed = !!(wake_flags & WF_DELAYED);
WRITE_ONCE(rq->ttwu_pending, 1);
__smp_call_single_queue(cpu, &p->wake_entry.llist);
@@ -4371,17 +4394,8 @@ int try_to_wake_up(struct task_struct *p
* their previous state and preserve Program Order.
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
-
- if (task_cpu(p) != cpu) {
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
+ if (ttwu_do_migrate(p, cpu))
wake_flags |= WF_MIGRATED;
- psi_ttwu_dequeue(p);
- set_task_cpu(p, cpu);
- }
#else
cpu = task_cpu(p);
#endif /* CONFIG_SMP */
Hi Peter,
On Fri, Jun 06, 2025 at 05:03:36PM +0200 Vincent Guittot wrote:
> On Tue, 20 May 2025 at 12:18, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > One of the things lost with introduction of DELAY_DEQUEUE is the
> > ability of TTWU to move those tasks around on wakeup, since they're
> > on_rq, and as such, need to be woken in-place.
>
> I was thinking that you would call select_task_rq() somewhere in the
> wake up path of delayed entity to get a chance to migrate it which was
> one reason for the perf regression (and which would have also been
> useful for EAS case) but IIUC, the task is still enqueued on the same
> CPU but the target cpu will do the enqueue itself instead on the local
> CPU. Or am I missing something ?
Yeah, this one still bites us. We ran these patches on our perf
tests (without twiddling any FEATs) and it was basically a wash.
The fs regression we saw due to always waking up on the same cpu
was still present, as expected based on this patch, I suppose.
Thanks,
Phil
>
> >
> > Doing the in-place thing adds quite a bit of cross-cpu latency, add a
> > little something that gets remote CPUs to do their own in-place
> > wakeups, significantly reducing the rq->lock contention.
> >
> > Reported-by: Chris Mason <clm@meta.com>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> > kernel/sched/core.c | 74 ++++++++++++++++++++++++++++++++++++++++++------
> > kernel/sched/fair.c | 5 ++-
> > kernel/sched/features.h | 1
> > kernel/sched/sched.h | 1
> > 4 files changed, 72 insertions(+), 9 deletions(-)
> >
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3784,6 +3784,8 @@ static int __ttwu_runnable(struct rq *rq
> > return 1;
> > }
> >
> > +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags);
> > +
> > /*
> > * Consider @p being inside a wait loop:
> > *
> > @@ -3811,6 +3813,33 @@ static int __ttwu_runnable(struct rq *rq
> > */
> > static int ttwu_runnable(struct task_struct *p, int wake_flags)
> > {
> > +#ifdef CONFIG_SMP
> > + if (sched_feat(TTWU_QUEUE_DELAYED) && READ_ONCE(p->se.sched_delayed)) {
> > + /*
> > + * Similar to try_to_block_task():
> > + *
> > + * __schedule() ttwu()
> > + * prev_state = prev->state if (p->sched_delayed)
> > + * if (prev_state) smp_acquire__after_ctrl_dep()
> > + * try_to_block_task() p->state = TASK_WAKING
> > + * ... set_delayed()
> > + * RELEASE p->sched_delayed = 1
> > + *
> > + * __schedule() and ttwu() have matching control dependencies.
> > + *
> > + * Notably, once we observe sched_delayed we know the task has
> > + * passed try_to_block_task() and p->state is ours to modify.
> > + *
> > + * TASK_WAKING controls ttwu() concurrency.
> > + */
> > + smp_acquire__after_ctrl_dep();
> > + WRITE_ONCE(p->__state, TASK_WAKING);
> > +
> > + if (ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_DELAYED))
> > + return 1;
> > + }
> > +#endif
> > +
> > CLASS(__task_rq_lock, guard)(p);
> > return __ttwu_runnable(guard.rq, p, wake_flags);
> > }
> > @@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
> > update_rq_clock(rq);
> >
> > llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
> > + struct rq *p_rq = task_rq(p);
> > + int ret;
> > +
> > + /*
> > + * This is the ttwu_runnable() case. Notably it is possible for
> > + * on-rq entities to get migrated -- even sched_delayed ones.
>
> I haven't found where the sched_delayed task could migrate on another cpu.
>
> > + */
> > + if (unlikely(p_rq != rq)) {
> > + rq_unlock(rq, &rf);
> > + p_rq = __task_rq_lock(p, &rf);
> > + }
> > +
> > + ret = __ttwu_runnable(p_rq, p, WF_TTWU);
> > +
> > + if (unlikely(p_rq != rq)) {
> > + if (!ret)
> > + set_task_cpu(p, cpu_of(rq));
> > +
> > + __task_rq_unlock(p_rq, &rf);
> > + rq_lock(rq, &rf);
> > + update_rq_clock(rq);
> > + }
> > +
> > + if (ret) {
> > + // XXX ttwu_stat()
> > + continue;
> > + }
> > +
> > + /*
> > + * This is the 'normal' case where the task is blocked.
> > + */
> > +
> > if (WARN_ON_ONCE(p->on_cpu))
> > smp_cond_load_acquire(&p->on_cpu, !VAL);
> >
> > - if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
> > - set_task_cpu(p, cpu_of(rq));
> > -
> > ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
> > }
> >
> > @@ -3974,7 +4032,7 @@ static inline bool ttwu_queue_cond(struc
> >
> > static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
> > {
> > - bool def = sched_feat(TTWU_QUEUE_DEFAULT);
> > + bool def = sched_feat(TTWU_QUEUE_DEFAULT) || (wake_flags & WF_DELAYED);
> >
> > if (!ttwu_queue_cond(p, cpu, def))
> > return false;
> > @@ -4269,8 +4327,8 @@ int try_to_wake_up(struct task_struct *p
> > * __schedule(). See the comment for smp_mb__after_spinlock().
> > *
> > * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
> > - * schedule()'s deactivate_task() has 'happened' and p will no longer
> > - * care about it's own p->state. See the comment in __schedule().
> > + * schedule()'s try_to_block_task() has 'happened' and p will no longer
> > + * care about it's own p->state. See the comment in try_to_block_task().
> > */
> > smp_acquire__after_ctrl_dep();
> >
> > @@ -6712,8 +6770,8 @@ static void __sched notrace __schedule(i
> > preempt = sched_mode == SM_PREEMPT;
> >
> > /*
> > - * We must load prev->state once (task_struct::state is volatile), such
> > - * that we form a control dependency vs deactivate_task() below.
> > + * We must load prev->state once, such that we form a control
> > + * dependency vs try_to_block_task() below.
> > */
> > prev_state = READ_ONCE(prev->__state);
> > if (sched_mode == SM_IDLE) {
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5395,7 +5395,10 @@ static __always_inline void return_cfs_r
> >
> > static void set_delayed(struct sched_entity *se)
> > {
> > - se->sched_delayed = 1;
> > + /*
> > + * See TTWU_QUEUE_DELAYED in ttwu_runnable().
> > + */
> > + smp_store_release(&se->sched_delayed, 1);
> >
> > /*
> > * Delayed se of cfs_rq have no tasks queued on them.
> > --- a/kernel/sched/features.h
> > +++ b/kernel/sched/features.h
> > @@ -82,6 +82,7 @@ SCHED_FEAT(TTWU_QUEUE, false)
> > SCHED_FEAT(TTWU_QUEUE, true)
> > #endif
> > SCHED_FEAT(TTWU_QUEUE_ON_CPU, true)
> > +SCHED_FEAT(TTWU_QUEUE_DELAYED, false)
>
> I'm not sure that the feature will be tested as people mainly test
> default config
>
> > SCHED_FEAT(TTWU_QUEUE_DEFAULT, false)
> >
> > /*
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -2313,6 +2313,7 @@ static inline int task_on_rq_migrating(s
> > #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
> >
> > #define WF_ON_CPU 0x0100
> > +#define WF_DELAYED 0x0200
> >
> > #ifdef CONFIG_SMP
> > static_assert(WF_EXEC == SD_BALANCE_EXEC);
> >
> >
>
--
On Fri, Jun 06, 2025 at 05:03:36PM +0200, Vincent Guittot wrote:
> On Tue, 20 May 2025 at 12:18, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > One of the things lost with introduction of DELAY_DEQUEUE is the
> > ability of TTWU to move those tasks around on wakeup, since they're
> > on_rq, and as such, need to be woken in-place.
>
> I was thinking that you would call select_task_rq() somewhere in the
> wake up path of delayed entity to get a chance to migrate it which was
> one reason for the perf regression (and which would have also been
> useful for EAS case) but IIUC, the task is still enqueued on the same
> CPU but the target cpu will do the enqueue itself instead on the local
> CPU. Or am I missing something ?
Correct. I tried to add that migration into the mix, but then things get
tricky real fast.
Just getting rid of the remote rq lock also helped; these dispatch
threads just need to get on with waking up tasks -- any delay hurts.
> >
> > Doing the in-place thing adds quite a bit of cross-cpu latency, add a
> > little something that gets remote CPUs to do their own in-place
> > wakeups, significantly reducing the rq->lock contention.
> >
> > Reported-by: Chris Mason <clm@meta.com>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> > kernel/sched/core.c | 74 ++++++++++++++++++++++++++++++++++++++++++------
> > kernel/sched/fair.c | 5 ++-
> > kernel/sched/features.h | 1
> > kernel/sched/sched.h | 1
> > 4 files changed, 72 insertions(+), 9 deletions(-)
> >
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3784,6 +3784,8 @@ static int __ttwu_runnable(struct rq *rq
> > return 1;
> > }
> >
> > +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags);
> > +
> > /*
> > * Consider @p being inside a wait loop:
> > *
> > @@ -3811,6 +3813,33 @@ static int __ttwu_runnable(struct rq *rq
> > */
> > static int ttwu_runnable(struct task_struct *p, int wake_flags)
> > {
> > +#ifdef CONFIG_SMP
> > + if (sched_feat(TTWU_QUEUE_DELAYED) && READ_ONCE(p->se.sched_delayed)) {
> > + /*
> > + * Similar to try_to_block_task():
> > + *
> > + * __schedule() ttwu()
> > + * prev_state = prev->state if (p->sched_delayed)
> > + * if (prev_state) smp_acquire__after_ctrl_dep()
> > + * try_to_block_task() p->state = TASK_WAKING
> > + * ... set_delayed()
> > + * RELEASE p->sched_delayed = 1
> > + *
> > + * __schedule() and ttwu() have matching control dependencies.
> > + *
> > + * Notably, once we observe sched_delayed we know the task has
> > + * passed try_to_block_task() and p->state is ours to modify.
> > + *
> > + * TASK_WAKING controls ttwu() concurrency.
> > + */
> > + smp_acquire__after_ctrl_dep();
> > + WRITE_ONCE(p->__state, TASK_WAKING);
> > +
> > + if (ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_DELAYED))
> > + return 1;
> > + }
> > +#endif
> > +
> > CLASS(__task_rq_lock, guard)(p);
> > return __ttwu_runnable(guard.rq, p, wake_flags);
> > }
> > @@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
> > update_rq_clock(rq);
> >
> > llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
> > + struct rq *p_rq = task_rq(p);
> > + int ret;
> > +
> > + /*
> > + * This is the ttwu_runnable() case. Notably it is possible for
> > + * on-rq entities to get migrated -- even sched_delayed ones.
>
> I haven't found where the sched_delayed task could migrate on another cpu.
Doesn't happen often, but it can happen. Nothing really stops it from
happening. E.g. weight-based balancing can do it, as can NUMA balancing
and affinity changes.
On Fri, 6 Jun 2025 at 17:38, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Jun 06, 2025 at 05:03:36PM +0200, Vincent Guittot wrote:
> > On Tue, 20 May 2025 at 12:18, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > One of the things lost with introduction of DELAY_DEQUEUE is the
> > > ability of TTWU to move those tasks around on wakeup, since they're
> > > on_rq, and as such, need to be woken in-place.
> >
> > I was thinking that you would call select_task_rq() somewhere in the
> > wake up path of delayed entity to get a chance to migrate it which was
> > one reason for the perf regression (and which would have also been
> > useful for EAS case) but IIUC, the task is still enqueued on the same
> > CPU but the target cpu will do the enqueue itself instead on the local
> > CPU. Or am I missing something ?
>
> Correct. I tried to add that migration into the mix, but then things get
> tricky real fast.
Yeah, I can imagine
>
> Just getting rid of the remote rq lock also helped; these dispatch
> threads just need to get on with waking up tasks, any delay hurts.
>
> > >
> > > Doing the in-place thing adds quite a bit of cross-cpu latency, add a
> > > little something that gets remote CPUs to do their own in-place
> > > wakeups, significantly reducing the rq->lock contention.
> > >
> > > Reported-by: Chris Mason <clm@meta.com>
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > ---
> > > kernel/sched/core.c | 74 ++++++++++++++++++++++++++++++++++++++++++------
> > > kernel/sched/fair.c | 5 ++-
> > > kernel/sched/features.h | 1
> > > kernel/sched/sched.h | 1
> > > 4 files changed, 72 insertions(+), 9 deletions(-)
> > >
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -3784,6 +3784,8 @@ static int __ttwu_runnable(struct rq *rq
> > > return 1;
> > > }
> > >
> > > +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags);
> > > +
> > > /*
> > > * Consider @p being inside a wait loop:
> > > *
> > > @@ -3811,6 +3813,33 @@ static int __ttwu_runnable(struct rq *rq
> > > */
> > > static int ttwu_runnable(struct task_struct *p, int wake_flags)
> > > {
> > > +#ifdef CONFIG_SMP
> > > + if (sched_feat(TTWU_QUEUE_DELAYED) && READ_ONCE(p->se.sched_delayed)) {
> > > + /*
> > > + * Similar to try_to_block_task():
> > > + *
> > > + * __schedule() ttwu()
> > > + * prev_state = prev->state if (p->sched_delayed)
> > > + * if (prev_state) smp_acquire__after_ctrl_dep()
> > > + * try_to_block_task() p->state = TASK_WAKING
> > > + * ... set_delayed()
> > > + * RELEASE p->sched_delayed = 1
> > > + *
> > > + * __schedule() and ttwu() have matching control dependencies.
> > > + *
> > > + * Notably, once we observe sched_delayed we know the task has
> > > + * passed try_to_block_task() and p->state is ours to modify.
> > > + *
> > > + * TASK_WAKING controls ttwu() concurrency.
> > > + */
> > > + smp_acquire__after_ctrl_dep();
> > > + WRITE_ONCE(p->__state, TASK_WAKING);
> > > +
> > > + if (ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_DELAYED))
> > > + return 1;
> > > + }
> > > +#endif
> > > +
> > > CLASS(__task_rq_lock, guard)(p);
> > > return __ttwu_runnable(guard.rq, p, wake_flags);
> > > }
> > > @@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
> > > update_rq_clock(rq);
> > >
> > > llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
> > > + struct rq *p_rq = task_rq(p);
> > > + int ret;
> > > +
> > > + /*
> > > + * This is the ttwu_runnable() case. Notably it is possible for
> > > + * on-rq entities to get migrated -- even sched_delayed ones.
> >
> > I haven't found where the sched_delayed task could migrate on another cpu.
>
> Doesn't happen often, but it can happen. Nothing really stops it from
> happening. Eg weight based balancing can do it. As can numa balancing
> and affinity changes.
Yes, I agree that delayed tasks can migrate because of load balancing
but not at wake up.
On Fri, Jun 06, 2025 at 06:55:37PM +0200, Vincent Guittot wrote:
> > > > @@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
> > > > update_rq_clock(rq);
> > > >
> > > > llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
> > > > + struct rq *p_rq = task_rq(p);
> > > > + int ret;
> > > > +
> > > > + /*
> > > > + * This is the ttwu_runnable() case. Notably it is possible for
> > > > + * on-rq entities to get migrated -- even sched_delayed ones.
> > >
> > > I haven't found where the sched_delayed task could migrate on another cpu.
> >
> > Doesn't happen often, but it can happen. Nothing really stops it from
> > happening. Eg weight based balancing can do it. As can numa balancing
> > and affinity changes.
>
> Yes, I agree that delayed tasks can migrate because of load balancing
> but not at wake up.
Right, but this here is the case where wakeup races with load-balancing.
Specifically, due to the wake_list, the wakeup can happen while the task
is on CPU N, and by the time the IPI gets processed the task has moved
to CPU M.
It doesn't happen often, but it was 'fun' chasing that fail around for a
day :/
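The race is easy to picture with a small single-threaded userspace sketch
of the same shape: a "task" gets pushed onto one CPU's llist-style wake
list, is migrated before that list is drained, and the consumer therefore
has to re-check which CPU the task now belongs to and chase after it.
This is not kernel code; all names (wakelist_add, drain, ...) are made up
for the illustration.

#include <stdatomic.h>
#include <stdio.h>

struct task {
        _Atomic int cpu;                /* which "rq" currently owns the task */
        struct task *_Atomic next;      /* llist-style single-linked entry    */
};

struct wakelist {
        struct task *_Atomic head;
};

static struct wakelist wl[2];           /* one wake list per "CPU" */

/* llist_add() analog: lock-free push onto a CPU's wake list */
static void wakelist_add(struct wakelist *w, struct task *t)
{
        struct task *old = atomic_load(&w->head);

        do {
                atomic_store_explicit(&t->next, old, memory_order_relaxed);
        } while (!atomic_compare_exchange_weak(&w->head, &old, t));
}

/* llist_del_all() analog: the consumer grabs the whole list at once */
static struct task *wakelist_del_all(struct wakelist *w)
{
        return atomic_exchange(&w->head, (struct task *)NULL);
}

/* sched_ttwu_pending() analog running on "CPU" me */
static void drain(int me)
{
        struct task *t = wakelist_del_all(&wl[me]);
        struct task *next;

        for (; t; t = next) {
                /* save next first, like llist_for_each_entry_safe() */
                next = atomic_load_explicit(&t->next, memory_order_relaxed);

                int cpu = atomic_load(&t->cpu);
                if (cpu != me) {
                        /* the task moved after it was queued: chase after it */
                        printf("cpu%d: task now belongs to cpu%d, re-queueing\n",
                               me, cpu);
                        wakelist_add(&wl[cpu], t);
                        continue;
                }
                printf("cpu%d: waking task locally\n", me);
        }
}

int main(void)
{
        struct task t = { .cpu = 0 };

        wakelist_add(&wl[0], &t);       /* remote wakeup queued for CPU 0       */
        atomic_store(&t.cpu, 1);        /* "load balancer" migrates it to CPU 1 */
        drain(0);                       /* CPU 0 handles the IPI: task is gone  */
        drain(1);                       /* CPU 1 finds it on its own list       */
        return 0;
}

In the patch above the "chase after it" step is the __task_rq_lock() dance
in sched_ttwu_pending() (or the __ttwu_queue_wakelist() re-queue in the
later ttwu_delayed() variant); the sketch only shows why the re-check is
needed at all.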
On Wed, 11 Jun 2025 at 11:39, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Jun 06, 2025 at 06:55:37PM +0200, Vincent Guittot wrote:
> > > > > @@ -3830,12 +3859,41 @@ void sched_ttwu_pending(void *arg)
> > > > > update_rq_clock(rq);
> > > > >
> > > > > llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
> > > > > + struct rq *p_rq = task_rq(p);
> > > > > + int ret;
> > > > > +
> > > > > + /*
> > > > > + * This is the ttwu_runnable() case. Notably it is possible for
> > > > > + * on-rq entities to get migrated -- even sched_delayed ones.
> > > >
> > > > I haven't found where the sched_delayed task could migrate on another cpu.
> > >
> > > Doesn't happen often, but it can happen. Nothing really stops it from
> > > happening. Eg weight based balancing can do it. As can numa balancing
> > > and affinity changes.
> >
> > Yes, I agree that delayed tasks can migrate because of load balancing
> > but not at wake up.
>
> Right, but this here is the case where wakeup races with load-balancing.
> Specifically, due to the wake_list, the wakeup can happen while the task
> is on CPU N, and by the time the IPI gets processed the task has moved
> to CPU M.
>
> It doesn't happen often, but it was 'fun' chasing that fail around for a
> day :/
Ok, it makes sense now.