[PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks

Peter Zijlstra posted 12 patches 3 months, 1 week ago
Posted by Peter Zijlstra 3 months, 1 week ago
One of the more expensive things to do is take a remote runqueue lock;
which is exactly what ttwu_runnable() ends up doing. However, in the
case of sched_delayed tasks it is possible to queue up an IPI instead.
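
In rough outline, the wakeup side then becomes (a simplified sketch of the
ttwu_runnable() hunk below; the memory-ordering details and the remote end
of the IPI are elided):

	if (sched_feat(TTWU_QUEUE_DELAYED) && READ_ONCE(p->se.sched_delayed)) {
		/* sched_delayed observed: p->state is ours to modify */
		WRITE_ONCE(p->__state, TASK_WAKING);
		p->wake_cpu = smp_processor_id();
		/* hand the wakeup to task_cpu(p) via the wakelist IPI */
		if (ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_DELAYED))
			return 1;
	}
	/* otherwise, fall back to taking the (possibly remote) rq lock */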

Reported-by: Chris Mason <clm@meta.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250520101727.984171377@infradead.org
---
 include/linux/sched.h   |    1 
 kernel/sched/core.c     |   96 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/fair.c     |   17 ++++++++
 kernel/sched/features.h |    1 
 kernel/sched/sched.h    |    1 
 5 files changed, 110 insertions(+), 6 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -984,6 +984,7 @@ struct task_struct {
 	 * ->sched_remote_wakeup gets used, so it can be in this word.
 	 */
 	unsigned			sched_remote_wakeup:1;
+	unsigned			sched_remote_delayed:1;
 #ifdef CONFIG_RT_MUTEXES
 	unsigned			sched_rt_mutex:1;
 #endif
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -675,7 +675,12 @@ struct rq *__task_rq_lock(struct task_st
 {
 	struct rq *rq;
 
-	lockdep_assert_held(&p->pi_lock);
+	/*
+	 * TASK_WAKING is used to serialize the remote end of wakeup, rather
+	 * than p->pi_lock.
+	 */
+	lockdep_assert(p->__state == TASK_WAKING ||
+		       lockdep_is_held(&p->pi_lock) != LOCK_STATE_NOT_HELD);
 
 	for (;;) {
 		rq = task_rq(p);
@@ -3727,6 +3732,8 @@ ttwu_do_activate(struct rq *rq, struct t
 	}
 }
 
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags);
+
 /*
  * Consider @p being inside a wait loop:
  *
@@ -3754,6 +3761,35 @@ ttwu_do_activate(struct rq *rq, struct t
  */
 static int ttwu_runnable(struct task_struct *p, int wake_flags)
 {
+	if (sched_feat(TTWU_QUEUE_DELAYED) && READ_ONCE(p->se.sched_delayed)) {
+		/*
+		 * Similar to try_to_block_task():
+		 *
+		 * __schedule()				ttwu()
+		 *   prev_state = prev->state		  if (p->sched_delayed)
+		 *   if (prev_state)			     smp_acquire__after_ctrl_dep()
+		 *     try_to_block_task()		     p->state = TASK_WAKING
+		 *       ... set_delayed()
+		 *         RELEASE p->sched_delayed = 1
+		 *
+		 * __schedule() and ttwu() have matching control dependencies.
+		 *
+		 * Notably, once we observe sched_delayed we know the task has
+		 * passed try_to_block_task() and p->state is ours to modify.
+		 *
+		 * TASK_WAKING controls ttwu() concurrency.
+		 */
+		smp_acquire__after_ctrl_dep();
+		WRITE_ONCE(p->__state, TASK_WAKING);
+		/*
+		 * Bit of a hack, see select_task_rq_fair()'s WF_DELAYED case.
+		 */
+		p->wake_cpu = smp_processor_id();
+
+		if (ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_DELAYED))
+			return 1;
+	}
+
 	CLASS(__task_rq_lock, guard)(p);
 	struct rq *rq = guard.rq;
 
@@ -3776,6 +3812,8 @@ static int ttwu_runnable(struct task_str
 	return 1;
 }
 
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu);
+
 static inline bool ttwu_do_migrate(struct rq *rq, struct task_struct *p, int cpu)
 {
 	struct rq *p_rq = rq ? : task_rq(p);
@@ -3801,6 +3839,52 @@ static inline bool ttwu_do_migrate(struc
 	return true;
 }
 
+static int ttwu_delayed(struct rq *rq, struct task_struct *p, int wake_flags,
+			struct rq_flags *rf)
+{
+	struct rq *p_rq = task_rq(p);
+	int cpu;
+
+	/*
+	 * Notably it is possible for on-rq entities to get migrated -- even
+	 * sched_delayed ones. This should be rare though, so flip the locks
+	 * rather than IPI chase after it.
+	 */
+	if (unlikely(rq != p_rq)) {
+		rq_unlock(rq, rf);
+		p_rq = __task_rq_lock(p, rf);
+		update_rq_clock(p_rq);
+	}
+
+	if (task_on_rq_queued(p))
+		dequeue_task(p_rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
+	/*
+	 * NOTE: unlike the regular try_to_wake_up() path, this runs both
+	 * select_task_rq() and ttwu_do_migrate() while holding rq->lock
+	 * rather than p->pi_lock.
+	 */
+	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
+	if (ttwu_do_migrate(rq, p, cpu))
+		wake_flags |= WF_MIGRATED;
+
+	if (unlikely(rq != p_rq)) {
+		__task_rq_unlock(p_rq, rf);
+		rq_lock(rq, rf);
+	}
+
+	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+	p->sched_remote_delayed = 0;
+
+	/* it wants to run here */
+	if (cpu_of(rq) == cpu)
+		return 0;
+
+	/* shoot it to the CPU it wants to run on */
+	__ttwu_queue_wakelist(p, cpu);
+	return 1;
+}
+
 void sched_ttwu_pending(void *arg)
 {
 	struct llist_node *llist = arg;
@@ -3819,12 +3903,13 @@ void sched_ttwu_pending(void *arg)
 		if (WARN_ON_ONCE(p->on_cpu))
 			smp_cond_load_acquire(&p->on_cpu, !VAL);
 
-		if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
-			set_task_cpu(p, cpu_of(rq));
-
 		if (p->sched_remote_wakeup)
 			wake_flags |= WF_MIGRATED;
 
+		if (p->sched_remote_delayed &&
+		    ttwu_delayed(rq, p, wake_flags | WF_DELAYED, &guard.rf))
+			continue;
+
 		ttwu_do_activate(rq, p, wake_flags, &guard.rf);
 	}
 
@@ -3964,12 +4049,13 @@ static inline bool ttwu_queue_cond(struc
 
 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
-	bool def = sched_feat(TTWU_QUEUE_DEFAULT);
+	bool def = sched_feat(TTWU_QUEUE_DEFAULT) || (wake_flags & WF_DELAYED);
 
 	if (!ttwu_queue_cond(p, cpu, def))
 		return false;
 
 	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+	p->sched_remote_delayed = !!(wake_flags & WF_DELAYED);
 
 	__ttwu_queue_wakelist(p, cpu);
 	return true;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5327,7 +5327,10 @@ static __always_inline void return_cfs_r
 
 static void set_delayed(struct sched_entity *se)
 {
-	se->sched_delayed = 1;
+	/*
+	 * See TTWU_QUEUE_DELAYED in ttwu_runnable().
+	 */
+	smp_store_release(&se->sched_delayed, 1);
 
 	/*
 	 * Delayed se of cfs_rq have no tasks queued on them.
@@ -8481,6 +8484,18 @@ select_task_rq_fair(struct task_struct *
 	/* SD_flags and WF_flags share the first nibble */
 	int sd_flag = wake_flags & 0xF;
 
+	if (wake_flags & WF_DELAYED) {
+		/*
+		 * This is the ttwu_delayed() case; where prev_cpu is in fact
+		 * the CPU that did the wakeup, while @p is running on the
+		 * current CPU.
+		 *
+		 * Make sure to flip them the right way around, otherwise
+		 * wake-affine is going to do the wrong thing.
+		 */
+		swap(cpu, new_cpu);
+	}
+
 	/*
 	 * required for stable ->cpus_allowed
 	 */
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -82,6 +82,7 @@ SCHED_FEAT(TTWU_QUEUE, false)
 SCHED_FEAT(TTWU_QUEUE, true)
 #endif
 SCHED_FEAT(TTWU_QUEUE_ON_CPU, true)
+SCHED_FEAT(TTWU_QUEUE_DELAYED, true)
 SCHED_FEAT(TTWU_QUEUE_DEFAULT, false)
 
 /*
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2280,6 +2280,7 @@ static inline int task_on_rq_migrating(s
 #define WF_RQ_SELECTED		0x80 /* ->select_task_rq() was called */
 
 #define WF_ON_CPU		0x0100
+#define WF_DELAYED		0x0200
 
 static_assert(WF_EXEC == SD_BALANCE_EXEC);
 static_assert(WF_FORK == SD_BALANCE_FORK);
Re: [PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks
Posted by Shrikanth Hegde 2 months, 2 weeks ago

On 7/2/25 17:19, Peter Zijlstra wrote:
> One of the more expensive things to do is take a remote runqueue lock;
> which is exactly what ttwu_runnable() ends up doing. However, in the
> case of sched_delayed tasks it is possible to queue up an IPI instead.
> 
> Reported-by: Chris Mason <clm@meta.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: https://lkml.kernel.org/r/20250520101727.984171377@infradead.org

[...]

> @@ -3801,6 +3839,52 @@ static inline bool ttwu_do_migrate(struc
>   	return true;
>   }
>   
> +static int ttwu_delayed(struct rq *rq, struct task_struct *p, int wake_flags,
> +			struct rq_flags *rf)
> +{
> +	struct rq *p_rq = task_rq(p);
> +	int cpu;
> +
> +	/*
> +	 * Notably it is possible for on-rq entities to get migrated -- even
> +	 * sched_delayed ones. This should be rare though, so flip the locks
> +	 * rather than IPI chase after it.
> +	 */
> +	if (unlikely(rq != p_rq)) {
> +		rq_unlock(rq, rf);
> +		p_rq = __task_rq_lock(p, rf);
> +		update_rq_clock(p_rq);
> +	}
> +
> +	if (task_on_rq_queued(p))
> +		dequeue_task(p_rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> +
> +	/*
> +	 * NOTE: unlike the regular try_to_wake_up() path, this runs both
> +	 * select_task_rq() and ttwu_do_migrate() while holding rq->lock
> +	 * rather than p->pi_lock.
> +	 */

When we get here, it is because p->wake_cpu is a remote CPU and taking the
remote rq lock would be costly.

So when p->wake_cpu is passed along, we could eventually end up fetching that
remote rq again anyway, such as in the idle_cpu() checks, which could also be
costly, no?

Why is there a need for select_task_rq() here again? Why can't ttwu_do_activate()
be done here instead?

> +	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
> +	if (ttwu_do_migrate(rq, p, cpu))
> +		wake_flags |= WF_MIGRATED;

[...]
Re: [PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks
Posted by Dietmar Eggemann 3 months ago
On 02/07/2025 13:49, Peter Zijlstra wrote:

[...]

> @@ -3801,6 +3839,52 @@ static inline bool ttwu_do_migrate(struc
>  	return true;
>  }
>  
> +static int ttwu_delayed(struct rq *rq, struct task_struct *p, int wake_flags,
> +			struct rq_flags *rf)
> +{
> +	struct rq *p_rq = task_rq(p);
> +	int cpu;
> +
> +	/*
> +	 * Notably it is possible for on-rq entities to get migrated -- even
> +	 * sched_delayed ones. This should be rare though, so flip the locks
> +	 * rather than IPI chase after it.
> +	 */
> +	if (unlikely(rq != p_rq)) {
> +		rq_unlock(rq, rf);
> +		p_rq = __task_rq_lock(p, rf);
> +		update_rq_clock(p_rq);
> +	}
> +
> +	if (task_on_rq_queued(p))
> +		dequeue_task(p_rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> +
> +	/*
> +	 * NOTE: unlike the regular try_to_wake_up() path, this runs both
> +	 * select_task_rq() and ttwu_do_migrate() while holding rq->lock
> +	 * rather than p->pi_lock.
> +	 */
> +	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);

There are 'lockdep_assert_held(&p->pi_lock)'s in select_task_rq() and
select_task_rq_fair() which should trigger IMHO? Can they be changed the
same way as __task_rq_lock()?

> +	if (ttwu_do_migrate(rq, p, cpu))
> +		wake_flags |= WF_MIGRATED;

[...]

>  /*
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2280,6 +2280,7 @@ static inline int task_on_rq_migrating(s
>  #define WF_RQ_SELECTED		0x80 /* ->select_task_rq() was called */
>  
>  #define WF_ON_CPU		0x0100

Looks like this is still not used. Not sure whether it can be removed or
you wanted to add a condition for this as well?
Re: [PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks
Posted by Peter Zijlstra 3 months ago
On Tue, Jul 08, 2025 at 02:44:56PM +0200, Dietmar Eggemann wrote:

> > +	/*
> > +	 * NOTE: unlike the regular try_to_wake_up() path, this runs both
> > +	 * select_task_rq() and ttwu_do_migrate() while holding rq->lock
> > +	 * rather than p->pi_lock.
> > +	 */
> > +	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
> 
> There are 'lockdep_assert_held(&p->pi_lock)'s in select_task_rq() and
> select_task_rq_fair() which should trigger IMHO? Can they be changed the
> same way as __task_rq_lock()?

It needs a slightly different fix; notably the reason for these is the
stability of the cpumasks. For that holding either p->pi_lock or
rq->lock is sufficient.

Something a little like so...

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3557,13 +3557,13 @@ static int select_fallback_rq(int cpu, s
 	return dest_cpu;
 }
 
-/*
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
- */
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
 {
-	lockdep_assert_held(&p->pi_lock);
+	/*
+	 * Ensure the sched_setaffinity() state is stable.
+	 */
+	lockdep_assert_sched_held(p);
 
 	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
 		cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8499,7 +8499,7 @@ select_task_rq_fair(struct task_struct *
 	/*
 	 * required for stable ->cpus_allowed
 	 */
-	lockdep_assert_held(&p->pi_lock);
+	lockdep_assert_sched_held(p);
 	if (wake_flags & WF_TTWU) {
 		record_wakee(p);
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1500,6 +1500,12 @@ static inline void lockdep_assert_rq_hel
 	lockdep_assert_held(__rq_lockp(rq));
 }
 
+static inline void lockdep_assert_sched_held(struct task_struct *p)
+{
+	lockdep_assert(lockdep_is_held(&p->pi_lock) ||
+		       lockdep_is_held(__rq_lockp(task_rq(p))));
+}
+
 extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass);
 extern bool raw_spin_rq_trylock(struct rq *rq);
 extern void raw_spin_rq_unlock(struct rq *rq);
Re: [PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks
Posted by Peter Zijlstra 3 months ago
On Tue, Jul 08, 2025 at 02:44:56PM +0200, Dietmar Eggemann wrote:
> On 02/07/2025 13:49, Peter Zijlstra wrote:
> 
> [...]
> 
> > @@ -3801,6 +3839,52 @@ static inline bool ttwu_do_migrate(struc
> >  	return true;
> >  }
> >  
> > +static int ttwu_delayed(struct rq *rq, struct task_struct *p, int wake_flags,
> > +			struct rq_flags *rf)
> > +{
> > +	struct rq *p_rq = task_rq(p);
> > +	int cpu;
> > +
> > +	/*
> > +	 * Notably it is possible for on-rq entities to get migrated -- even
> > +	 * sched_delayed ones. This should be rare though, so flip the locks
> > +	 * rather than IPI chase after it.
> > +	 */
> > +	if (unlikely(rq != p_rq)) {
> > +		rq_unlock(rq, rf);
> > +		p_rq = __task_rq_lock(p, rf);
> > +		update_rq_clock(p_rq);
> > +	}
> > +
> > +	if (task_on_rq_queued(p))
> > +		dequeue_task(p_rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> > +
> > +	/*
> > +	 * NOTE: unlike the regular try_to_wake_up() path, this runs both
> > +	 * select_task_rq() and ttwu_do_migrate() while holding rq->lock
> > +	 * rather than p->pi_lock.
> > +	 */
> > +	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
> 
> There are 'lockdep_assert_held(&p->pi_lock)'s in select_task_rq() and
> select_task_rq_fair() which should trigger IMHO? Can they be changed the
> same way as __task_rq_lock()?

And not a single robot has yet reported this :-(.. Yeah, let me go look.
Seeing how this was performance stuff, I clearly did not run enough
lockdep builds :/

> > +	if (ttwu_do_migrate(rq, p, cpu))
> > +		wake_flags |= WF_MIGRATED;
> 
> [...]
> 
> >  /*
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -2280,6 +2280,7 @@ static inline int task_on_rq_migrating(s
> >  #define WF_RQ_SELECTED		0x80 /* ->select_task_rq() was called */
> >  
> >  #define WF_ON_CPU		0x0100
> 
> Looks like this is still not used. Not sure whether it can be removed or
> you wanted to add a condition for this as well?

Bah, I'm sure I deleted that at some point. Let me try killing it again
:-)
Re: [PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks
Posted by Phil Auld 3 months ago
Hi Peter,

On Wed, Jul 02, 2025 at 01:49:36PM +0200 Peter Zijlstra wrote:
> One of the more expensive things to do is take a remote runqueue lock;
> which is exactly what ttwu_runnable() ends up doing. However, in the
> case of sched_delayed tasks it is possible to queue up an IPI instead.
> 
> Reported-by: Chris Mason <clm@meta.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: https://lkml.kernel.org/r/20250520101727.984171377@infradead.org

[...]

> +	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
> +	if (ttwu_do_migrate(rq, p, cpu))
> +		wake_flags |= WF_MIGRATED;

This doesn't compile because ttwu_do_migrate() doesn't take a *rq.

It's easy enough to fix up and I'll try to have our perf team try these
out.

Thanks,
Phil

[...]

--
Re: [PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks
Posted by Peter Zijlstra 3 months ago
On Thu, Jul 03, 2025 at 12:00:27PM -0400, Phil Auld wrote:

> > +	if (ttwu_do_migrate(rq, p, cpu))
> > +
> 
> This doesn't compile because ttwu_do_migrate() doesn't take a *rq.
> 
> It's easy enough to fix up and I'll try to have our perf team try these
> out. 

I'm confused, isn't that what patch 7 does?

Also, I updated the git tree today, fixing a silly mistake. But I don't
remember build failures here.
Re: [PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks
Posted by K Prateek Nayak 3 months ago
Hello Peter,

On 7/3/2025 10:17 PM, Peter Zijlstra wrote:
> Also, I updated the git tree today, fixing a silly mistake. But I don't
> remember build failures here.

Running HammerDB + MySQL on bare metal results in splats from
assert_clock_updated() ~10min into the run on peterz:sched/core at
commit 098ac7dd8a57 ("sched: Add ttwu_queue support for delayed
tasks"), which I hope is the updated one.

I'm running with the following diff and haven't seen a splat yet
(slightly longer than 10min into HammerDB; still testing):

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9855121c2440..71ac0e7effeb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3871,7 +3871,7 @@ static int ttwu_delayed(struct rq *rq, struct task_struct *p, int wake_flags,
  	if (unlikely(rq != p_rq)) {
  		__task_rq_unlock(p_rq, rf);
  		rq_lock(rq, rf);
-		update_rq_clock(p_rq);
+		update_rq_clock(rq);
  	}
  
  	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
---

P.S. The full set of splats is as follows:

     ------------[ cut here ]------------
     WARNING: CPU: 141 PID: 8164 at kernel/sched/sched.h:1643 update_curr_se+0x5c/0x60
     Modules linked in: ...
     CPU: 141 UID: 1000 PID: 8164 Comm: mysqld Tainted: G     U              6.16.0-rc1-peterz-queue-098ac7+ #869 PREEMPT(voluntary)
     Tainted: [U]=USER
     Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
     RIP: 0010:update_curr_se+0x5c/0x60
     Code: be a8 00 00 00 00 48 8d 96 80 02 00 00 74 07 48 8d 96 00 01 00 00 48 8b 4a 60 48 39 c1 48 0f 4c c8 48 89 4a 60 e9 1f af d5 ff <0f> 0b eb ae 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f
     RSP: 0000:ffffd162151ffd30 EFLAGS: 00010097
     RAX: 0000000000000001 RBX: ffff8bb25c903500 RCX: ffffffff8afb3ca0
     RDX: 0000000000000009 RSI: ffff8bb25c903500 RDI: ffff8bf07e571b00
     RBP: ffff8bb25ea94400 R08: 0000014f849737b3 R09: 0000000000000009
     R10: 0000000000000001 R11: 0000000000000000 R12: ffff8bf07e571b00
     R13: ffff8bf07e571b00 R14: ffff8bb25c903500 R15: ffff8bb25ea94400
     FS:  00007fcd9e01f640(0000) GS:ffff8bf0f0858000(0000) knlGS:0000000000000000
     CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
     CR2: 00007fcf3990c004 CR3: 0000000147f9900b CR4: 0000000000f70ef0
     PKRU: 55555554
     Call Trace:
      <TASK>
      update_curr+0x31/0x240
      enqueue_entity+0x2e/0x470
      enqueue_task_fair+0x122/0x850
      enqueue_task+0x88/0x1c0
      ttwu_do_activate+0x75/0x230
      sched_ttwu_pending+0x2b9/0x430
      __flush_smp_call_function_queue+0x140/0x420
      __sysvec_call_function_single+0x1c/0xb0
      sysvec_call_function_single+0x43/0xb0
      asm_sysvec_call_function_single+0x1a/0x20
     RIP: 0033:0x202a6de
     Code: 89 f1 4d 89 c4 4d 8d 44 24 01 41 0f b6 50 ff 49 8d 71 01 44 0f b6 5e ff 89 d0 44 29 d8 0f 85 01 02 00 00 48 89 ca 48 83 ea 01 <0f> 84 0c 02 00 00 4c 39 c7 75 c7 48 89 4d 80 4c 89 55 88 4c 89 4d
     RSP: 002b:00007fcd9e01a1f0 EFLAGS: 00000202
     RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000003
     RDX: 0000000000000002 RSI: 00007fc2684926c2 RDI: 00007fb96fe862b7
     RBP: 00007fcd9e01a270 R08: 00007fb96fe862b5 R09: 00007fc2684926c1
     R10: 0000000000000000 R11: 0000000000000000 R12: 00007fb96fe862b4
     R13: 0000000000000000 R14: 00000000ffffffff R15: 00007fcd9e01a310
      </TASK>
     ---[ end trace 0000000000000000 ]---

     ------------[ cut here ]------------
     WARNING: CPU: 141 PID: 8164 at kernel/sched/sched.h:1643 update_load_avg+0x6f7/0x780
     Modules linked in: ...
     CPU: 141 UID: 1000 PID: 8164 Comm: mysqld Tainted: G     U  W           6.16.0-rc1-peterz-queue-098ac7+ #869 PREEMPT(voluntary)
     Tainted: [U]=USER, [W]=WARN
     Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
     RIP: 0010:update_load_avg+0x6f7/0x780
     Code: 42 f8 e9 c8 fa ff ff 39 c1 0f 42 c8 e9 4a fb ff ff 31 c0 45 31 c9 e9 cf fb ff ff 4c 8b a7 80 01 00 00 49 29 d4 e9 5a f9 ff ff <0f> 0b e9 42 f9 ff ff 69 d0 7e b6 00 00 e9 f4 fb ff ff 39 c2 0f 42
     RSP: 0000:ffffd162151ffd20 EFLAGS: 00010097
     RAX: ffff8bf07e571b00 RBX: ffff8bb25ea94400 RCX: 0000000000000041
     RDX: 0000000000000000 RSI: ffff8bb237533500 RDI: ffff8bb25ea94400
     RBP: ffff8bb237533500 R08: 000000000000132d R09: 0000000000000009
     R10: 0000000000000001 R11: 0000000000000000 R12: ffff8bf07e571b00
     R13: 0000000000000005 R14: ffff8bb25c903500 R15: ffff8bb25ea94400
     FS:  00007fcd9e01f640(0000) GS:ffff8bf0f0858000(0000) knlGS:0000000000000000
     CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
     CR2: 00007fcf3990c004 CR3: 0000000147f9900b CR4: 0000000000f70ef0
     PKRU: 55555554
     Call Trace:
      <TASK>
      ? srso_alias_return_thunk+0x5/0xfbef5
      ? update_curr+0x1bd/0x240
      enqueue_entity+0x3e/0x470
      enqueue_task_fair+0x122/0x850
      enqueue_task+0x88/0x1c0
      ttwu_do_activate+0x75/0x230
      sched_ttwu_pending+0x2b9/0x430
      __flush_smp_call_function_queue+0x140/0x420
      __sysvec_call_function_single+0x1c/0xb0
      sysvec_call_function_single+0x43/0xb0
      asm_sysvec_call_function_single+0x1a/0x20
     RIP: 0033:0x202a6de
     Code: 89 f1 4d 89 c4 4d 8d 44 24 01 41 0f b6 50 ff 49 8d 71 01 44 0f b6 5e ff 89 d0 44 29 d8 0f 85 01 02 00 00 48 89 ca 48 83 ea 01 <0f> 84 0c 02 00 00 4c 39 c7 75 c7 48 89 4d 80 4c 89 55 88 4c 89 4d
     RSP: 002b:00007fcd9e01a1f0 EFLAGS: 00000202
     RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000003
     RDX: 0000000000000002 RSI: 00007fc2684926c2 RDI: 00007fb96fe862b7
     RBP: 00007fcd9e01a270 R08: 00007fb96fe862b5 R09: 00007fc2684926c1
     R10: 0000000000000000 R11: 0000000000000000 R12: 00007fb96fe862b4
     R13: 0000000000000000 R14: 00000000ffffffff R15: 00007fcd9e01a310
      </TASK>
     ---[ end trace 0000000000000000 ]---

     ------------[ cut here ]------------
     WARNING: CPU: 141 PID: 8164 at kernel/sched/sched.h:1643 enqueue_task+0x168/0x1c0
     Modules linked in: ...
     CPU: 141 UID: 1000 PID: 8164 Comm: mysqld Tainted: G     U  W           6.16.0-rc1-peterz-queue-098ac7+ #869 PREEMPT(voluntary)
     Tainted: [U]=USER, [W]=WARN
     Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.7.3 03/30/2022
     RIP: 0010:enqueue_task+0x168/0x1c0
     Code: ee 4c 89 e7 5d 41 5c 41 5d e9 24 d0 ff ff 41 f7 c5 00 02 00 00 0f 85 e4 fe ff ff e9 18 ff ff ff e8 3d f0 ff ff e9 b9 fe ff ff <0f> 0b eb ab 85 c0 74 10 80 fa 01 19 d2 83 e2 f6 83 c2 0e e9 7a ff
     RSP: 0000:ffffd162151ffe00 EFLAGS: 00010097
     RAX: 000000003cc00001 RBX: ffff8bf07e571b00 RCX: 0000000000000000
     RDX: 000000002721340a RSI: 0000014fabb84509 RDI: ffff8bf07e55cd80
     RBP: ffff8bb237533480 R08: 0000014fabb84509 R09: 0000000000000001
     R10: 0000000000000001 R11: 0000000000000000 R12: ffff8bf07e571b00
     R13: 0000000000000009 R14: ffffd162151ffe90 R15: 0000000000000008
     FS:  00007fcd9e01f640(0000) GS:ffff8bf0f0858000(0000) knlGS:0000000000000000
     CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
     CR2: 00007fcf3990c004 CR3: 0000000147f9900b CR4: 0000000000f70ef0
     PKRU: 55555554
     Call Trace:
      <TASK>
      ttwu_do_activate+0x75/0x230
      sched_ttwu_pending+0x2b9/0x430
      __flush_smp_call_function_queue+0x140/0x420
      __sysvec_call_function_single+0x1c/0xb0
      sysvec_call_function_single+0x43/0xb0
      asm_sysvec_call_function_single+0x1a/0x20
     RIP: 0033:0x202a6de
     Code: 89 f1 4d 89 c4 4d 8d 44 24 01 41 0f b6 50 ff 49 8d 71 01 44 0f b6 5e ff 89 d0 44 29 d8 0f 85 01 02 00 00 48 89 ca 48 83 ea 01 <0f> 84 0c 02 00 00 4c 39 c7 75 c7 48 89 4d 80 4c 89 55 88 4c 89 4d
     RSP: 002b:00007fcd9e01a1f0 EFLAGS: 00000202
     RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000003
     RDX: 0000000000000002 RSI: 00007fc2684926c2 RDI: 00007fb96fe862b7
     RBP: 00007fcd9e01a270 R08: 00007fb96fe862b5 R09: 00007fc2684926c1
     R10: 0000000000000000 R11: 0000000000000000 R12: 00007fb96fe862b4
     R13: 0000000000000000 R14: 00000000ffffffff R15: 00007fcd9e01a310
      </TASK>
     ---[ end trace 0000000000000000 ]---

-- 
Thanks and Regards,
Prateek
Re: [PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks
Posted by Peter Zijlstra 3 months ago
On Fri, Jul 04, 2025 at 11:43:43AM +0530, K Prateek Nayak wrote:
> Hello Peter,
> 
> On 7/3/2025 10:17 PM, Peter Zijlstra wrote:
> > Also, I updated the git tree today, fixing a silly mistake. But I don't
> > remember build failures here.
> 
> Running HammerDB + MySQL on baremetal results in a splats from
> assert_clock_updated() ~10min into the run on peterz:sched/core at
> commit 098ac7dd8a57 ("sched: Add ttwu_queue support for delayed
> tasks") which I hope is the updated one.
> 
> I'm running with the following diff and haven't seen a splat yet
> (slightly longer than 10min into HammerDB; still testing):
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9855121c2440..71ac0e7effeb 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3871,7 +3871,7 @@ static int ttwu_delayed(struct rq *rq, struct task_struct *p, int wake_flags,
>  	if (unlikely(rq != p_rq)) {
>  		__task_rq_unlock(p_rq, rf);
>  		rq_lock(rq, rf);
> -		update_rq_clock(p_rq);
> +		update_rq_clock(rq);
>  	}
>  	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);

Damn,... I did the edit right on the test box and then messed it up when
editing the patch :-(

I'll go fix.
Re: [PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks
Posted by Phil Auld 3 months ago
On Thu, Jul 03, 2025 at 06:47:08PM +0200 Peter Zijlstra wrote:
> On Thu, Jul 03, 2025 at 12:00:27PM -0400, Phil Auld wrote:
> 
> > > +	if (ttwu_do_migrate(rq, p, cpu))
> > > +
> > 
> > This doesn't compile because ttwu_do_migrate() doesn't take a *rq.
> > 
> > It's easy enough to fix up and I'll try to have our perf team try these
> > out. 
> 
> I'm confused, isn't that what patch 7 does?
>

Heh, I seem to have not had patch 7 (psi did not make it through my
stupid gmail filters). I did look through all the ones I had but did
not look at the numbers, so I didn't see I was missing one...

Nevermind.


Cheers,
Phil

> Also, I updated the git tree today, fixing a silly mistake. But I don't
> remember build failures here.
> 

--
Re: [PATCH v2 12/12] sched: Add ttwu_queue support for delayed tasks
Posted by Phil Auld 2 months, 3 weeks ago
Hi Peter,

On Thu, Jul 03, 2025 at 01:11:27PM -0400 Phil Auld wrote:
> On Thu, Jul 03, 2025 at 06:47:08PM +0200 Peter Zijlstra wrote:
> > On Thu, Jul 03, 2025 at 12:00:27PM -0400, Phil Auld wrote:
> > 
> > > > +	if (ttwu_do_migrate(rq, p, cpu))
> > > > +
> > > 
> > > This doesn't compile because ttwu_do_migrate() doesn't take a *rq.
> > > 
> > > It's easy enough to fix up and I'll try to have our perf team try these
> > > out. 
> > 

For the second part of this email...

Our perf team reports that this series (as originally posted, without the minor
fixups) addresses the randwrite issue we were seeing with delayed dequeue
and shows a bit better performance on some other fs-related tests.

It's neutral on other tests, so modulo the issues others reported I think
we'd be happy to have these in. We did not see any new regressions.

Thanks!


Cheers,
Phil
--