[PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup

Posted by K Prateek Nayak 1 month, 2 weeks ago
Since sched_delayed tasks remain on the "rq->cfs_tasks" list even after
blocking, they can be migrated from one runqueue to another in a delayed
state by the load balancer. When they are later requeued or woken up on
the same CPU via the try_to_wake_up() path, the activation is unaware of
the migration. This trips up the PSI accounting since, in terms of
enqueue flags, PSI only considers the following combination a wakeup:

    (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)

This can lead to an inconsistent PSI task state splat similar to:

    psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4 # Without partial fixup from this patch
    psi: inconsistent task state! task=... cpu=... psi_flags=0 clear=4 set=1 # With partial fixup from this patch

Tracking the PSI changes along with task's state revealed the following
series of events:

    psi_task_switch: psi_flags=4 clear=4 set=1 # sched_delayed is set to 1
    psi_dequeue:     psi_flags=1 clear=1 set=0 # dequeued for migration
    psi_enqueue:     psi_flags=0 clear=0 set=4 # enqueued after migration
    psi_enqueue:     psi_flags=4 clear=1 set=4 # wakeup after migration
    psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=1 set=4
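
To help decode the psi_flags/clear/set values above, these are the PSI
task state bits as defined in include/linux/psi_types.h around the time
of this series (reproduced here only as a reading aid):

    #define TSK_IOWAIT            (1 << 0)  /* 1 */
    #define TSK_MEMSTALL          (1 << 1)  /* 2 */
    #define TSK_RUNNING           (1 << 2)  /* 4 */
    #define TSK_MEMSTALL_RUNNING  (1 << 3)  /* 8 */
    #define TSK_ONCPU             (1 << 4)  /* 16 */

The final line thus reads: the task is marked TSK_RUNNING (4), yet the
wakeup tries to clear TSK_IOWAIT (1), which is no longer set.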

Moving psi_enqueue() to after "p->sched_class->enqueue_task()" and
skipping the enqueue accounting until the delayed task is actually
woken up (referred to as the partial fixup above) changes the above
scenario to the following:

    psi_task_switch: psi_flags=4 clear=4 set=1 # sched_delayed is set to 1
    psi_dequeue:     psi_flags=1 clear=1 set=0 # dequeued for migration
    psi_enqueue:     psi_flags=0 clear=0 set=0 # enqueued after migration, sched delayed
    psi_enqueue:     psi_flags=0 clear=1 set=4 # wakeup of delayed task
    psi: inconsistent task state! task=... cpu=... psi_flags=0 clear=1 set=4

psi_enqueue() tries to clear TSK_IOWAIT since it believes the task has
not migrated, owing to the lack of the ENQUEUE_MIGRATED flag in the case
of a requeue or a full wakeup on "p->wake_cpu". In fact, TSK_IOWAIT was
already cleared during the dequeue for migration and was never set
again.
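
For context, below is a condensed sketch of the pre-series
psi_enqueue() (paraphrasing the "before" side of the stats.h hunks in
this thread; the psi_disabled static-key check is omitted for brevity),
showing that TSK_IOWAIT is only cleared on the wakeup path:

    static inline void psi_enqueue(struct task_struct *p, bool wakeup)
    {
            int clear = 0, set = TSK_RUNNING;

            if (p->in_memstall)
                    set |= TSK_MEMSTALL_RUNNING;

            if (!wakeup) {                  /* migration requeue */
                    if (p->in_memstall)
                            set |= TSK_MEMSTALL;
            } else {                        /* wakeup */
                    if (p->in_iowait)
                            clear |= TSK_IOWAIT; /* stale if cleared at dequeue */
            }

            psi_task_change(p, clear, set);
    }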

Define "DELAYED_MIGRATED" and set it in "p->migration_flags" when a
delayed task is migrated. This flag is consumed when the delayed entity
is finally woken up, and psi_enqueue() is notified of the migration.

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/
Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/
Link: https://lore.kernel.org/lkml/f82def74-a64a-4a05-c8d4-4eeb3e03d0c0@amd.com/
Tested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
 kernel/sched/core.c  | 19 ++++++++++++++++++-
 kernel/sched/sched.h |  1 +
 kernel/sched/stats.h | 10 ++++++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 52be38021ebb..1a353fa69a54 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2009,12 +2009,19 @@ unsigned long get_wchan(struct task_struct *p)
 
 void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	bool wakee_not_migrated = (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED);
+
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
 	if (!(flags & ENQUEUE_RESTORE)) {
 		sched_info_enqueue(rq, p);
-		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
+
+		/* Notify PSI that the task was migrated in a delayed state before wakeup. */
+		if ((p->migration_flags & DELAYED_MIGRATED) && !task_on_rq_migrating(p)) {
+			wakee_not_migrated = false;
+			p->migration_flags &= ~DELAYED_MIGRATED;
+		}
 	}
 
 	p->sched_class->enqueue_task(rq, p, flags);
@@ -2023,6 +2030,8 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 * ->sched_delayed.
 	 */
 	uclamp_rq_inc(rq, p);
+	if (!(flags & ENQUEUE_RESTORE))
+		psi_enqueue(p, wakee_not_migrated);
 
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
@@ -2042,6 +2051,14 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & DEQUEUE_SAVE)) {
 		sched_info_dequeue(rq, p);
 		psi_dequeue(p, flags & DEQUEUE_SLEEP);
+
+		/*
+		 * Indicate that a sched_delayed task was migrated.
+		 * enqueue_task() needs this for correct accounting
+		 * when the delayed task eventually wakes up.
+		 */
+		if (p->se.sched_delayed && task_on_rq_migrating(p))
+			p->migration_flags |= DELAYED_MIGRATED;
 	}
 
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1c3588a8f00..2dc2c4cb4f5f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1326,6 +1326,7 @@ static inline int cpu_of(struct rq *rq)
 }
 
 #define MDF_PUSH		0x01
+#define DELAYED_MIGRATED	0x02 /* Task was migrated when in DELAYED_DEQUEUE state */
 
 static inline bool is_migration_disabled(struct task_struct *p)
 {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 237780aa3c53..06a2c6d3ec1e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -129,6 +129,13 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	/*
+	 * Delayed task is not ready to run yet!
+	 * Wait for a requeue before accounting.
+	 */
+	if (p->se.sched_delayed)
+		return;
+
 	if (p->in_memstall)
 		set |= TSK_MEMSTALL_RUNNING;
 
@@ -148,6 +155,9 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	/* Delayed task can only be dequeued for migration. */
+	WARN_ON_ONCE(p->se.sched_delayed && sleep);
+
 	/*
 	 * A voluntary sleep is a dequeue followed by a task switch. To
 	 * avoid walking all ancestors twice, psi_task_switch() handles
-- 
2.34.1
Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup
Posted by Johannes Weiner 1 month, 2 weeks ago
Hi Prateek,

patches 1 and 2 make obvious sense to me.

On Thu, Oct 10, 2024 at 08:28:38AM +0000, K Prateek Nayak wrote:
> @@ -129,6 +129,13 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
>  	if (static_branch_likely(&psi_disabled))
>  		return;
>  
> +	/*
> +	 * Delayed task is not ready to run yet!
> +	 * Wait for a requeue before accounting.
> +	 */
> +	if (p->se.sched_delayed)
> +		return;

This one is problematic. It clears sleeping state (memstall, iowait)
during the dequeue of the migration but doesn't restore it until the
wakeup, which could presumably be much later. This leaves a gap in the
accounting.
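
Roughly, as a sketched timeline of a hypothetical iowait task, to make
the gap concrete:

    t0: task blocks in iowait              -> TSK_IOWAIT accounted
    t1: load balancer dequeues the delayed
        task for migration                 -> psi_dequeue() clears it
    t2: psi_enqueue() on the new CPU is
        skipped (p->se.sched_delayed)      -> nothing restored
    t1..t3: iowait pressure is invisible to psi
    t3: task is finally woken up           -> state accounted again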

psi really wants the dequeue and enqueue of the migration, even when a
task is delay-dequeued. We just have to get the context parsing right
to not confuse migration queues with wakeups.

I'll try to come up with a suitable solution as well, please don't
apply this one for now.
Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup
Posted by K Prateek Nayak 1 month, 2 weeks ago
Hello Johannes,

On 10/10/2024 6:33 PM, Johannes Weiner wrote:
> Hi Prateek,
> 
> patches 1 and 2 make obvious sense to me.
> 
> On Thu, Oct 10, 2024 at 08:28:38AM +0000, K Prateek Nayak wrote:
>> @@ -129,6 +129,13 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
>>   	if (static_branch_likely(&psi_disabled))
>>   		return;
>>   
>> +	/*
>> +	 * Delayed task is not ready to run yet!
>> +	 * Wait for a requeue before accounting.
>> +	 */
>> +	if (p->se.sched_delayed)
>> +		return;
> 
> This one is problematic. It clears sleeping state (memstall, iowait)
> during the dequeue of the migration but doesn't restore it until the
> wakeup, which could presumably be much later. This leaves a gap in the
> accounting.
> 
> psi really wants the dequeue and enqueue of the migration, even when a
> task is delay-dequeued. We just have to get the context parsing right
> to not confuse migration queues with wakeups.

I see! I was confused in my interpretation of the expectations. Sorry
about that! Can you please give the attached patch a try on top of the
first two patches of the series? It survived my (decently stressy)
stress test, but it is not as extensive as the ones you have :)

> 
> I'll try to come up with a suitable solution as well, please don't
> apply this one for now.

Thank you, Peter, for holding off on the patches, and sorry for the
late surprise!

-- 
Thanks and Regards,
Prateek

From 4ad25e7e6a0eb539c5544a81bb3b22ca7cc05fe0 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Thu, 10 Oct 2024 15:31:41 +0000
Subject: [PATCH v1.1 3/3] sched/core: Move PSI flags when delayed task is migrated

Since sched_delayed tasks remain on the "rq->cfs_tasks" list even after
blocking, they can be migrated from one runqueue to another in a delayed
state by the load balancer. When they are later requeued or woken up on
the same CPU via the try_to_wake_up() path, the activation is unaware of
the migration. This trips up the PSI accounting since, in terms of
enqueue flags, PSI only considers the following combination a wakeup:

    (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)

This can lead to an inconsistent PSI task state splat similar to:

    psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4

Migrate the PSI flags as well when delayed tasks are migrated. Since
psi_dequeue() normally leaves p->psi_flags cleared, reuse the field to
cache the original PSI flags across the migration and restore them from
psi_enqueue() on the new runqueue.

Note: Delayed tasks will not track the TSK_RUNNING or
TSK_MEMSTALL_RUNNING signals despite being queued on the runqueue;
however, they'll still track and migrate the TSK_IOWAIT and TSK_MEMSTALL
signals.

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/
Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/
Link: https://lore.kernel.org/lkml/20241010130316.GA181795@cmpxchg.org/
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
Changelog v1..v1.1:

o Alternate approach that migrates the PSI flags too when the delayed
  task is migrated by the load balancer.
---
 kernel/sched/core.c  |  9 ++++++---
 kernel/sched/stats.h | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 52be38021ebb..c3ac63d3eb9d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2012,10 +2012,8 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
-	if (!(flags & ENQUEUE_RESTORE)) {
+	if (!(flags & ENQUEUE_RESTORE))
 		sched_info_enqueue(rq, p);
-		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
-	}
 
 	p->sched_class->enqueue_task(rq, p, flags);
 	/*
@@ -2023,6 +2021,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 * ->sched_delayed.
 	 */
 	uclamp_rq_inc(rq, p);
+	if (!(flags & ENQUEUE_RESTORE)) {
+		/* Tasks can only remain delayed after an enqueue if they are migrating */
+		WARN_ON_ONCE(p->se.sched_delayed && !task_on_rq_migrating(p));
+		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
+	}
 
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 237780aa3c53..d267a187304c 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -129,6 +129,15 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	if (p->se.sched_delayed) {
+		unsigned int flags = p->psi_flags;
+
+		/* Restore the state saved by psi_dequeue() */
+		p->psi_flags = 0;
+		psi_task_change(p, 0, flags);
+		return;
+	}
+
 	if (p->in_memstall)
 		set |= TSK_MEMSTALL_RUNNING;
 
@@ -145,9 +154,14 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 
 static inline void psi_dequeue(struct task_struct *p, bool sleep)
 {
+	unsigned int flags = p->psi_flags;
+
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	/* Delayed task can only be dequeued for migration. */
+	WARN_ON_ONCE(p->se.sched_delayed && sleep);
+
 	/*
 	 * A voluntary sleep is a dequeue followed by a task switch. To
 	 * avoid walking all ancestors twice, psi_task_switch() handles
@@ -158,6 +172,14 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 		return;
 
 	psi_task_change(p, p->psi_flags, 0);
+
+	/*
+	 * Since the delayed entity is migrating, the PSI flags need to
+	 * be migrated too. Since p->psi_flags is not cleared, cache the
+	 * previous state there for psi_enqueue() to restore.
+	 */
+	if (p->se.sched_delayed)
+		p->psi_flags = flags;
 }
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
-- 
2.34.1

Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup
Posted by Peter Zijlstra 1 month, 2 weeks ago
On Thu, Oct 10, 2024 at 09:03:16AM -0400, Johannes Weiner wrote:

> I'll try to come up with a suitable solution as well, please don't
> apply this one for now.

I'll make sure it doesn't end up in tip as-is.
Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup
Posted by Johannes Weiner 1 month, 2 weeks ago
On Thu, Oct 10, 2024 at 03:06:21PM +0200, Peter Zijlstra wrote:
> On Thu, Oct 10, 2024 at 09:03:16AM -0400, Johannes Weiner wrote:
> 
> > I'll try to come up with a suitable solution as well, please don't
> > apply this one for now.
> 
> I'll make sure it doesn't end up in tip as-is.

Thanks.

This would be a replacement patch for #2 and #3 that handles migration
of delayed tasks. It's slightly more invasive on the psi callback
side, but I think it keeps the sched core bits simpler. Thoughts?

---

From d72a665d7c7c7d9c806424f473d13452754471d3 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 10 Oct 2024 14:37:43 -0400
Subject: [PATCH] sched: psi: handle delayed-dequeue task migration

Since sched_delayed tasks remain queued even after blocking, the load
balancer can migrate them between runqueues while PSI considers them
to be asleep. As a result, it misreads the migration requeue followed
by a wakeup as a double queue:

  psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4

First, call psi_enqueue() after p->sched_class->enqueue_task(). A
wakeup will clear p->se.sched_delayed while a migration will not, so
psi can use that flag to tell them apart.

Then teach psi to migrate any "sleep" state when delayed-dequeue tasks
are being migrated.

Delayed-dequeue tasks can be revived by ttwu_runnable(), which will
call down with a new ENQUEUE_DELAYED. Instead of further complicating
the wakeup conditional in enqueue_task(), identify migration contexts
and default to wakeup handling for all other cases.

Debugged-by-and-original-fix-by: K Prateek Nayak <kprateek.nayak@amd.com>
Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/
Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/
Link: https://lore.kernel.org/lkml/f82def74-a64a-4a05-c8d4-4eeb3e03d0c0@amd.com/
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/sched/core.c  | 12 +++++------
 kernel/sched/stats.h | 48 ++++++++++++++++++++++++++++++--------------
 2 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88cbfc671fb6..527502a86ff9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2012,11 +2012,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
-	if (!(flags & ENQUEUE_RESTORE)) {
-		sched_info_enqueue(rq, p);
-		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
-	}
-
 	p->sched_class->enqueue_task(rq, p, flags);
 	/*
 	 * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
@@ -2024,6 +2019,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	uclamp_rq_inc(rq, p);
 
+	if (!(flags & ENQUEUE_RESTORE)) {
+		sched_info_enqueue(rq, p);
+		psi_enqueue(p, flags & ENQUEUE_MIGRATED);
+	}
+
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
 }
@@ -2041,7 +2041,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!(flags & DEQUEUE_SAVE)) {
 		sched_info_dequeue(rq, p);
-		psi_dequeue(p, flags & DEQUEUE_SLEEP);
+		psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
 	}
 
 	/*
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 237780aa3c53..767e098a3bd1 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
  * memory stalls. As a result, it has to distinguish between sleeps,
- * where a task's runnable state changes, and requeues, where a task
- * and its state are being moved between CPUs and runqueues.
+ * where a task's runnable state changes, and migrations, where a task
+ * and its runnable state are being moved between CPUs and runqueues.
+ *
+ * A notable case is a task whose dequeue is delayed. PSI considers
+ * those sleeping, but because they are still on the runqueue they can
+ * go through migration requeues. In this case, *sleeping* states need
+ * to be transferred.
  */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup)
+static inline void psi_enqueue(struct task_struct *p, bool migrate)
 {
-	int clear = 0, set = TSK_RUNNING;
+	int clear = 0, set = 0;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	if (p->in_memstall)
-		set |= TSK_MEMSTALL_RUNNING;
-
-	if (!wakeup) {
+	if (p->se.sched_delayed) {
+		/* CPU migration of "sleeping" task */
+		SCHED_WARN_ON(!migrate);
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
+		if (p->in_iowait)
+			set |= TSK_IOWAIT;
+	} else if (migrate) {
+		/* CPU migration of runnable task */
+		set = TSK_RUNNING;
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
 	} else {
+		/* Wakeup of new or sleeping task */
 		if (p->in_iowait)
 			clear |= TSK_IOWAIT;
+		set = TSK_RUNNING;
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL_RUNNING;
 	}
 
 	psi_task_change(p, clear, set);
 }
 
-static inline void psi_dequeue(struct task_struct *p, bool sleep)
+static inline void psi_dequeue(struct task_struct *p, bool migrate)
 {
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	/*
+	 * When migrating a task to another CPU, clear all psi
+	 * state. The enqueue callback above will work it out.
+	 */
+	if (migrate)
+		psi_task_change(p, p->psi_flags, 0);
+
 	/*
 	 * A voluntary sleep is a dequeue followed by a task switch. To
 	 * avoid walking all ancestors twice, psi_task_switch() handles
 	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
 	 * Do nothing here.
 	 */
-	if (sleep)
-		return;
-
-	psi_task_change(p, p->psi_flags, 0);
 }
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev,
 }
 
 #else /* CONFIG_PSI */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
-static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
+static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
+static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
 static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
-- 
2.46.2
Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup
Posted by Peter Zijlstra 1 month, 2 weeks ago
On Thu, Oct 10, 2024 at 03:37:12PM -0400, Johannes Weiner wrote:

> This would be a replacement patch for #2 and #3 that handles migration
> of delayed tasks.

Thanks!

> It's slightly more invasive on the psi callback
> side, but I think it keeps the sched core bits simpler. Thoughts?

I wouldn't mind if psi_{en,de}queue() get the full flags argument in the
future. For now the one boolean seems to work, but perhaps it makes more
sense to just pass the flags along in their entirety.
Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup
Posted by Johannes Weiner 1 month, 2 weeks ago
On Fri, Oct 11, 2024 at 10:33:23AM +0200, Peter Zijlstra wrote:
> On Thu, Oct 10, 2024 at 03:37:12PM -0400, Johannes Weiner wrote:
> > It's slightly more invasive on the psi callback
> > side, but I think it keeps the sched core bits simpler. Thoughts?
> 
> I wouldn't mind if psi_{en,de}queue() get the full flags argument in the
> future. For now the one boolean seems to work, but perhaps it makes more
> sense to just pass the flags along in their entirety.

Something like this?

I like it better too. There is a weird asymmetry between passing
ENQ_MIGRATED to one and !ENQ_SLEEP to the other both as "migrate".
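
That is, from the previous patch's call sites:

    psi_enqueue(p, flags & ENQUEUE_MIGRATED);   /* enqueue_task() */
    psi_dequeue(p, !(flags & DEQUEUE_SLEEP));   /* dequeue_task() */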

No strong preference for whether the ENQUEUE_RESTORE check should be
in caller or callee, but I figured if we pass the flags anyway...

I toyed with a separate branch for ENQUEUE_INITIAL. But it saves one
branch during fork while adding one to repeat enqueues. The latter
should be hotter on average, so I removed it again.

Completely untested. But if it looks good, I'll send a proper patch.

---

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 527502a86ff9..42cf181bf3ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2019,10 +2019,10 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	uclamp_rq_inc(rq, p);
 
-	if (!(flags & ENQUEUE_RESTORE)) {
+	psi_enqueue(p, flags);
+
+	if (!(flags & ENQUEUE_RESTORE))
 		sched_info_enqueue(rq, p);
-		psi_enqueue(p, flags & ENQUEUE_MIGRATED);
-	}
 
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
@@ -2039,10 +2039,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & DEQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
-	if (!(flags & DEQUEUE_SAVE)) {
+	if (!(flags & DEQUEUE_SAVE))
 		sched_info_dequeue(rq, p);
-		psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
-	}
+
+	psi_dequeue(p, flags);
 
 	/*
 	 * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 767e098a3bd1..8ee0add5a48a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -127,21 +127,25 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
  * go through migration requeues. In this case, *sleeping* states need
  * to be transferred.
  */
-static inline void psi_enqueue(struct task_struct *p, bool migrate)
+static inline void psi_enqueue(struct task_struct *p, int flags)
 {
 	int clear = 0, set = 0;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	/* Same runqueue, nothing changed for psi */
+	if (flags & ENQUEUE_RESTORE)
+		return;
+
 	if (p->se.sched_delayed) {
 		/* CPU migration of "sleeping" task */
-		SCHED_WARN_ON(!migrate);
+		SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
 		if (p->in_iowait)
 			set |= TSK_IOWAIT;
-	} else if (migrate) {
+	} else if (flags & ENQUEUE_MIGRATED) {
 		/* CPU migration of runnable task */
 		set = TSK_RUNNING;
 		if (p->in_memstall)
@@ -158,17 +162,14 @@ static inline void psi_enqueue(struct task_struct *p, bool migrate)
 	psi_task_change(p, clear, set);
 }
 
-static inline void psi_dequeue(struct task_struct *p, bool migrate)
+static inline void psi_dequeue(struct task_struct *p, int flags)
 {
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	/*
-	 * When migrating a task to another CPU, clear all psi
-	 * state. The enqueue callback above will work it out.
-	 */
-	if (migrate)
-		psi_task_change(p, p->psi_flags, 0);
+	/* Same runqueue, nothing changed for psi */
+	if (flags & DEQUEUE_SAVE)
+		return;
 
 	/*
 	 * A voluntary sleep is a dequeue followed by a task switch. To
@@ -176,6 +177,14 @@ static inline void psi_dequeue(struct task_struct *p, bool migrate)
 	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
 	 * Do nothing here.
 	 */
+	if (flags & DEQUEUE_SLEEP)
+		return;
+
+	/*
+	 * When migrating a task to another CPU, clear all psi
+	 * state. The enqueue callback above will work it out.
+	 */
+	psi_task_change(p, p->psi_flags, 0);
 }
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup
Posted by Peter Zijlstra 1 month, 2 weeks ago
On Fri, Oct 11, 2024 at 06:08:03AM -0400, Johannes Weiner wrote:

> Something like this?

Yeah, something like that indeed :-)

> I like it better too. There is a weird asymmetry between passing
> ENQ_MIGRATED to one and !ENQ_SLEEP to the other both as "migrate".

So I have a note to stare at the whole {EN,DE}QUEUE_MIGRATING,
ENQUEUE_MIGRATED and task_on_rq_migrating() situation, it has the
feeling that something could be done to clean up there.

> No strong preference for whether the ENQUEUE_RESTORE check should be
> in caller or callee, but I figured if we pass the flags anyway...

Right, and this way it's behind the static key, so win, right :-)

> I toyed with a separate branch for ENQUEUE_INITIAL. But it saves one
> branch during fork while adding one to repeat enqueues. The latter
> should be hotter on average, so I removed it again.
> 
> Completely untested. But if it looks good, I'll send a proper patch.

Sure. Thanks for doing this.
Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup
Posted by Johannes Weiner 1 month, 1 week ago
On Fri, Oct 11, 2024 at 12:39:58PM +0200, Peter Zijlstra wrote:
> On Fri, Oct 11, 2024 at 06:08:03AM -0400, Johannes Weiner wrote:
> > Completely untested. But if it looks good, I'll send a proper patch.
> 
> Sure. Thanks for doing this.

Ok here goes. Built, booted and runtime-tested.

---

From 91f230caa0119877cb861047e1af1371bf00d908 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 11 Oct 2024 06:18:07 -0400
Subject: [PATCH] sched: psi: pass enqueue/dequeue flags to psi callbacks
 directly

What psi needs to do on each enqueue and dequeue has gotten more
subtle, and the generic sched code trying to distill this into a bool
for the callbacks is awkward.

Pass the flags directly and let psi parse them. For that to work, the
#include "stats.h" (which has the psi callback implementations) needs
to be below the flag definitions in "sched.h". Move that section
further down, next to some of the other accounting stuff.

This also puts the ENQUEUE_SAVE/RESTORE branch behind the psi jump
label, slightly reducing overhead when PSI=y but runtime disabled.

Link: https://lore.kernel.org/lkml/20241011083323.GL17263@noisy.programming.kicks-ass.net/
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/sched/core.c  | 12 +++++-----
 kernel/sched/sched.h | 56 ++++++++++++++++++++++----------------------
 kernel/sched/stats.h | 29 +++++++++++++++--------
 3 files changed, 53 insertions(+), 44 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 527502a86ff9..42cf181bf3ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2019,10 +2019,10 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	uclamp_rq_inc(rq, p);
 
-	if (!(flags & ENQUEUE_RESTORE)) {
+	psi_enqueue(p, flags);
+
+	if (!(flags & ENQUEUE_RESTORE))
 		sched_info_enqueue(rq, p);
-		psi_enqueue(p, flags & ENQUEUE_MIGRATED);
-	}
 
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
@@ -2039,10 +2039,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & DEQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
-	if (!(flags & DEQUEUE_SAVE)) {
+	if (!(flags & DEQUEUE_SAVE))
 		sched_info_dequeue(rq, p);
-		psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
-	}
+
+	psi_dequeue(p, flags);
 
 	/*
 	 * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1c3588a8f00..bbda03f17126 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2085,34 +2085,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
 
 #endif /* CONFIG_SMP */
 
-#include "stats.h"
-
-#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
-
-extern void __sched_core_account_forceidle(struct rq *rq);
-
-static inline void sched_core_account_forceidle(struct rq *rq)
-{
-	if (schedstat_enabled())
-		__sched_core_account_forceidle(rq);
-}
-
-extern void __sched_core_tick(struct rq *rq);
-
-static inline void sched_core_tick(struct rq *rq)
-{
-	if (sched_core_enabled(rq) && schedstat_enabled())
-		__sched_core_tick(rq);
-}
-
-#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
-
-static inline void sched_core_account_forceidle(struct rq *rq) { }
-
-static inline void sched_core_tick(struct rq *rq) { }
-
-#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
-
 #ifdef CONFIG_CGROUP_SCHED
 
 /*
@@ -3166,6 +3138,34 @@ extern void nohz_run_idle_balance(int cpu);
 static inline void nohz_run_idle_balance(int cpu) { }
 #endif
 
+#include "stats.h"
+
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+	if (schedstat_enabled())
+		__sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+	if (sched_core_enabled(rq) && schedstat_enabled())
+		__sched_core_tick(rq);
+}
+
+#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
+
+static inline void sched_core_account_forceidle(struct rq *rq) { }
+
+static inline void sched_core_tick(struct rq *rq) { }
+
+#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 
 struct irqtime {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 767e098a3bd1..8ee0add5a48a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -127,21 +127,25 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
  * go through migration requeues. In this case, *sleeping* states need
  * to be transferred.
  */
-static inline void psi_enqueue(struct task_struct *p, bool migrate)
+static inline void psi_enqueue(struct task_struct *p, int flags)
 {
 	int clear = 0, set = 0;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	/* Same runqueue, nothing changed for psi */
+	if (flags & ENQUEUE_RESTORE)
+		return;
+
 	if (p->se.sched_delayed) {
 		/* CPU migration of "sleeping" task */
-		SCHED_WARN_ON(!migrate);
+		SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
 		if (p->in_iowait)
 			set |= TSK_IOWAIT;
-	} else if (migrate) {
+	} else if (flags & ENQUEUE_MIGRATED) {
 		/* CPU migration of runnable task */
 		set = TSK_RUNNING;
 		if (p->in_memstall)
@@ -158,17 +162,14 @@ static inline void psi_enqueue(struct task_struct *p, bool migrate)
 	psi_task_change(p, clear, set);
 }
 
-static inline void psi_dequeue(struct task_struct *p, bool migrate)
+static inline void psi_dequeue(struct task_struct *p, int flags)
 {
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	/*
-	 * When migrating a task to another CPU, clear all psi
-	 * state. The enqueue callback above will work it out.
-	 */
-	if (migrate)
-		psi_task_change(p, p->psi_flags, 0);
+	/* Same runqueue, nothing changed for psi */
+	if (flags & DEQUEUE_SAVE)
+		return;
 
 	/*
 	 * A voluntary sleep is a dequeue followed by a task switch. To
@@ -176,6 +177,14 @@ static inline void psi_dequeue(struct task_struct *p, bool migrate)
 	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
 	 * Do nothing here.
 	 */
+	if (flags & DEQUEUE_SLEEP)
+		return;
+
+	/*
+	 * When migrating a task to another CPU, clear all psi
+	 * state. The enqueue callback above will work it out.
+	 */
+	psi_task_change(p, p->psi_flags, 0);
 }
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
-- 
2.47.0
Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup
Posted by K Prateek Nayak 1 month, 1 week ago
Hello Johannes,

On 10/14/2024 8:13 PM, Johannes Weiner wrote:
> On Fri, Oct 11, 2024 at 12:39:58PM +0200, Peter Zijlstra wrote:
>> On Fri, Oct 11, 2024 at 06:08:03AM -0400, Johannes Weiner wrote:
>>> Completely untested. But if it looks good, I'll send a proper patch.
>>
>> Sure. Thanks for doing this.
> 
> Ok here goes. Built, booted and runtime-tested.
> 
> ---
> 
>  From 91f230caa0119877cb861047e1af1371bf00d908 Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <hannes@cmpxchg.org>
> Date: Fri, 11 Oct 2024 06:18:07 -0400
> Subject: [PATCH] sched: psi: pass enqueue/dequeue flags to psi callbacks
>   directly
> 
> What psi needs to do on each enqueue and dequeue has gotten more
> subtle, and the generic sched code trying to distill this into a bool
> for the callbacks is awkward.
> 
> Pass the flags directly and let psi parse them. For that to work, the
> #include "stats.h" (which has the psi callback implementations) needs
> to be below the flag definitions in "sched.h". Move that section
> further down, next to some of the other accounting stuff.
> 
> This also puts the ENQUEUE_SAVE/RESTORE branch behind the psi jump
> label, slightly reducing overhead when PSI=y but runtime disabled.
> 
> Link: https://lore.kernel.org/lkml/20241011083323.GL17263@noisy.programming.kicks-ass.net/
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

I tested this series on top of tip:sched/urgent at commit cd9626e9ebc7
("sched/fair: Fix external p->on_rq users") and did not observe any
splats related to PSI or otherwise. Please feel free to add:

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>

-- 
Thanks and Regards,
Prateek

> ---
>   kernel/sched/core.c  | 12 +++++-----
>   kernel/sched/sched.h | 56 ++++++++++++++++++++++----------------------
>   kernel/sched/stats.h | 29 +++++++++++++++--------
>   3 files changed, 53 insertions(+), 44 deletions(-)
> 
> [..snip..]
[tip: sched/core] sched: psi: pass enqueue/dequeue flags to psi callbacks directly
Posted by tip-bot2 for Johannes Weiner 4 weeks, 1 day ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     1a6151017ee5a30cb2d959f110ab18fc49646467
Gitweb:        https://git.kernel.org/tip/1a6151017ee5a30cb2d959f110ab18fc49646467
Author:        Johannes Weiner <hannes@cmpxchg.org>
AuthorDate:    Mon, 14 Oct 2024 10:43:58 -04:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sat, 26 Oct 2024 09:28:38 +02:00

sched: psi: pass enqueue/dequeue flags to psi callbacks directly

What psi needs to do on each enqueue and dequeue has gotten more
subtle, and the generic sched code trying to distill this into a bool
for the callbacks is awkward.

Pass the flags directly and let psi parse them. For that to work, the
#include "stats.h" (which has the psi callback implementations) needs
to be below the flag definitions in "sched.h". Move that section
further down, next to some of the other accounting stuff.

This also puts the ENQUEUE_SAVE/RESTORE branch behind the psi jump
label, slightly reducing overhead when PSI=y but runtime disabled.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20241014144358.GB1021@cmpxchg.org
---
 kernel/sched/core.c  | 12 ++++-----
 kernel/sched/sched.h | 56 +++++++++++++++++++++----------------------
 kernel/sched/stats.h | 29 ++++++++++++++--------
 3 files changed, 53 insertions(+), 44 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9bad282..c57a79e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2024,10 +2024,10 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	uclamp_rq_inc(rq, p);
 
-	if (!(flags & ENQUEUE_RESTORE)) {
+	psi_enqueue(p, flags);
+
+	if (!(flags & ENQUEUE_RESTORE))
 		sched_info_enqueue(rq, p);
-		psi_enqueue(p, flags & ENQUEUE_MIGRATED);
-	}
 
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
@@ -2044,10 +2044,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & DEQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
-	if (!(flags & DEQUEUE_SAVE)) {
+	if (!(flags & DEQUEUE_SAVE))
 		sched_info_dequeue(rq, p);
-		psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
-	}
+
+	psi_dequeue(p, flags);
 
 	/*
 	 * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7b13901..e51bf5a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2093,34 +2093,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
 
 #endif /* CONFIG_SMP */
 
-#include "stats.h"
-
-#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
-
-extern void __sched_core_account_forceidle(struct rq *rq);
-
-static inline void sched_core_account_forceidle(struct rq *rq)
-{
-	if (schedstat_enabled())
-		__sched_core_account_forceidle(rq);
-}
-
-extern void __sched_core_tick(struct rq *rq);
-
-static inline void sched_core_tick(struct rq *rq)
-{
-	if (sched_core_enabled(rq) && schedstat_enabled())
-		__sched_core_tick(rq);
-}
-
-#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
-
-static inline void sched_core_account_forceidle(struct rq *rq) { }
-
-static inline void sched_core_tick(struct rq *rq) { }
-
-#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
-
 #ifdef CONFIG_CGROUP_SCHED
 
 /*
@@ -3191,6 +3163,34 @@ extern void nohz_run_idle_balance(int cpu);
 static inline void nohz_run_idle_balance(int cpu) { }
 #endif
 
+#include "stats.h"
+
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+	if (schedstat_enabled())
+		__sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+	if (sched_core_enabled(rq) && schedstat_enabled())
+		__sched_core_tick(rq);
+}
+
+#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
+
+static inline void sched_core_account_forceidle(struct rq *rq) { }
+
+static inline void sched_core_tick(struct rq *rq) { }
+
+#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 
 struct irqtime {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 767e098..8ee0add 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -127,21 +127,25 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
  * go through migration requeues. In this case, *sleeping* states need
  * to be transferred.
  */
-static inline void psi_enqueue(struct task_struct *p, bool migrate)
+static inline void psi_enqueue(struct task_struct *p, int flags)
 {
 	int clear = 0, set = 0;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	/* Same runqueue, nothing changed for psi */
+	if (flags & ENQUEUE_RESTORE)
+		return;
+
 	if (p->se.sched_delayed) {
 		/* CPU migration of "sleeping" task */
-		SCHED_WARN_ON(!migrate);
+		SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
 		if (p->in_iowait)
 			set |= TSK_IOWAIT;
-	} else if (migrate) {
+	} else if (flags & ENQUEUE_MIGRATED) {
 		/* CPU migration of runnable task */
 		set = TSK_RUNNING;
 		if (p->in_memstall)
@@ -158,17 +162,14 @@ static inline void psi_enqueue(struct task_struct *p, bool migrate)
 	psi_task_change(p, clear, set);
 }
 
-static inline void psi_dequeue(struct task_struct *p, bool migrate)
+static inline void psi_dequeue(struct task_struct *p, int flags)
 {
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	/*
-	 * When migrating a task to another CPU, clear all psi
-	 * state. The enqueue callback above will work it out.
-	 */
-	if (migrate)
-		psi_task_change(p, p->psi_flags, 0);
+	/* Same runqueue, nothing changed for psi */
+	if (flags & DEQUEUE_SAVE)
+		return;
 
 	/*
 	 * A voluntary sleep is a dequeue followed by a task switch. To
@@ -176,6 +177,14 @@ static inline void psi_dequeue(struct task_struct *p, bool migrate)
 	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
 	 * Do nothing here.
 	 */
+	if (flags & DEQUEUE_SLEEP)
+		return;
+
+	/*
+	 * When migrating a task to another CPU, clear all psi
+	 * state. The enqueue callback above will work it out.
+	 */
+	psi_task_change(p, p->psi_flags, 0);
 }
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was migrated before wakeup
Posted by K Prateek Nayak 1 month, 2 weeks ago
Hello Johannes,

On 10/11/2024 1:07 AM, Johannes Weiner wrote:
> On Thu, Oct 10, 2024 at 03:06:21PM +0200, Peter Zijlstra wrote:
>> On Thu, Oct 10, 2024 at 09:03:16AM -0400, Johannes Weiner wrote:
>>
>>> I'll try to come up with a suitable solution as well, please don't
>>> apply this one for now.
>>
>> I'll make sure it doesn't end up in tip as-is.
> 
> Thanks.
> 
> This would be a replacement patch for #2 and #3 that handles migration
> of delayed tasks. It's slightly more invasive on the psi callback
> side, but I think it keeps the sched core bits simpler. Thoughts?
> 
> ---
> 
>  From d72a665d7c7c7d9c806424f473d13452754471d3 Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <hannes@cmpxchg.org>
> Date: Thu, 10 Oct 2024 14:37:43 -0400
> Subject: [PATCH] sched: psi: handle delayed-dequeue task migration
> 
> Since sched_delayed tasks remain queued even after blocking, the load
> balancer can migrate them between runqueues while PSI considers them
> to be asleep. As a result, it misreads the migration requeue followed
> by a wakeup as a double queue:
> 
>    psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4
> 
> First, call psi_enqueue() after p->sched_class->enqueue_task(). A
> wakeup will clear p->se.sched_delayed while a migration will not, so
> psi can use that flag to tell them apart.
> 
> Then teach psi to migrate any "sleep" state when delayed-dequeue tasks
> are being migrated.
> 
> Delayed-dequeue tasks can be revived by ttwu_runnable(), which will
> call down with a new ENQUEUE_DELAYED. Instead of further complicating
> the wakeup conditional in enqueue_task(), identify migration contexts
> and default to wakeup handling for all other cases.
> 
> Debugged-by-and-original-fix-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
> Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/
> Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/
> Link: https://lore.kernel.org/lkml/f82def74-a64a-4a05-c8d4-4eeb3e03d0c0@amd.com/
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

This approach looks good to me as well! Thank you. I added this on top
of Patch 1 and I haven't seen any PSI splats after my stress test. Feel
free to add:

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>

-- 
Thanks and Regards,
Prateek

> ---
>   kernel/sched/core.c  | 12 +++++------
>   kernel/sched/stats.h | 48 ++++++++++++++++++++++++++++++--------------
>   2 files changed, 39 insertions(+), 21 deletions(-)
> 
> [..snip..]
[tip: sched/urgent] sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug
Posted by tip-bot2 for Johannes Weiner 1 month, 2 weeks ago
The following commit has been merged into the sched/urgent branch of tip:

Commit-ID:     c6508124193d42bbc3224571eb75bfa4c1821fbb
Gitweb:        https://git.kernel.org/tip/c6508124193d42bbc3224571eb75bfa4c1821fbb
Author:        Johannes Weiner <hannes@cmpxchg.org>
AuthorDate:    Fri, 11 Oct 2024 10:49:33 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Mon, 14 Oct 2024 09:11:42 +02:00

sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug

Since sched_delayed tasks remain queued even after blocking, the load
balancer can migrate them between runqueues while PSI considers them
to be asleep. As a result, it misreads the migration requeue followed
by a wakeup as a double queue:

  psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4

First, call psi_enqueue() after p->sched_class->enqueue_task(). A
wakeup will clear p->se.sched_delayed while a migration will not, so
psi can use that flag to tell them apart.

Then teach psi to migrate any "sleep" state when delayed-dequeue tasks
are being migrated.

Delayed-dequeue tasks can be revived by ttwu_runnable(), which will
call down with a new ENQUEUE_DELAYED. Instead of further complicating
the wakeup conditional in enqueue_task(), identify migration contexts
and default to wakeup handling for all other cases.

It's not just the warning in dmesg: the task state corruption causes a
permanent CPU pressure indication, which messes with workload/machine
health monitoring.

Debugged-by-and-original-fix-by: K Prateek Nayak <kprateek.nayak@amd.com>
Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/
Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20241010193712.GC181795@cmpxchg.org
---
 kernel/sched/core.c  | 12 +++++------
 kernel/sched/stats.h | 48 +++++++++++++++++++++++++++++--------------
 2 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9e09140..71232f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2012,11 +2012,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
-	if (!(flags & ENQUEUE_RESTORE)) {
-		sched_info_enqueue(rq, p);
-		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
-	}
-
 	p->sched_class->enqueue_task(rq, p, flags);
 	/*
 	 * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
@@ -2024,6 +2019,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	uclamp_rq_inc(rq, p);
 
+	if (!(flags & ENQUEUE_RESTORE)) {
+		sched_info_enqueue(rq, p);
+		psi_enqueue(p, flags & ENQUEUE_MIGRATED);
+	}
+
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
 }
@@ -2041,7 +2041,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!(flags & DEQUEUE_SAVE)) {
 		sched_info_dequeue(rq, p);
-		psi_dequeue(p, flags & DEQUEUE_SLEEP);
+		psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
 	}
 
 	/*
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 237780a..767e098 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
  * memory stalls. As a result, it has to distinguish between sleeps,
- * where a task's runnable state changes, and requeues, where a task
- * and its state are being moved between CPUs and runqueues.
+ * where a task's runnable state changes, and migrations, where a task
+ * and its runnable state are being moved between CPUs and runqueues.
+ *
+ * A notable case is a task whose dequeue is delayed. PSI considers
+ * those sleeping, but because they are still on the runqueue they can
+ * go through migration requeues. In this case, *sleeping* states need
+ * to be transferred.
  */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup)
+static inline void psi_enqueue(struct task_struct *p, bool migrate)
 {
-	int clear = 0, set = TSK_RUNNING;
+	int clear = 0, set = 0;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	if (p->in_memstall)
-		set |= TSK_MEMSTALL_RUNNING;
-
-	if (!wakeup) {
+	if (p->se.sched_delayed) {
+		/* CPU migration of "sleeping" task */
+		SCHED_WARN_ON(!migrate);
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
+		if (p->in_iowait)
+			set |= TSK_IOWAIT;
+	} else if (migrate) {
+		/* CPU migration of runnable task */
+		set = TSK_RUNNING;
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
 	} else {
+		/* Wakeup of new or sleeping task */
 		if (p->in_iowait)
 			clear |= TSK_IOWAIT;
+		set = TSK_RUNNING;
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL_RUNNING;
 	}
 
 	psi_task_change(p, clear, set);
 }
 
-static inline void psi_dequeue(struct task_struct *p, bool sleep)
+static inline void psi_dequeue(struct task_struct *p, bool migrate)
 {
 	if (static_branch_likely(&psi_disabled))
 		return;
 
 	/*
+	 * When migrating a task to another CPU, clear all psi
+	 * state. The enqueue callback above will work it out.
+	 */
+	if (migrate)
+		psi_task_change(p, p->psi_flags, 0);
+
+	/*
 	 * A voluntary sleep is a dequeue followed by a task switch. To
 	 * avoid walking all ancestors twice, psi_task_switch() handles
 	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
 	 * Do nothing here.
 	 */
-	if (sleep)
-		return;
-
-	psi_task_change(p, p->psi_flags, 0);
 }
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev,
 }
 
 #else /* CONFIG_PSI */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
-static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
+static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
+static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
 static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
[tip: sched/urgent] Since sched_delayed tasks remain queued even after blocking, the load
Posted by tip-bot2 for Johannes Weiner 1 month, 2 weeks ago
The following commit has been merged into the sched/urgent branch of tip:

Commit-ID:     f2c9767170bead8d0ceb9c01d074c6916350310d
Gitweb:        https://git.kernel.org/tip/f2c9767170bead8d0ceb9c01d074c6916350310d
Author:        Johannes Weiner <hannes@cmpxchg.org>
AuthorDate:    Fri, 11 Oct 2024 10:49:33 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 11 Oct 2024 10:49:33 +02:00

Since sched_delayed tasks remain queued even after blocking, the load
balancer can migrate them between runqueues while PSI considers them
to be asleep. As a result, it misreads the migration requeue followed
by a wakeup as a double queue:

  psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4

First, call psi_enqueue() after p->sched_class->enqueue_task(). A
wakeup will clear p->se.sched_delayed while a migration will not, so
psi can use that flag to tell them apart.

Then teach psi to migrate any "sleep" state when delayed-dequeue tasks
are being migrated.

Delayed-dequeue tasks can be revived by ttwu_runnable(), which will
call down with a new ENQUEUE_DELAYED. Instead of further complicating
the wakeup conditional in enqueue_task(), identify migration contexts
and default to wakeup handling for all other cases.

Debugged-by-and-original-fix-by: K Prateek Nayak <kprateek.nayak@amd.com>
Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/
Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20241010193712.GC181795@cmpxchg.org
---
 kernel/sched/core.c  | 12 +++++------
 kernel/sched/stats.h | 48 +++++++++++++++++++++++++++++--------------
 2 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9e09140..71232f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2012,11 +2012,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
-	if (!(flags & ENQUEUE_RESTORE)) {
-		sched_info_enqueue(rq, p);
-		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
-	}
-
 	p->sched_class->enqueue_task(rq, p, flags);
 	/*
 	 * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
@@ -2024,6 +2019,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	uclamp_rq_inc(rq, p);
 
+	if (!(flags & ENQUEUE_RESTORE)) {
+		sched_info_enqueue(rq, p);
+		psi_enqueue(p, flags & ENQUEUE_MIGRATED);
+	}
+
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
 }
@@ -2041,7 +2041,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!(flags & DEQUEUE_SAVE)) {
 		sched_info_dequeue(rq, p);
-		psi_dequeue(p, flags & DEQUEUE_SLEEP);
+		psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
 	}
 
 	/*
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 237780a..767e098 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
  * memory stalls. As a result, it has to distinguish between sleeps,
- * where a task's runnable state changes, and requeues, where a task
- * and its state are being moved between CPUs and runqueues.
+ * where a task's runnable state changes, and migrations, where a task
+ * and its runnable state are being moved between CPUs and runqueues.
+ *
+ * A notable case is a task whose dequeue is delayed. PSI considers
+ * those sleeping, but because they are still on the runqueue they can
+ * go through migration requeues. In this case, *sleeping* states need
+ * to be transferred.
  */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup)
+static inline void psi_enqueue(struct task_struct *p, bool migrate)
 {
-	int clear = 0, set = TSK_RUNNING;
+	int clear = 0, set = 0;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	if (p->in_memstall)
-		set |= TSK_MEMSTALL_RUNNING;
-
-	if (!wakeup) {
+	if (p->se.sched_delayed) {
+		/* CPU migration of "sleeping" task */
+		SCHED_WARN_ON(!migrate);
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
+		if (p->in_iowait)
+			set |= TSK_IOWAIT;
+	} else if (migrate) {
+		/* CPU migration of runnable task */
+		set = TSK_RUNNING;
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
 	} else {
+		/* Wakeup of new or sleeping task */
 		if (p->in_iowait)
 			clear |= TSK_IOWAIT;
+		set = TSK_RUNNING;
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL_RUNNING;
 	}
 
 	psi_task_change(p, clear, set);
 }
 
-static inline void psi_dequeue(struct task_struct *p, bool sleep)
+static inline void psi_dequeue(struct task_struct *p, bool migrate)
 {
 	if (static_branch_likely(&psi_disabled))
 		return;
 
 	/*
+	 * When migrating a task to another CPU, clear all psi
+	 * state. The enqueue callback above will work it out.
+	 */
+	if (migrate)
+		psi_task_change(p, p->psi_flags, 0);
+
+	/*
 	 * A voluntary sleep is a dequeue followed by a task switch. To
 	 * avoid walking all ancestors twice, psi_task_switch() handles
 	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
 	 * Do nothing here.
 	 */
-	if (sleep)
-		return;
-
-	psi_task_change(p, p->psi_flags, 0);
 }
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev,
 }
 
 #else /* CONFIG_PSI */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
-static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
+static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
+static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
 static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,