[PATCH v24 11/11] sched: Migrate whole chain in proxy_migrate_task()

Posted by John Stultz 2 months, 2 weeks ago
Instead of migrating one task each time through find_proxy_task(),
we can walk up the blocked_donor ptrs and migrate the entire
current chain in one go.
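
To sketch the flow (a condensed restatement of the diff below, nothing
new is introduced here): blocked_donor is documented as "task that is
boosting this task", so starting from the task handed to
proxy_migrate_task() the loop also picks up every context donor behind
it. Everything is deactivated and queued on a local list while the old
rq lock is held, then re-activated under the target rq lock:

  for (; p; p = p->blocked_donor) {
          deactivate_task(rq, p, DEQUEUE_NOCLOCK);
          proxy_set_task_cpu(p, target_cpu);
          list_add(&p->migration_node, &migrate_list);
  }
  /* ... drop rq lock, take target_rq lock ... */
  while (!list_empty(&migrate_list)) {
          p = list_first_entry(&migrate_list, struct task_struct,
                               migration_node);
          list_del_init(&p->migration_node);
          activate_task(target_rq, p, 0);
          wakeup_preempt(target_rq, p, 0);
  }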

This was broken out of earlier patches and held back while the
series was being stabilized, but I wanted to re-introduce it.

Signed-off-by: John Stultz <jstultz@google.com>
---
v12:
* Earlier this was re-using blocked_node, but I hit
  a race with activating blocked entities, so to
  avoid it I introduced a new migration_node list_head
v18:
* Add init_task initialization of migration_node as suggested
  by Suleiman
v22:
* Move migration_node under CONFIG_SCHED_PROXY_EXEC as suggested
  by K Prateek
Cc: Joel Fernandes <joelagnelf@nvidia.com>
Cc: Qais Yousef <qyousef@layalina.io>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Zimuzo Ezeozue <zezeozue@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Metin Kaya <Metin.Kaya@arm.com>
Cc: Xuewen Yan <xuewen.yan94@gmail.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: kuyo chang <kuyo.chang@mediatek.com>
Cc: hupu <hupu.gm@gmail.com>
Cc: kernel-team@android.com
---
 include/linux/sched.h |  3 +++
 init/init_task.c      |  3 +++
 kernel/fork.c         |  3 +++
 kernel/sched/core.c   | 24 +++++++++++++++++-------
 4 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 178ed37850470..775cc06f756d0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1243,6 +1243,9 @@ struct task_struct {
 	struct mutex			*blocked_on;	/* lock we're blocked on */
 	struct task_struct		*blocked_donor;	/* task that is boosting this task */
 	raw_spinlock_t			blocked_lock;
+#ifdef CONFIG_SCHED_PROXY_EXEC
+	struct list_head		migration_node;
+#endif
 
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
 	/*
diff --git a/init/init_task.c b/init/init_task.c
index 34853a511b4d8..78fb7cb83fa5d 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -178,6 +178,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 						 &init_task.alloc_lock),
 #endif
 	.blocked_donor = NULL,
+#ifdef CONFIG_SCHED_PROXY_EXEC
+	.migration_node = LIST_HEAD_INIT(init_task.migration_node),
+#endif
 #ifdef CONFIG_RT_MUTEXES
 	.pi_waiters	= RB_ROOT_CACHED,
 	.pi_top_task	= NULL,
diff --git a/kernel/fork.c b/kernel/fork.c
index 0a9a17e25b85d..a7561480e879e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2137,6 +2137,9 @@ __latent_entropy struct task_struct *copy_process(
 
 	p->blocked_on = NULL; /* not blocked yet */
 	p->blocked_donor = NULL; /* nobody is boosting p yet */
+#ifdef CONFIG_SCHED_PROXY_EXEC
+	INIT_LIST_HEAD(&p->migration_node);
+#endif
 
 #ifdef CONFIG_BCACHE
 	p->sequential_io	= 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f42ec01192dc..0c50d154050a3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6723,6 +6723,7 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
 			       struct task_struct *p, int target_cpu)
 {
 	struct rq *target_rq = cpu_rq(target_cpu);
+	LIST_HEAD(migrate_list);
 
 	lockdep_assert_rq_held(rq);
 
@@ -6739,11 +6740,16 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
 	 */
 	proxy_resched_idle(rq);
 
-	WARN_ON(p == rq->curr);
-
-	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
-	proxy_set_task_cpu(p, target_cpu);
-
+	for (; p; p = p->blocked_donor) {
+		WARN_ON(p == rq->curr);
+		deactivate_task(rq, p, DEQUEUE_NOCLOCK);
+		proxy_set_task_cpu(p, target_cpu);
+		/*
+		 * Collect @p on the local migrate_list via its migration_node,
+		 * so the whole chain can be re-activated on the target rq below.
+		 */
+		list_add(&p->migration_node, &migrate_list);
+	}
 	/*
 	 * We have to zap callbacks before unlocking the rq
 	 * as another CPU may jump in and call sched_balance_rq
@@ -6755,8 +6761,12 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
 	raw_spin_rq_unlock(rq);
 
 	raw_spin_rq_lock(target_rq);
-	activate_task(target_rq, p, 0);
-	wakeup_preempt(target_rq, p, 0);
+	while (!list_empty(&migrate_list)) {
+		p = list_first_entry(&migrate_list, struct task_struct, migration_node);
+		list_del_init(&p->migration_node);
+		activate_task(target_rq, p, 0);
+		wakeup_preempt(target_rq, p, 0);
+	}
 	raw_spin_rq_unlock(target_rq);
 
 	raw_spin_rq_lock(rq);
-- 
2.52.0.487.g5c8c507ade-goog
Re: [PATCH v24 11/11] sched: Migrate whole chain in proxy_migrate_task()
Posted by K Prateek Nayak 1 month, 1 week ago
Hello John,

On 11/25/2025 4:01 AM, John Stultz wrote:
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 178ed37850470..775cc06f756d0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1243,6 +1243,9 @@ struct task_struct {
>  	struct mutex			*blocked_on;	/* lock we're blocked on */
>  	struct task_struct		*blocked_donor;	/* task that is boosting this task */
>  	raw_spinlock_t			blocked_lock;
> +#ifdef CONFIG_SCHED_PROXY_EXEC
> +	struct list_head		migration_node;

We can just reuse "p->se.group_node" here instead of adding
"migration_node": group_node is only used to move tasks during load
balancing, and once we deactivate a task it can no longer be picked
for load balancing, so we should be safe (serialized by the rq_lock).
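
For reference, the load-balance path pairs the two uses of
se.group_node roughly like this (abridged from detach_tasks() /
attach_tasks() in kernel/sched/fair.c, not verbatim):

  /* detach_tasks(): pulled tasks are strung onto env->tasks ... */
  list_add(&p->se.group_node, &env->tasks);

  /* attach_tasks(): ... and popped off again under the dst rq lock */
  p = list_first_entry(tasks, struct task_struct, se.group_node);
  list_del_init(&p->se.group_node);
  attach_task(env->dst_rq, p);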

> +#endif
>  
>  #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
>  	/*
> diff --git a/init/init_task.c b/init/init_task.c
> index 34853a511b4d8..78fb7cb83fa5d 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -178,6 +178,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
>  						 &init_task.alloc_lock),
>  #endif
>  	.blocked_donor = NULL,
> +#ifdef CONFIG_SCHED_PROXY_EXEC
> +	.migration_node = LIST_HEAD_INIT(init_task.migration_node),
> +#endif
>  #ifdef CONFIG_RT_MUTEXES
>  	.pi_waiters	= RB_ROOT_CACHED,
>  	.pi_top_task	= NULL,
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 0a9a17e25b85d..a7561480e879e 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2137,6 +2137,9 @@ __latent_entropy struct task_struct *copy_process(
>  
>  	p->blocked_on = NULL; /* not blocked yet */
>  	p->blocked_donor = NULL; /* nobody is boosting p yet */
> +#ifdef CONFIG_SCHED_PROXY_EXEC
> +	INIT_LIST_HEAD(&p->migration_node);
> +#endif
>  
>  #ifdef CONFIG_BCACHE
>  	p->sequential_io	= 0;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 7f42ec01192dc..0c50d154050a3 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6723,6 +6723,7 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
>  			       struct task_struct *p, int target_cpu)
>  {
>  	struct rq *target_rq = cpu_rq(target_cpu);
> +	LIST_HEAD(migrate_list);
>  
>  	lockdep_assert_rq_held(rq);
>  
> @@ -6739,11 +6740,16 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
>  	 */
>  	proxy_resched_idle(rq);
>  
> -	WARN_ON(p == rq->curr);
> -
> -	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
> -	proxy_set_task_cpu(p, target_cpu);
> -
> +	for (; p; p = p->blocked_donor) {
> +		WARN_ON(p == rq->curr);
> +		deactivate_task(rq, p, DEQUEUE_NOCLOCK);
> +		proxy_set_task_cpu(p, target_cpu);
> +		/*
> +		 * Collect @p on the local migrate_list via its migration_node,
> +		 * so the whole chain can be re-activated on the target rq below.
> +		 */
> +		list_add(&p->migration_node, &migrate_list);
> +	}
>  	/*
>  	 * We have to zap callbacks before unlocking the rq
>  	 * as another CPU may jump in and call sched_balance_rq
> @@ -6755,8 +6761,12 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
>  	raw_spin_rq_unlock(rq);
>  
>  	raw_spin_rq_lock(target_rq);
> -	activate_task(target_rq, p, 0);
> -	wakeup_preempt(target_rq, p, 0);
> +	while (!list_empty(&migrate_list)) {
> +		p = list_first_entry(&migrate_list, struct task_struct, migration_node);
> +		list_del_init(&p->migration_node);
> +		activate_task(target_rq, p, 0);
> +		wakeup_preempt(target_rq, p, 0);
> +	}
>  	raw_spin_rq_unlock(target_rq);

Here, we can extract the core logic of attach_tasks() into a helper and
reuse it as:

  (Includes the changes suggested on Patch 6 plus the above suggestion;
   build + boot tested with test-ww_mutex and sched-messaging)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5a6ac4d13112..700fd08b2392 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6786,7 +6786,7 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
 		 * Collect @p on the local migrate_list via its migration_node,
 		 * so the whole chain can be re-activated on the target rq below.
 		 */
-		list_add(&p->migration_node, &migrate_list);
+		list_add(&p->se.group_node, &migrate_list);
 	}
 	/*
 	 * Zap balance callbacks and drop the rq_lock
@@ -6794,14 +6794,8 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
 	 */
 	guard(proxy_drop_reacquire_rq_lock)(rq, rf);
 
-	raw_spin_rq_lock(target_rq);
-	while (!list_empty(&migrate_list)) {
-		p = list_first_entry(&migrate_list, struct task_struct, migration_node);
-		list_del_init(&p->migration_node);
-		activate_task(target_rq, p, 0);
-		wakeup_preempt(target_rq, p, 0);
-	}
-	raw_spin_rq_unlock(target_rq);
+	/* Move migrate_list to target_rq. */
+	__attach_tasks(target_rq, &migrate_list);
 }
 
 static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7d2e92a55b16..f24ca09ded8a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9700,27 +9700,28 @@ static void attach_one_task(struct rq *rq, struct task_struct *p)
 	rq_unlock(rq, &rf);
 }
 
-/*
- * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
- * new rq.
- */
-static void attach_tasks(struct lb_env *env)
+void __attach_tasks(struct rq *rq, struct list_head *tasks)
 {
-	struct list_head *tasks = &env->tasks;
-	struct task_struct *p;
-	struct rq_flags rf;
-
-	rq_lock(env->dst_rq, &rf);
-	update_rq_clock(env->dst_rq);
+	guard(rq_lock)(rq);
+	update_rq_clock(rq);
 
 	while (!list_empty(tasks)) {
+		struct task_struct *p;
+
 		p = list_first_entry(tasks, struct task_struct, se.group_node);
 		list_del_init(&p->se.group_node);
 
-		attach_task(env->dst_rq, p);
+		attach_task(rq, p);
 	}
+}
 
-	rq_unlock(env->dst_rq, &rf);
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+	__attach_tasks(env->dst_rq, &env->tasks);
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 424c40bd46e2..43b9ee90c67f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1368,6 +1368,8 @@ static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
 }
 #endif
 
+void __attach_tasks(struct rq *rq, struct list_head *tasks);
+
 #ifdef CONFIG_SCHED_CORE
 static inline struct cpumask *sched_group_span(struct sched_group *sg);
 
---

Thoughts?

>  
>  	raw_spin_rq_lock(rq);

-- 
Thanks and Regards,
Prateek