[PATCH] sched/core: Push tasks on force idle

Fernand Sieber posted 1 patch 3 days, 8 hours ago
kernel/sched/core.c  | 88 +++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/fair.c  | 11 ++++++
kernel/sched/sched.h |  1 +
3 files changed, 98 insertions(+), 2 deletions(-)
[PATCH] sched/core: Push tasks on force idle
Posted by Fernand Sieber 3 days, 8 hours ago
When a cpu enters force idle, it will
1) try to steal cookie matching tasks from other CPUs
2) do the newidle balance

If the stealing fails, we are out of options to get out of force idle
properly. New idle balance might decide to pull other tasks, but they won't
necessarily be matching anyway.

Introduce a step in between where we try to push the runnable tasks that
are blocked in force idle to a more suitable CPU.

=== Testing setup ===

Similar setup as in:
https://lore.kernel.org/lkml/20251127202719.963766-1-sieberf@amazon.com

Testing is aimed at measuring perceived guest noise on hypervisor system
with time shared scenarios.

Setup is on system where the load is nearing 100% which should allow no
steal time. The system has 64 CPUs, with 8 VMs, each VM using core
scheduling with 8 vCPUs per VM, time shared.

7 VMs are running stressors (`stress-ng --cpu 0`) while the last VM is
running the hwlat tracer with a width of 100ms, a period of 300ms, and
a threshold of 100us. Each VM runs a cookied non vCPU VMM process that
adds a light level of noise which forces some level of load balancing.

The test scenario is run 10x60s and the average noise is measured (we use
breaches scaled up to period/width to estimate noise).

=== Testing results ===

Baseline noise: 1.20%
After patch noise: 0.66% (-45%)

Signed-off-by: Fernand Sieber <sieberf@amazon.com>
---
 kernel/sched/core.c  | 88 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/fair.c  | 11 ++++++
 kernel/sched/sched.h |  1 +
 3 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f754a60de848..852863eda8b8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6338,6 +6338,81 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd)
 	return false;
 }
 
+static bool forceidle_try_push_task(int this, int that)
+{
+	struct rq *dst = cpu_rq(that), *src = cpu_rq(this);
+	struct task_struct *p;
+	int cpu;
+	bool cookie_check = false;
+	bool success = false;
+	const struct sched_class *class;
+
+	if (!available_idle_cpu(that))
+		return false;
+
+	if (sched_core_enabled(dst)) {
+		for_each_cpu(cpu, cpu_smt_mask(that)) {
+			if (cpu == that)
+				continue;
+			if (!available_idle_cpu(cpu)) {
+				cookie_check = true;
+				break;
+			}
+		}
+	}
+
+	guard(irq)();
+	double_rq_lock(dst, src);
+
+	for_each_class(class) {
+		if (!class->select_next_task_push)
+			continue;
+
+		p = class->select_next_task_push(src, NULL);
+		while (p) {
+			if (!is_cpu_allowed(p, that))
+				goto next;
+
+			if (sched_task_is_throttled(p, that))
+				goto next;
+
+			if (cookie_check && dst->core->core_cookie != p->core_cookie)
+				goto next;
+
+			deactivate_task(src, p, 0);
+			set_task_cpu(p, that);
+			activate_task(dst, p, 0);
+			wakeup_preempt(dst, p, 0);
+
+			success = true;
+			break;
+
+next:
+			p = class->select_next_task_push(src, p);
+		}
+	}
+
+	double_rq_unlock(dst, src);
+	return success;
+}
+
+static bool forceidle_push_tasks(int cpu, struct sched_domain *sd)
+{
+	int i;
+
+	for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
+		if (cpumask_test_cpu(i, cpu_smt_mask(cpu)))
+			continue;
+
+		if (need_resched())
+			break;
+
+		if (forceidle_try_push_task(cpu, i))
+			return true;
+	}
+	return false;
+}
+
 static void sched_core_balance(struct rq *rq)
 {
 	struct sched_domain *sd;
@@ -6349,11 +6424,20 @@ static void sched_core_balance(struct rq *rq)
 	raw_spin_rq_unlock_irq(rq);
 	for_each_domain(cpu, sd) {
 		if (need_resched())
-			break;
+			goto out;
 
 		if (steal_cookie_task(cpu, sd))
-			break;
+			goto out;
+	}
+	for_each_domain(cpu, sd) {
+		if (need_resched())
+			goto out;
+
+		if (forceidle_push_tasks(cpu, sd))
+			goto out;
 	}
+
+out:
 	raw_spin_rq_lock_irq(rq);
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c86a67762d1..a50cec23458c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13113,6 +13113,16 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu)
 #endif
 	return throttled_hierarchy(cfs_rq);
 }
+
+static struct task_struct *select_next_task_push_fair(struct rq *rq, struct task_struct *p)
+{
+	p = list_prepare_entry(p, &rq->cfs_tasks, se.group_node);
+	list_for_each_entry_continue_reverse(p, &rq->cfs_tasks, se.group_node) {
+		return p;
+	}
+	return NULL;
+}
+
 #else /* !CONFIG_SCHED_CORE: */
 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
 #endif /* !CONFIG_SCHED_CORE */
@@ -13674,6 +13684,7 @@ DEFINE_SCHED_CLASS(fair) = {
 
 #ifdef CONFIG_SCHED_CORE
 	.task_is_throttled	= task_is_throttled_fair,
+	.select_next_task_push	= select_next_task_push_fair,
 #endif
 
 #ifdef CONFIG_UCLAMP_TASK
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fdee101b1a66..bdcea16fca54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2477,6 +2477,7 @@ struct sched_class {
 
 #ifdef CONFIG_SCHED_CORE
 	int (*task_is_throttled)(struct task_struct *p, int cpu);
+	struct task_struct* (*select_next_task_push)(struct rq *rq, struct task_struct *p);
 #endif
 };
 
-- 
2.43.0




Amazon Development Centre (South Africa) (Proprietary) Limited
29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
Registration Number: 2004 / 034463 / 07
Re: [PATCH] sched/core: Push tasks on force idle
Posted by Peter Zijlstra 3 days, 7 hours ago
On Fri, Nov 28, 2025 at 03:19:54PM +0200, Fernand Sieber wrote:
> When a cpu enters force idle, it will
> 1) try to steal cookie matching tasks from other CPUs
> 2) do the newidle balance
> 
> If the stealing fails, we are out of options to get out of force idle
> properly. New idle balance might decide to pull other tasks, but they won't
> necessarily be matching anyway.
> 
> Introduce a step in between where we try to push the runnable tasks that
> are blocked in force idle to a more suitable CPU.
> 
> === Testing setup ===
> 
> Similar setup as in:
> https://lore.kernel.org/lkml/20251127202719.963766-1-sieberf@amazon.com
> 
> Testing is aimed at measuring perceived guest noise on hypervisor system
> with time shared scenarios.
> 
> Setup is on system where the load is nearing 100% which should allow no
> steal time. The system has 64 CPUs, with 8 VMs, each VM using core
> scheduling with 8 vCPUs per VM, time shared.
> 
> 7 VMs are running stressors (`stress-ng --cpu 0`) while the last VM is
> running the hwlat tracer with a width of 100ms, a period of 300ms, and
> a threshold of 100us. Each VM runs a cookied non vCPU VMM process that
> adds a light level of noise which forces some level of load balancing.
> 
> The test scenario is run 10x60s and the average noise is measured (we use
> breaches scaled up to period/width to estimate noise).
> 
> === Testing results ===
> 
> Baseline noise: 1.20%
> After patch noise: 0.66% (-45%)

This is similar to that other patch, what happens if you combine the
two?
Re: [PATCH] sched/core: Push tasks on force idle
Posted by Fernand Sieber 3 days, 6 hours ago
On Fri, Nov 28, 2025 at 02:38:22PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 28, 2025 at 03:19:54PM +0200, Fernand Sieber wrote:
> > When a cpu enters force idle, it will
> > 1) try to steal cookie matching tasks from other CPUs
> > 2) do the newidle balance
> >
> > If the stealing fails, we are out of options to get out of force idle
> > properly. New idle balance might decide to pull other tasks, but they
> > won't necessarily be matching anyway.
> >
> > Introduce a step in between where we try to push the runnable tasks
> > that are blocked in force idle to a more suitable CPU.
> >
> > === Testing setup ===
> >
> > Similar setup as in:
> > https://lore.kernel.org/lkml/20251127202719.963766-1-sieberf@amazon.com
> >
> > Testing is aimed at measuring perceived guest noise on hypervisor
> > system with time shared scenarios.
> >
> > Setup is on system where the load is nearing 100% which should allow no
> > steal time. The system has 64 CPUs, with 8 VMs, each VM using core
> > scheduling with 8 vCPUs per VM, time shared.
> >
> > 7 VMs are running stressors (`stress-ng --cpu 0`) while the last VM is
> > running the hwlat tracer with a width of 100ms, a period of 300ms, and
> > a threshold of 100us. Each VM runs a cookied non vCPU VMM process that
> > adds a light level of noise which forces some level of load balancing.
> >
> > The test scenario is run 10x60s and the average noise is measured (we
> > use breaches scaled up to period/width to estimate noise).
> >
> > === Testing results ===
> >
> > Baseline noise: 1.20%
> > After patch noise: 0.66% (-45%)
>
> This is similar to that other patch, what happens if you combine the
> two?

Noise results:
- Baseline: 1.20%
- Force idle aware LB: 0.63%
  (https://lore.kernel.org/lkml/20251127202719.963766-1-sieberf@amazon.com)
- Push force idle tasks: 0.66% (this patch)
- Both patches combined: 0.45%

Note: I realized I also ran these tests with this patch applied on
baseline:
"sched/fair: Add more core cookie check in wake up fast path"
https://lore.kernel.org/lkml/20251120101955.968586-1-sieberf@amazon.com
Ideally I would revert it and compute all improvements independently.
Prateek already reviewed that patch, I would appreciate if you could
take a look too.

I could post all the patches together, though I thought they are fairly
independent so it's easier to keep them separate.

Additionally, to craft these patches I examined inefficiency
opportunities tracked with scheduling ftrace dumps, for which I also
relied on a cookie tracepoint proposed here:
https://lore.kernel.org/lkml/20250128113410.263994-1-sieberf@amazon.com/



Amazon Development Centre (South Africa) (Proprietary) Limited
29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
Registration Number: 2004 / 034463 / 07