[PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 3 months ago
Add a randomized algorithm that runs newidle balancing proportional to
its success rate.

This improves schbench significantly:

 6.18-rc4:			2.22 Mrps/s
 6.18-rc4+revert:		2.04 Mrps/s
 6.18-rc4+revert+random:	2.18 Mrps/s

Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:

 6.17:			-6%
 6.17+revert:		 0%
 6.17+revert+random:	-1%

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched/topology.h |    3 ++
 kernel/sched/core.c            |    3 ++
 kernel/sched/fair.c            |   43 +++++++++++++++++++++++++++++++++++++----
 kernel/sched/features.h        |    5 ++++
 kernel/sched/sched.h           |    7 ++++++
 kernel/sched/topology.c        |    6 +++++
 6 files changed, 63 insertions(+), 4 deletions(-)

--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -92,6 +92,9 @@ struct sched_domain {
 	unsigned int nr_balance_failed; /* initialise to 0 */
 
 	/* idle_balance() stats */
+	unsigned int newidle_call;
+	unsigned int newidle_success;
+	unsigned int newidle_ratio;
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_updat
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
 
 #ifdef CONFIG_SCHED_PROXY_EXEC
 DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8589,6 +8590,8 @@ void __init sched_init_smp(void)
 {
 	sched_init_numa(NUMA_NO_NODE);
 
+	prandom_init_once(&sched_rnd_state);
+
 	/*
 	 * There's no userspace yet to cause hotplug operations; hence all the
 	 * CPU masks are stable and all blatant races in the below code cannot
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12146,11 +12146,26 @@ void update_max_interval(void)
 	max_load_balance_interval = HZ*num_online_cpus()/10;
 }
 
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+	sd->newidle_call++;
+	sd->newidle_success += success;
+
+	if (sd->newidle_call >= 1024) {
+		sd->newidle_ratio = sd->newidle_success;
+		sd->newidle_call /= 2;
+		sd->newidle_success /= 2;
+	}
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
 {
 	unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
 	unsigned long now = jiffies;
 
+	update_newidle_stats(sd, success);
+
 	if (cost > sd->max_newidle_lb_cost) {
 		/*
 		 * Track max cost of a domain to make sure to not delay the
@@ -12198,7 +12213,7 @@ static void sched_balance_domains(struct
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains.
 		 */
-		need_decay = update_newidle_cost(sd, 0);
+		need_decay = update_newidle_cost(sd, 0, 0);
 		max_cost += sd->max_newidle_lb_cost;
 
 		/*
@@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
 			break;
 
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			unsigned int weight = 1;
+
+			if (sched_feat(NI_RANDOM)) {
+				/*
+				 * Throw a 1k sided dice; and only run
+				 * newidle_balance according to the success
+				 * rate.
+				 */
+				u32 d1k = sched_rng() % 1024;
+				weight = 1 + sd->newidle_ratio;
+				if (d1k > weight) {
+					update_newidle_stats(sd, 0);
+					continue;
+				}
+				weight = (1024 + weight/2) / weight;
+			}
 
 			pulled_task = sched_balance_rq(this_cpu, this_rq,
 						   sd, CPU_NEWLY_IDLE,
@@ -12850,10 +12881,14 @@ static int sched_balance_newidle(struct
 
 			t1 = sched_clock_cpu(this_cpu);
 			domain_cost = t1 - t0;
-			update_newidle_cost(sd, domain_cost);
-
 			curr_cost += domain_cost;
 			t0 = t1;
+
+			/*
+			 * Track max cost of a domain to make sure to not delay the
+			 * next wakeup on the CPU.
+			 */
+			update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
 		}
 
 		/*
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
 SCHED_FEAT(UTIL_EST, true)
 
 SCHED_FEAT(LATENCY_WARN, false)
+
+/*
+ * Do newidle balancing proportional to its success rate using randomization.
+ */
+SCHED_FEAT(NI_RANDOM, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
 #ifndef _KERNEL_SCHED_SCHED_H
 #define _KERNEL_SCHED_SCHED_H
 
+#include <linux/prandom.h>
 #include <linux/sched/affinity.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/cpufreq.h>
@@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled
 }
 
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+	return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		this_cpu_ptr(&runqueues)
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1662,6 +1662,12 @@ sd_init(struct sched_domain_topology_lev
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
+
+		/* 50% success rate */
+		.newidle_call		= 512,
+		.newidle_success	= 256,
+		.newidle_ratio		= 512,
+
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,
 		.child			= child,
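
For reference, the gating logic above amounts to the following minimal
user-space sketch. This is an illustration only, not part of the patch:
rand() stands in for sched_rng() and its per-CPU prandom state, a single
structure stands in for the sched_domain, and the workload's true pull
rate is a knob.

#include <stdio.h>
#include <stdlib.h>

struct domain {
	unsigned int newidle_call;	/* attempts in the current window */
	unsigned int newidle_success;	/* weighted successes in the window */
	unsigned int newidle_ratio;	/* recent successes per 1024 calls */
};

/* Mirrors update_newidle_stats(): ~1024-call window, halved when full. */
static void update_newidle_stats(struct domain *sd, unsigned int success)
{
	sd->newidle_call++;
	sd->newidle_success += success;

	if (sd->newidle_call >= 1024) {
		sd->newidle_ratio = sd->newidle_success;
		sd->newidle_call /= 2;
		sd->newidle_success /= 2;
	}
}

/* One newidle opportunity; p_pull is the workload's true pull rate. */
static void newidle_opportunity(struct domain *sd, double p_pull)
{
	unsigned int weight = 1 + sd->newidle_ratio;
	unsigned int d1k = rand() % 1024;	/* stand-in for sched_rng() */

	if (d1k > weight) {
		/* A skipped balance still counts as a failed attempt. */
		update_newidle_stats(sd, 0);
		return;
	}

	/*
	 * The balance ran with probability ~weight/1024, so one success
	 * must count ~1024/weight times for the ratio to stay unbiased.
	 */
	weight = (1024 + weight/2) / weight;

	int pulled = rand() < p_pull * RAND_MAX;
	update_newidle_stats(sd, weight * !!pulled);
}

int main(void)
{
	/* sd_init() seeds the stats assuming a 50% success rate. */
	struct domain sd = { 512, 256, 512 };

	for (int i = 0; i < 100000; i++)
		newidle_opportunity(&sd, 0.10);	/* 10% real success rate */

	/* Expect something in the neighbourhood of 100/1024. */
	printf("newidle_ratio = %u/1024\n", sd.newidle_ratio);
	return 0;
}

With the 512/1024 seed from sd_init() the gate initially lets roughly
half of the opportunities through; the window halving then makes
newidle_ratio track a moving average of the recent success rate.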
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mario Roy 3 weeks ago
The patch "Proportional newidle balance" introduced a regression
with Linux 6.12.65 and 6.18.5. There is noticeable regression with
easyWave testing. [1]

The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
to install easyWave [2]. That is fetching the two tar.gz archives.


#!/bin/bash
# CXXFLAGS="-O3 $CXXFLAGS" ./configure
# make -j8

trap 'rm -f *.ssh *.idx *.log *.sshmax *.time' EXIT

OMP_NUM_THREADS=48 ./src/easywave \
   -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
   -time 1200


Before results with CachyOS 6.12.63-2 and 6.18.3-2 kernels.

easyWave ver.2013-04-11
Model time = 00:00:00,   elapsed: 0 msec
Model time = 00:10:00,   elapsed: 5 msec
Model time = 00:20:00,   elapsed: 10 msec
Model time = 00:30:00,   elapsed: 19 msec
...
Model time = 05:00:00,   elapsed: 2908 msec
Model time = 05:10:00,   elapsed: 3079 msec
Model time = 05:20:00,   elapsed: 3307 msec
Model time = 05:30:00,   elapsed: 3503 msec
...

After results with CachyOS 6.12.66-2 and 6.18.6-2 kernels.

easyWave ver.2013-04-11
Model time = 00:00:00,   elapsed: 0 msec
Model time = 00:10:00,   elapsed: 5 msec
Model time = 00:20:00,   elapsed: 10 msec
Model time = 00:30:00,   elapsed: 18 msec
...
Model time = 05:00:00,   elapsed: 13057 msec  (normal is < 3.0s)
Model time = 05:10:00,   elapsed: 13512 msec
Model time = 05:20:00,   elapsed: 13833 msec
Model time = 05:30:00,   elapsed: 14206 msec
...


Reverting the patch "sched/fair: Proportional newidle balance"
restores the prior performance.

[1] https://openbenchmarking.org/test/pts/easywave
[2] https://openbenchmarking.org/innhold/da7f1cf159033fdfbb925102284aea8a83e8afdc

On 11/7/25 11:06 AM, Peter Zijlstra wrote:
> [patch quoted in full; snipped]
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mohamed Abuelfotoh, Hazem 2 weeks ago
On 18/01/2026 20:46, Mario Roy wrote:
> The patch "Proportional newidle balance" introduced a regression
> in Linux 6.12.65 and 6.18.5. There is a noticeable regression in
> easyWave testing. [1]
>
> [...]
>
> Reverting the patch "sched/fair: Proportional newidle balance"
> restores the prior performance.
>
> [remainder and full patch quote snipped]

Hi All,

I can confirm that we are seeing a 4-11% performance regression in
v6.12.66 on multiple benchmarks running on c7a.4xlarge AWS EC2
instances, which are powered by AMD EPYC 9R14-series CPUs (code-named
Genoa), and on c7i.4xlarge instances, which are powered by
4th-generation Intel Xeon Scalable processors (code-named Sapphire
Rapids). The regression is caused by commit 33cf66d88306 ("sched/fair:
Proportional newidle balance"); we were able to reclaim the performance
by reverting this commit. We also noticed that the impact is higher on
AMD than on Intel.

Benchmark Name | Description                                | Unit
postgresql     | HammerDB workload (TPC-C-like benchmark)   | NOPM
nginx_lb       | Testing NGINX as a load balancer           | RPS
memcached      | Testing using the Lancet load generator    | QPS

**Results on v6.12.66**

Benchmark name | SUT EC2 Instance | Regression percentage
postgresql     | c7a.4xlarge      | -4.0%
postgresql     | c7i.4xlarge      | -4.0%
nginx_lb       | c7a.4xlarge      | -5.0%
memcached      | c7a.4xlarge      | -11.0%

We have also seen a smaller impact on v6.1.161, which contains the mentioned commit.

**Results on v6.1.161**

Benchmark name | SUT EC2 Instance | Regression percentage
nginx_lb       | c7a.4xlarge      | -3.0%
nginx_lb       | c7i.4xlarge      | -4.0%
memcached      | c7a.4xlarge      | -5.0%



Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 1 week, 6 days ago
On Sun, Jan 25, 2026 at 12:22:21PM +0000, Mohamed Abuelfotoh, Hazem wrote:

> [...]
>
> **Results on v6.12.66**
>
> Benchmark name | SUT EC2 Instance | Regression percentage
> postgresql     | c7a.4xlarge      | -4.0%
> postgresql     | c7i.4xlarge      | -4.0%
> nginx_lb       | c7a.4xlarge      | -5.0%
> memcached      | c7a.4xlarge      | -11.0%

So only postgres has a regression on Intel? Memcached doesn't show
anything?
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mohamed Abuelfotoh, Hazem 1 week, 4 days ago
On 27/01/2026 08:50, Peter Zijlstra wrote:
> [...]
>
> So only postgres has a regression on Intel? Memcached doesn't show
> anything?
True, memcached performance on Intel is exactly the same with and
without commit 33cf66d88306 ("sched/fair: Proportional newidle
balance"); the memcached regression is only visible on AMD. I also
tested on arm64 VMs using Graviton3 (based on Arm Neoverse V1) and
Graviton4 (based on Arm Neoverse V2), and I don't see any memcached
regression there.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 1 week, 5 days ago
On Tue, Jan 27, 2026 at 09:50:25AM +0100, Peter Zijlstra wrote:
> [...]
>
> So only postgres has a regression on Intel? Memcached doesn't show
> anything?

And just to be sure, v6.12.43-v6.12.65 have no problem?

That is, afaict those are the kernels that have:

  fc4289233e4b sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails

But not yet have:

  1b9c118fe318 sched/fair: Proportional newidle balance
  c6ae271bc5fd sched/fair: Small cleanup to update_newidle_cost()
  52aa889c6f57 sched/fair: Small cleanup to sched_balance_newidle()
  81343616e712 sched/fair: Revert max_newidle_lb_cost bump

Because fc4289233e4b was also causing a ton of regressions (but also
improving some workloads). 81343616e712 then reverts this and
1b9c118fe318 is supposed to be a compromise between these two.

So if your workloads are not affected by fc4289233e4b and 81343616e712,
but somehow 1b9c118fe318 is causing fail, then I'm a little puzzled.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mohamed Abuelfotoh, Hazem 1 week, 4 days ago
On 27/01/2026 09:13, Peter Zijlstra wrote:
> [...]
>
> And just to be sure, v6.12.43-v6.12.65 have no problem?
> 
> That is, afaict those are the kernels that have:
> 
>    fc4289233e4b sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails
> 
> But not yet have:
> 
>    1b9c118fe318 sched/fair: Proportional newidle balance
>    c6ae271bc5fd sched/fair: Small cleanup to update_newidle_cost()
>    52aa889c6f57 sched/fair: Small cleanup to sched_balance_newidle()
>    81343616e712 sched/fair: Revert max_newidle_lb_cost bump
> 
> Because fc4289233e4b was also causing a ton of regressions (but also
> improving some workloads). 81343616e712 then reverts this and
> 1b9c118fe318 is supposed to be a compromise between these two.
> 
> So if your workloads are not affected by fc4289233e4b and 81343616e712,
> but somehow 1b9c118fe318 is causing fail, then I'm a little puzzled.
> 
We have definitely seen a significant performance regression,
specifically on DB workloads, because of fc4289233e4b ("sched/fair:
Bump sd->max_newidle_lb_cost when newidle balance fails"), which we
reported in [1]. We were able to reclaim the performance with
81343616e712 ("sched/fair: Revert max_newidle_lb_cost bump"), before we
started seeing the negative impact from 1b9c118fe318 ("sched/fair:
Proportional newidle balance").



[1] https://lore.kernel.org/all/006c9df2-b691-47f1-82e6-e233c3f91faf@oracle.com/T/#mb96105e4a320659b5aa68ec112bbeafaae37e769
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 1 week, 6 days ago
On Sun, Jan 25, 2026 at 12:22:21PM +0000, Mohamed Abuelfotoh, Hazem wrote:
> I can confirm that we are seeing a 4-11% performance regression in v6.12.66
> on multiple benchmarks running on c7a.4xlarge AWS EC2 instances that are
> powered by AMD EPYC 9R14-series CPU (code-named Genoa) and c7i.4xlarge which
> is powered by 4th-Generation Intel Xeon Scalable processor (code-named
> Sapphire Rapids).

For those not speaking Amazon: what actual system setup is that Xeon? Is
that single socket or multi-socket?

Going by the name, the 4x would suggest a quad-socket Xeon, which are
somewhat beastly, but if I google this 'c7i.4xlarge' identifier, I get a
puny single socket 16cpu thing.

What is it?
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mohamed Abuelfotoh, Hazem 1 week, 4 days ago

On 27/01/2026 08:44, Peter Zijlstra wrote:
> [...]
>
> For those not speaking Amazon: what actual system setup is that Xeon? Is
> that single socket or multi-socket?
> 
> Going by the name, the 4x would suggest a quad-socket Xeon, which are
> somewhat beastly, but if I google this 'c7i.4xlarge' identifier, I get a
> puny single socket 16cpu thing.
> 
> What is it?

Hi Peter,

Apologies for the confusion. The "4x" is just Amazon's naming for EC2
instance sizing, basically the number of CPUs, the amount of memory, and
the network bandwidth. The naming has nothing to do with the exact
number of sockets within the VM.

Below are the hardware specs for both c7i.4xlarge & c7a.4xlarge.

c7i.4xlarge

CPU Model: Intel(R) Xeon(R) Platinum 8488C
Number of CPUs: 16
Memory: 32 GB
Number of sockets: 1

-------------------------------------------------------------------------

c7a.4xlarge

CPU Model: AMD EPYC 9R14
Number of CPUs: 16
Memory: 32 GB
Number of sockets: 1
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 1 week, 3 days ago
On Wed, Jan 28, 2026 at 03:48:13PM +0000, Mohamed Abuelfotoh, Hazem wrote:

> Below are the hardware specs for both c7i.4xlarge & c7a.4xlarge.
> 
> c7i.4xlarge
> 
> CPU Model: Intel(R) Xeon(R) Platinum 8488C
> Number of CPUs: 16
> Memory: 32 GB
> Number of sockets: 1

But the 8488C is a 56 core part, with 112 threads. So you're handing out
8 core partitions of that thing, for 7 such instances on one machine?

(Also, calling anything 16 core with 32GB 'large' is laughable, that's
laptop territory.)

> -------------------------------------------------------------------------
> 
> c7a.4xlarge
> 
> CPU Model: AMD EPYC 9R14
> Number of CPUs: 16
> Memory: 32 GB
> Number of sockets: 1

And that 9r14 is a 96 core part, 12 CCDs, 8 cores each. So you're again
handing out partitions of that.



For both cases, are these partitions fixed? Specifically in the AMD case,
are you handing out exactly 1 CCD per partition?

Because if so, I'm mighty confused by the results. 8 cores, 16 threads
is exactly one CCD worth of Zen4 and should therefore be a single L3 and
behave exactly like the Intel thing.

Something is missing here.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mohamed Abuelfotoh, Hazem 1 week, 2 days ago
On 29/01/2026 09:19, Peter Zijlstra wrote:
> [...]
>
> But the 8488C is a 56 core part, with 112 threads. So you're handing out
> 8 core partitions of that thing, for 7 such instances on one machine?
> 

c7i.4xlarge is an EC2 instance, which is basically a virtual machine
running on the Nitro KVM-based hypervisor. The VM shares the host with
other VMs, which explains why Amazon doesn't allocate all the host CPU
resources to a single VM. There are larger EC2 instance sizes where a
single VM occupies the whole host, for example c7i.48xlarge, which has
192 vCPUs. Your conclusion is right: c7i.4xlarge has 8 physical cores
with HT enabled, which adds up to 16 vCPUs.

> (Also, calling anything 16 core with 32GB 'large' is laughable, that's
> laptop territory.)
> 
>> -------------------------------------------------------------------------
>>
>> c7a.4xlarge
>>
>> CPU Model: AMD EPYC 9R14
>> Number of CPUs: 16
>> Memory: 32 GB
>> Number of sockets: 1
> 
> And that 9r14 is a 96 core part, 12 CCDs, 8 cores each. So you're again
> handing out partitions of that.
> 
> 
> 
> For both cases, are these partitions fixed? Specifically in the AMD case,
> are you handing out exactly 1 CCD per partition?
> 
> Because if so, I'm mighty confused by the results. 8 cores, 16 threads
> is exactly one CCD worth of Zen4 and should therefore be a single L3 and
> behave exactly like the Intel thing.
> 
> Something is missing here.

The main difference between the Intel-based c7i.4xlarge and the
AMD-based c7a.4xlarge is that on Intel we have HT enabled, so the
instance has 16 vCPUs that are really 8 physical cores with HT. On AMD
the VM comes with 16 physical cores and no HT, so it spans 2 CCDs (and
thus 2 L3 caches), while on Intel we have a single L3 cache. I am also
adding the output of lscpu on both instances to clarify the
architectural discrepancies between the two.


**c7i.4xlarge**

# lscpu
Architecture:                x86_64
   CPU op-mode(s):            32-bit, 64-bit
   Address sizes:             46 bits physical, 48 bits virtual
   Byte Order:                Little Endian
CPU(s):                      16
   On-line CPU(s) list:       0-15
Vendor ID:                   GenuineIntel
   BIOS Vendor ID:            Intel(R) Corporation
   Model name:                Intel(R) Xeon(R) Platinum 8488C
     BIOS Model name:         Intel(R) Xeon(R) Platinum 8488C
     CPU family:              6
     Model:                   143
     Thread(s) per core:      2
     Core(s) per socket:      8
     Socket(s):               1
     Stepping:                8
     BogoMIPS:                4800.00
     Flags:                   fpu vme de pse tsc msr pae mce cx8 apic sep
                              mtrr pge mca cmov pat pse36 clflush mmx fxsr
                              sse sse2 ss ht syscall nx pdpe1gb rdtscp lm
                              constant_tsc arch_perfmon rep_good nopl
                              xtopology nonstop_tsc cpuid aperfmperf
                              tsc_known_freq pni pclmulqdq monitor ssse3
                              fma cx16 pdcm pcid sse4_1 sse4_2 x2apic
                              movbe popcnt tsc_deadline_timer aes xsave
                              avx f16c rdrand hypervisor lahf_lm abm
                              3dnowprefetch cpuid_fault invpcid_single
                              ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase
                              tsc_adjust bmi1 avx2 smep bmi2 erms invpcid
                              avx512f avx512dq rdseed adx smap avx512ifma
                              clflushopt clwb avx512cd sha_ni avx512bw
                              avx512vl xsaveopt xsavec xgetbv1 xsaves
                              avx_vnni avx512_bf16 wbnoinvd ida arat
                              avx512vbmi umip pku ospke waitpkg
                              avx512_vbmi2 gfni vaes vpclmulqdq
                              avx512_vnni avx512_bitalg tme
                              avx512_vpopcntdq rdpid cldemote movdiri
                              movdir64b md_clear serialize amx_bf16
                              avx512_fp16 amx_tile amx_int8 flush_l1d
                              arch_capabilities
Virtualization features:
   Hypervisor vendor:         KVM
   Virtualization type:       full
Caches (sum of all):
   L1d:                       384 KiB (8 instances)
   L1i:                       256 KiB (8 instances)
   L2:                        16 MiB (8 instances)
   L3:                        105 MiB (1 instance)
NUMA:
   NUMA node(s):              1
   NUMA node0 CPU(s):         0-15

-------------------------------------------------------------------------

**c7a.4xlarge**

# lscpu
Architecture:                x86_64
   CPU op-mode(s):            32-bit, 64-bit
   Address sizes:             48 bits physical, 48 bits virtual
   Byte Order:                Little Endian
CPU(s):                      16
   On-line CPU(s) list:       0-15
Vendor ID:                   AuthenticAMD
   BIOS Vendor ID:            Advanced Micro Devices, Inc.
   Model name:                AMD EPYC 9R14
     BIOS Model name:         AMD EPYC 9R14
     CPU family:              25
     Model:                   17
     Thread(s) per core:      1
     Core(s) per socket:      16
     Socket(s):               1
     Stepping:                1
     BogoMIPS:                5199.99
     Flags:                   fpu vme de pse tsc msr pae mce cx8 apic sep
                              mtrr pge mca cmov pat pse36 clflush mmx fxsr
                              sse sse2 ht syscall nx mmxext fxsr_opt
                              pdpe1gb rdtscp lm constant_tsc rep_good nopl
                              nonstop_tsc cpuid extd_apicid aperfmperf
                              tsc_known_freq pni pclmulqdq monitor ssse3
                              fma cx16 pcid sse4_1 sse4_2 x2apic movbe
                              popcnt aes xsave avx f16c rdrand hypervisor
                              lahf_lm cmp_legacy cr8_legacy abm sse4a
                              misalignsse 3dnowprefetch topoext
                              perfctr_core invpcid_single ssbd perfmon_v2
                              ibrs ibpb stibp ibrs_enhanced vmmcall
                              fsgsbase bmi1 avx2 smep bmi2 invpcid avx512f
                              avx512dq rdseed adx smap avx512ifma
                              clflushopt clwb avx512cd sha_ni avx512bw
                              avx512vl xsaveopt xsavec xgetbv1 xsaves
                              avx512_bf16 clzero xsaveerptr rdpru wbnoinvd
                              arat avx512vbmi pku ospke avx512_vbmi2 gfni
                              vaes vpclmulqdq avx512_vnni avx512_bitalg
                              avx512_vpopcntdq rdpid flush_l1d
Virtualization features:
   Hypervisor vendor:         KVM
   Virtualization type:       full
Caches (sum of all):
   L1d:                       512 KiB (16 instances)
   L1i:                       512 KiB (16 instances)
   L2:                        16 MiB (16 instances)
   L3:                        64 MiB (2 instances)
NUMA:
   NUMA node(s):              1
   NUMA node0 CPU(s):         0-15
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 6 days, 22 hours ago
On Fri, Jan 30, 2026 at 01:16:52PM +0000, Mohamed Abuelfotoh, Hazem wrote:

> [...]

OK, that clarifies.

How does the NI_RATE patch work for you?

  https://lkml.kernel.org/r/20260127151748.GA1079264@noisy.programming.kicks-ass.net
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mohamed Abuelfotoh, Hazem 6 days, 21 hours ago
On 02/02/2026 10:51, Peter Zijlstra wrote:
> 
> OK, that clarifies.
> 
> How does the NI_RATE patch work for you?
> 
>    https://lkml.kernel.org/r/20260127151748.GA1079264@noisy.programming.kicks-ass.net

Sure, I am going to apply this patch and report the results.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mohamed Abuelfotoh, Hazem 4 days, 20 hours ago
On 02/02/2026 11:07, Mohamed Abuelfotoh, Hazem wrote:
> On 02/02/2026 10:51, Peter Zijlstra wrote:
>>
>> OK, that clarifies.
>>
>> How does the NI_RATE patch work for you?
>>
>>    https://lkml.kernel.org/r/20260127151748.GA1079264@noisy.programming.kicks-ass.net
> 
> Sure I am going to apply this patch and report the results.
> 
> 
> 
I have tested the patch proposed in [1] on top of v6.12.66 and
unfortunately I haven't seen any improvement. I compared the results
between v6.12.66, v6.12.66_revert_1b9c118fe318 (which reverts
1b9c118fe318 ("sched/fair: Proportional newidle balance")), and
v6.12.66_proposed (which is v6.12.66 plus the patch proposed in [1]). I
mainly focused on the AMD-based c7a.4xlarge, which was highly impacted
as previously discussed in this thread. The baseline is
v6.12.66_revert_1b9c118fe318, as it showed the best performance among
the available kernel options.

Version           | Benchmark name   | SUT EC2 Instance| diff %
v6.12.66          | postgresql       | c7a.4xlarge     | -4.0%
v6.12.66          | nginx_lb         | c7a.4xlarge     | -5.0%
v6.12.66          | memcached        | c7a.4xlarge     | -11.0%
v6.12.66_proposed | postgresql       | c7a.4xlarge     | -4.0%
v6.12.66_proposed | nginx_lb         | c7a.4xlarge     | -5.0%
v6.12.66_proposed | memcached        | c7a.4xlarge     | -13.0%


Furthermore, we have also seen an around 10-20% randwrite fio
performance regression on v6.18.5, only on AMD-based VMs. We confirmed
that this regression is caused by 1b9c118fe318 ("sched/fair:
Proportional newidle balance"). We are currently testing the patch
proposed in [1] to see if it helps with this regression, and will share
the results, reproduction steps, and environment in the next update.


[1] https://lkml.kernel.org/r/20260127151748.GA1079264@noisy.programming.kicks-ass.net
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 4 days, 19 hours ago
On Wed, Feb 04, 2026 at 12:45:41PM +0000, Mohamed Abuelfotoh, Hazem wrote:

> Version           | Benchmark name   | SUT EC2 Instance| diff %
> v6.12.66          | postgresql       | c7a.4xlarge     | -4.0%
> v6.12.66          | nginx_lb         | c7a.4xlarge     | -5.0%
> v6.12.66          | memcached        | c7a.4xlarge     | -11.0%
> v6.12.66_proposed | postgresql       | c7a.4xlarge     | -4.0%
> v6.12.66_proposed | nginx_lb         | c7a.4xlarge     | -5.0%
> v6.12.66_proposed | memcached        | c7a.4xlarge     | -13.0%

*sigh*, that actually made it worse for memcached :/ I'm not familiar
with the memcached benchmark, is this something I can run on a single
machine, or does it require high speed networking and 2 machines?
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mohamed Abuelfotoh, Hazem 4 days, 19 hours ago
On 04/02/2026 13:27, Peter Zijlstra wrote:
> On Wed, Feb 04, 2026 at 12:45:41PM +0000, Mohamed Abuelfotoh, Hazem wrote:
> 
>> Version           | Benchmark name   | SUT EC2 Instance| diff %
>> v6.12.66          | postgresql       | c7a.4xlarge     | -4.0%
>> v6.12.66          | nginx_lb         | c7a.4xlarge     | -5.0%
>> v6.12.66          | memcached        | c7a.4xlarge     | -11.0%
>> v6.12.66_proposed | postgresql       | c7a.4xlarge     | -4.0%
>> v6.12.66_proposed | nginx_lb         | c7a.4xlarge     | -5.0%
>> v6.12.66_proposed | memcached        | c7a.4xlarge     | -13.0%
> 
> *sigh*, that actually made it worse for memcached :/ I'm not familiar
> with the memcached benchmark, is this something I can run on a single
> machine, or does it require high speed networking and 2 machines?

Yup, that's true, it's slightly worse on memcached with the proposed
fix :( The memcached benchmark is a multi-layer test where you need at
least 2 client machines, 1 server machine, and 1 machine as a test
coordinator. The server VM is able to achieve 12.5 Gbps of network
bandwidth and each client is able to achieve 30 Gbps, so I think it will
be tricky and likely impossible to reproduce this on a single machine. I
will try to come up with standalone reproduction steps that can be used
to investigate this memcached regression. Meanwhile we will share the
fio regression reproduction steps that I mentioned in my previous
update; those are much simpler and can be run on a single machine.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 4 days, 18 hours ago
On Wed, Feb 04, 2026 at 01:59:24PM +0000, Mohamed Abuelfotoh, Hazem wrote:
> [...]
>
> Yup, that's true, it's slightly worse on memcached with the proposed fix :(
> The memcached benchmark is a multi-layer test where you need at least 2
> client machines, 1 server machine, and 1 machine as a test coordinator.
> The server VM is able to achieve 12.5 Gbps of network bandwidth and each
> client is able to achieve 30 Gbps, so I think it will be tricky and likely
> impossible to reproduce this on a single machine.

Urgh, yeah, while I have multiple machines, no two of them are the same,
and I can only offer 1 Gbps of networking; that's not going to keep
anything busy.

> I will try to
> come up with standalone reproduction steps that can be used to investigate
> this memcached regression. Meanwhile we will share the fio regression
> reproduction steps that I mentioned in my previous update; those are much
> simpler and can be run on a single machine.

Thanks! I have a few machines with a 'spare' nvme drive to run things
on, hopefully that is sufficient.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mohamed Abuelfotoh, Hazem 4 days, 10 hours ago
On 04/02/2026 14:05, Peter Zijlstra wrote:

>> I will try to
>> come up with standalone reproduction steps that can be used to investigate
>> this memcached regression. Meanwhile we will share the fio regression
>> reproduction steps that I mentioned in my previous update; those are much
>> simpler and can be run on a single machine.
> 
> Thanks! I have a few machines with a 'spare' nvme drive to run things
> on, hopefully that is sufficient.

It looks like the previously reported fio regression has been fully
mitigated by the proposed patch [1]. I verified this on both 6.18.5 and
6.12.66. I will try to come up with a standalone reproduction for the
memcached regression to make debugging easier.


**fio regression reproduction environment**

AWS EC2 instance: c5ad.24xlarge

96 vCPUs = 48 Cores with HT
12 CCDs
Memory : 192 GiB
SSD Disk space: 1900 GiB
SSD Disk Max write IOPS: 180K
SSD Disk Max Write BW: 760 MB/sec

Below are the results of three different runs.

6.18.5          - unmodified 6.18.5
6.18.5_revert   - 6.18.5 with 1b9c118fe318 ("sched/fair: Proportional
                  newidle balance") reverted
6.18.5_proposed - 6.18.5 with patch [1]

---------------------------------------------------------------

Version 6.18.5

# sudo fio --time_based --name=benchmark --size=50G --runtime=60 \
    --filename=/dev/nvme1n1 --ioengine=psync --randrepeat=0 --iodepth=1 \
    --fsync=64 --invalidate=1 --verify=0 --verify_fatal=0 --blocksize=4k \
    --group_reporting --rw=randwrite --numjobs=4

Run status group 0 (all jobs):
   WRITE: bw=478MiB/s (501MB/s), 478MiB/s-478MiB/s (501MB/s-501MB/s), io=28.0GiB (30.1GB), run=60003-60003msec

----------------------------------------------------------------

Version 6.18.5_revert

# sudo fio --time_based --name=benchmark --size=50G --runtime=60 \
    --filename=/dev/nvme1n1 --ioengine=psync --randrepeat=0 --iodepth=1 \
    --fsync=64 --invalidate=1 --verify=0 --verify_fatal=0 --blocksize=4k \
    --group_reporting --rw=randwrite --numjobs=4

Run status group 0 (all jobs):
   WRITE: bw=549MiB/s (575MB/s), 549MiB/s-549MiB/s (575MB/s-575MB/s), io=32.2GiB (34.5GB), run=60002-60002msec

-----------------------------------------------------------------

Version 6.18.5_proposed

# sudo fio --time_based --name=benchmark --size=50G --runtime=60 \
    --filename=/dev/nvme1n1 --ioengine=psync --randrepeat=0 --iodepth=1 \
    --fsync=64 --invalidate=1 --verify=0 --verify_fatal=0 --blocksize=4k \
    --group_reporting --rw=randwrite --numjobs=4

Run status group 0 (all jobs):
   WRITE: bw=551MiB/s (578MB/s), 551MiB/s-551MiB/s (578MB/s-578MB/s), io=32.3GiB (34.7GB), run=60003-60003msec


[1] https://lore.kernel.org/all/20260127151748.GA1079264@noisy.programming.kicks-ass.net/T/#u
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 1 week, 3 days ago
On Thu, Jan 29, 2026 at 10:19:37AM +0100, Peter Zijlstra wrote:
> [...]

Also, are you employing Intel-CAT on these partitions to separate the
L3s?

(Not immediately relevant I suppose, but I was curious)

Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mohamed Abuelfotoh, Hazem 1 week, 2 days ago
On 29/01/2026 09:24, Peter Zijlstra wrote:
> [...]
>
> Also, are you employing Intel-CAT on these partitions to separate the
> L3s?
> 
> (Not immediately relevant I suppose, but I was curious)
> 
We don't enable Intel-CAT to partition L3 cache between VMs.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 2 weeks, 2 days ago
On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
> The patch "Proportional newidle balance" introduced a regression
> in Linux 6.12.65 and 6.18.5. There is a noticeable regression in
> easyWave testing. [1]
> 
> The CPU is an AMD Threadripper 9960X (24 cores / 48 threads). I followed
> the instructions in [2] to install easyWave, which fetches the two
> tar.gz archives.

What is the actual configuration of that chip? Is it like 3*8 or 4*6
(CCX-wise)? A quick google couldn't find me the answer :/

> #!/bin/bash
> # CXXFLAGS="-O3 $CXXFLAGS" ./configure
> # make -j8
> 
> trap 'rm -f *.ssh *.idx *.log *.sshmax *.time' EXIT
> 
> OMP_NUM_THREADS=48 ./src/easywave \
>   -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
>   -time 1200
> 
> 
> Before results with CachyOS 6.12.63-2 and 6.18.3-2 kernels.

So the problem is that 6.12 -> 6.18 is an enormous amount of kernel
releases :/ This patch in particular was an effort to fix a regression
caused by:

  155213a2aed4 ("sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails")

I'm thinking that if you revert all 4 patches of this series your
performance will be even worse?

Anyway, my guess is that somehow this benchmark likes doing newidle even
if it is often not successful. I'll see if I can reproduce this on one
of my machines, but that might take a little while.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 2 weeks, 2 days ago
On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
> On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
> > The patch "Proportional newidle balance" introduced a regression
> > with Linux 6.12.65 and 6.18.5. There is noticeable regression with
> > easyWave testing. [1]
> > 
> > The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
> > to install easyWave [2]. That is fetching the two tar.gz archives.
> 
> What is the actual configuration of that chip? Is it like 3*8 or 4*6
> (CCX wise). A quick google couldn't find me the answer :/

Obviously I found it right after sending this. It's a 4x6 config.
Meaning it needs newidle to balance between those 4 domains.

Prateek -- are you guys still considering that SIS_NODE thing? That
worked really well for workstation chips, but there were some issues on
Epyc or so.

> > #!/bin/bash
> > # CXXFLAGS="-O3 $CXXFLAGS" ./configure
> > # make -j8
> > 
> > trap 'rm -f *.ssh *.idx *.log *.sshmax *.time' EXIT
> > 
> > OMP_NUM_THREADS=48 ./src/easywave \
> >   -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
> >   -time 1200
> > 
> > 
> > Before results with CachyOS 6.12.63-2 and 6.18.3-2 kernels.
> 
> So the problem is that 6.12 -> 6.18 is an enormous amount of kernel
> releases :/ This patch in particular was an effort to fix a regression
> caused by:
> 
>   155213a2aed4 ("sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails")
> 
> I'm thinking that if you revert all 4 patches of this series your
> performance will be even worse?
> 
> Anyway, my guess is that somehow this benchmark likes doing newidle even
> if it is often not successful. I'll see if I can reproduce this on one
> of my machines, but that might take a little while.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 1 week, 5 days ago
On Fri, Jan 23, 2026 at 12:03:06PM +0100, Peter Zijlstra wrote:
> On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
> > On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
> > > The patch "Proportional newidle balance" introduced a regression
> > > with Linux 6.12.65 and 6.18.5. There is noticeable regression with
> > > easyWave testing. [1]
> > > 
> > > The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
> > > to install easyWave [2]. That is fetching the two tar.gz archives.
> > 
> > What is the actual configuration of that chip? Is it like 3*8 or 4*6
> > (CCX wise). A quick google couldn't find me the answer :/
> 
> Obviously I found it right after sending this. It's a 4x6 config.
> Meaning it needs newidle to balance between those 4 domains.

So with the below patch on top of my Xeon w7-2495X (which is 24-core
48-thread) I too have 4 LLC :-)

And I think I can see a slight difference, but nowhere near as terrible.

Let me go stick some tracing on.

cpu0 0 0 0 0 0 0 199480591279 9327118209 21136
domain0 SMT 0000,01000001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 MC 1111,11111111 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 PKG ffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 205007928818 2654503460 14772
domain0 SMT 0000,02000002 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 MC 2222,22222222 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 PKG ffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu2 0 0 0 0 0 0 190458000839 2361863044 13265
domain0 SMT 0000,04000004 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 MC 4444,44444444 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 PKG ffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu3 0 0 0 0 0 0 193040171114 2769182152 16215
domain0 SMT 0000,08000008 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 MC 8888,88888888 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 PKG ffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
...


easywave# echo NI_RANDOM > /debug/sched/features; OMP_NUM_THREADS=48 ./src/easywave   -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt   -time 300

easyWave ver.2013-04-11
Model time = 00:00:00,   elapsed: 2 msec
Model time = 00:10:00,   elapsed: 6 msec
Model time = 00:20:00,   elapsed: 13 msec
Model time = 00:30:00,   elapsed: 21 msec
Model time = 00:40:00,   elapsed: 33 msec
Model time = 00:50:00,   elapsed: 59 msec
Model time = 01:00:00,   elapsed: 136 msec
Model time = 01:10:00,   elapsed: 160 msec
Model time = 01:20:00,   elapsed: 189 msec
Model time = 01:30:00,   elapsed: 266 msec
Model time = 01:40:00,   elapsed: 321 msec
Model time = 01:50:00,   elapsed: 401 msec
Model time = 02:00:00,   elapsed: 482 msec
Model time = 02:10:00,   elapsed: 619 msec
Model time = 02:20:00,   elapsed: 731 msec
Model time = 02:30:00,   elapsed: 856 msec
Model time = 02:40:00,   elapsed: 1013 msec
Model time = 02:50:00,   elapsed: 1204 msec
Model time = 03:00:00,   elapsed: 1437 msec
Model time = 03:10:00,   elapsed: 1715 msec
Model time = 03:20:00,   elapsed: 1952 msec
Model time = 03:30:00,   elapsed: 2713 msec
Model time = 03:40:00,   elapsed: 3090 msec
Model time = 03:50:00,   elapsed: 3644 msec
Model time = 04:00:00,   elapsed: 4157 msec
Model time = 04:10:00,   elapsed: 4632 msec
Model time = 04:20:00,   elapsed: 5131 msec
Model time = 04:30:00,   elapsed: 5685 msec
Model time = 04:40:00,   elapsed: 6404 msec
Model time = 04:50:00,   elapsed: 7154 msec
Model time = 05:00:00,   elapsed: 8143 msec


easywave# echo NO_NI_RANDOM > /debug/sched/features; OMP_NUM_THREADS=48 ./src/easywave   -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt   -time 300

easyWave ver.2013-04-11
Model time = 00:00:00,   elapsed: 1 msec
Model time = 00:10:00,   elapsed: 6 msec
Model time = 00:20:00,   elapsed: 12 msec
Model time = 00:30:00,   elapsed: 21 msec
Model time = 00:40:00,   elapsed: 33 msec
Model time = 00:50:00,   elapsed: 94 msec
Model time = 01:00:00,   elapsed: 114 msec
Model time = 01:10:00,   elapsed: 138 msec
Model time = 01:20:00,   elapsed: 191 msec
Model time = 01:30:00,   elapsed: 227 msec
Model time = 01:40:00,   elapsed: 272 msec
Model time = 01:50:00,   elapsed: 322 msec
Model time = 02:00:00,   elapsed: 381 msec
Model time = 02:10:00,   elapsed: 458 msec
Model time = 02:20:00,   elapsed: 634 msec
Model time = 02:30:00,   elapsed: 861 msec
Model time = 02:40:00,   elapsed: 1050 msec
Model time = 02:50:00,   elapsed: 1265 msec
Model time = 03:00:00,   elapsed: 1463 msec
Model time = 03:10:00,   elapsed: 1658 msec
Model time = 03:20:00,   elapsed: 1892 msec
Model time = 03:30:00,   elapsed: 2243 msec
Model time = 03:40:00,   elapsed: 2672 msec
Model time = 03:50:00,   elapsed: 3038 msec
Model time = 04:00:00,   elapsed: 3462 msec
Model time = 04:10:00,   elapsed: 3961 msec
Model time = 04:20:00,   elapsed: 4455 msec
Model time = 04:30:00,   elapsed: 5040 msec
Model time = 04:40:00,   elapsed: 5594 msec
Model time = 04:50:00,   elapsed: 6190 msec
Model time = 05:00:00,   elapsed: 7065 msec


---
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a24c7805acdb..d0d7cefb6cd3 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -699,6 +699,11 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu)
 	return per_cpu(cpu_info.topo.l2c_id, cpu);
 }
 
+static inline u32 per_cpu_core_id(unsigned int cpu)
+{
+	return per_cpu(cpu_info.topo.core_id, cpu);
+}
+
 #ifdef CONFIG_CPU_SUP_AMD
 /*
  * Issue a DIV 0/1 insn to clear any division data from previous DIV
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5cd6950ab672..5e7349c0f6ed 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -438,6 +438,9 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
 		return false;
 
+	if ((per_cpu_core_id(cpu1) % 4) != (per_cpu_core_id(cpu2) % 4))
+		return false;
+
 	/*
 	 * Allow the SNC topology without warning. Return of false
 	 * means 'c' does not share the LLC of 'o'. This will be
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 1 week, 5 days ago
On Tue, Jan 27, 2026 at 11:40:41AM +0100, Peter Zijlstra wrote:
> On Fri, Jan 23, 2026 at 12:03:06PM +0100, Peter Zijlstra wrote:
> > On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
> > > On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
> > > > The patch "Proportional newidle balance" introduced a regression
> > > > with Linux 6.12.65 and 6.18.5. There is noticeable regression with
> > > > easyWave testing. [1]
> > > > 
> > > > The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
> > > > to install easyWave [2]. That is fetching the two tar.gz archives.
> > > 
> > > What is the actual configuration of that chip? Is it like 3*8 or 4*6
> > > (CCX wise). A quick google couldn't find me the answer :/
> > 
> > Obviously I found it right after sending this. It's a 4x6 config.
> > Meaning it needs newidle to balance between those 4 domains.
> 
> So with the below patch on top of my Xeon w7-2495X (which is 24-core
> 48-thread) I too have 4 LLC :-)
> 
> And I think I can see a slight difference, but nowhere near as terrible.
> 
> Let me go stick some tracing on.

Does this help some?

Turns out, this easywave thing has a very low newidle rate, but then
also a fairly low success rate. But since it doesn't do it that often,
the cost isn't that significant, so we might as well always do it.

This adds a second term to the ratio computation that takes time into
account. For low-rate newidle this term will dominate, while at higher
rates the success ratio is more important.

Chris, afaict this still DTRT for schbench, but if this works for Mario,
could you also re-run things at your end?

[ the 4 'second' thing is a bit random, but looking at the timings
  between easywave and schbench this seems to be a reasonable middle
  ground. Although I think 8 'seconds' -- 23 shift -- would also work.

  That would give:

    1024 -  8  s -   64 Hz
     512 -  4  s -  128 Hz
     256 -  2  s -  256 Hz
     128 -  1  s -  512 Hz
      64 - .5  s - 1024 Hz
      32 - .25 s - 2048 Hz
]
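
[ To make the shift arithmetic concrete: sched_clock() counts
  nanoseconds, so with the 22-bit shift the patch below uses, one ratio
  point is ~4.19 ms of wall time and the full 1024 is 2^32 ns (~4.3 s,
  the "4 s" row of the in-patch table). A window spans the 512 calls
  between newidle_call restarting at 512 and reaching 1024 again, hence
  512 calls / 4 s ~= 128 Hz. A minimal userspace sketch of just this
  window term -- not the kernel code; timestamp deltas are assumed to
  be in nanoseconds like sched_clock():

	#include <stdio.h>
	#include <stdint.h>

	static unsigned int rate_term(uint64_t delta_ns)
	{
		/* one ratio point per 2^22 ns ~= 4.19 ms of window time */
		return delta_ns >> 22;
	}

	int main(void)
	{
		printf("%u\n", rate_term(1ULL << 32));	/* 1024: ~4.3 s between windows */
		printf("%u\n", rate_term(1ULL << 28));	/*   64: ~268 ms, the ".25 s" row */
		return 0;
	}
]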

---

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 45c0022b91ce..a1e1032426dc 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -95,6 +95,7 @@ struct sched_domain {
 	unsigned int newidle_call;
 	unsigned int newidle_success;
 	unsigned int newidle_ratio;
+	u64 newidle_stamp;
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eca642295c4b..ab9cf06c6a76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12224,8 +12224,31 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
 	sd->newidle_call++;
 	sd->newidle_success += success;
 
 	if (sd->newidle_call >= 1024) {
-		sd->newidle_ratio = sd->newidle_success;
+		u64 now = sched_clock();
+		s64 delta = now - sd->newidle_stamp;
+		sd->newidle_stamp = now;
+		int ratio = 0;
+
+		if (delta < 0)
+			delta = 0;
+
+		if (sched_feat(NI_RATE)) {
+			/*
+			 * ratio  delta   freq
+			 *
+			 * 1024 -  4  s -  128 Hz
+			 *  512 -  2  s -  256 Hz
+			 *  256 -  1  s -  512 Hz
+			 *  128 - .5  s - 1024 Hz
+			 *   64 - .25 s - 2048 Hz
+			 */
+			ratio = delta >> 22;
+		}
+
+		ratio += sd->newidle_success;
+
+		sd->newidle_ratio = min(1024, ratio);
 		sd->newidle_call /= 2;
 		sd->newidle_success /= 2;
 	}
@@ -12932,7 +12959,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			unsigned int weight = 1;
 
-			if (sched_feat(NI_RANDOM)) {
+			if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
 				/*
 				 * Throw a 1k sided dice; and only run
 				 * newidle_balance according to the success
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 980d92bab8ab..7aba7523c6c1 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -126,3 +126,4 @@ SCHED_FEAT(LATENCY_WARN, false)
  * Do newidle balancing proportional to its success rate using randomization.
  */
 SCHED_FEAT(NI_RANDOM, true)
+SCHED_FEAT(NI_RATE, true)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..05741f18f334 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
 #include <linux/bsearch.h>
 #include "sched.h"
 
@@ -1637,6 +1638,7 @@ sd_init(struct sched_domain_topology_level *tl,
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	int sd_id, sd_weight, sd_flags = 0;
 	struct cpumask *sd_span;
+	u64 now = sched_clock();
 
 	sd_weight = cpumask_weight(tl->mask(tl, cpu));
 
@@ -1674,6 +1676,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		.newidle_call		= 512,
 		.newidle_success	= 256,
 		.newidle_ratio		= 512,
+		.newidle_stamp		= now,
 
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mario Roy 1 week, 3 days ago
I missed stating that higher is better for the stress-ng socket tests.

For clarity, I find it difficult to know for certain whether a scheduler
patch is good or bad without the prefer-idle-core results. A fix may
resolve one issue only to introduce another, and because of the limited
CPU saturation anomaly with EEVDF I'm not qualified to say for certain
whether the fix in question introduces one.

EEVDF turns out to be amazing. However, the folks in my circle,
including myself, are constantly worried about the ups and downs
with EEVDF, mainly with the stable kernels.

We consider varied testing one way to be certain, including limited
CPU saturation testing. So, a wish: that the test machines include
limited CPU saturation runs, e.g. 100%, 50%, 31.25%, and 25%.
The 25% case is helpful when a test runs 2x the given parameter,
keeping things at/below the number of physical cores.

Thank you for your efforts with EEVDF.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mario Roy 1 week, 3 days ago
Peter, thank you for your fix to improve EEVDF.

Cc'd Andrea Righi
Thank you for the is_idle_core() function and help. [0]

Cc'd Shubhang Kaushik
Your patch inspired me to perform trial-and-error testing, which has
now become the 0280 patch in the CachyMod GitHub repo. [0]

Together with the help of CachyOS community members, we concluded that
prefcore + prefer-idle-core is surreal. I enjoy the EEVDF scheduler a
lot more now, since it favors the SMT siblings less.

For comparison, I added results for sched-ext cosmos.

Limited CPU saturation can be revealing of potential scheduler issues.
Testing includes 100%, 50%, 31.25%, and 25% CPU saturation.
All kernels built with GCC to factor out CLANG/AutoFDO.

A) 6.18.8-rc1
    with sched/fair: Proportional newidle balance

                     48cpus(100%)  24cpus(50%)  15cpus(31.25%)  12cpus(25%)
    algorithm3 [1]       9.462s      14.181s        20.311s       24.498s
    darktable  [2]       2.811s       3.715s         5.315s        6.434s
    easywave   [3]      19.747s      10.804s        20.207s       21.571s
    stress-ng  [4]     37632.06     56220.21       41694.50      34740.58

B) 6.18.8-rc1
    Peter Z's fix for sched/fair: Proportional newidle balance

                     48cpus(100%)  24cpus(50%)  15cpus(31.25%)  12cpus(25%)
    algorithm3 [1]       9.340s      14.733s        21.339s       25.069s
    darktable  [2]       2.493s       3.616s         5.148s        5.968s
    easywave   [3]      11.357s      13.312s *      18.483s       20.741s
    stress-ng  [4]     37533.24     55419.85       39452.17      32217.55

    algorithm3 and stress-ng regressed, possibly the limited CPU saturation anomaly
    easywave (*) weird result; repeatable but all over the place

C) 6.18.8-rc1
    Revert sched/fair: Proportional newidle balance

                     48cpus(100%)  24cpus(50%)  15cpus(31.25%)  12cpus(25%)
    algorithm3 [1]       9.286s      15.101s        21.417s       25.126s
    darktable  [2]       2.484s       3.531s         5.185s        6.002s
    easywave   [3]      11.517s      12.300s        18.466s       20.428s
    stress-ng  [4]     42231.92     47306.18 *     32438.03 *    28820.83 *

    stress-ng (*) lackluster with limited CPU saturation

D) 6.18.8-rc1
    Revert sched/fair: Proportional newidle balance
    Plus apply the prefer-idle-core patch [0]

                     48cpus(100%)  24cpus(50%)  15cpus(31.25%)  12cpus(25%)
    algorithm3 [1]       9.312s      11.292s        17.243s       21.811s
    darktable  [2]       2.418s       3.711s *       5.499s *      6.510s *
    easywave   [3]      10.035s       9.832s        15.738s       18.805s
    stress-ng  [4]     44837.41     63364.56       55646.26      48202.58

    darktable (*) lower performance with limited CPU saturation;
    noticeably better performance otherwise

E) scx_cosmos -m 0-5 -s 800 -l 8000 -f -c 1 -p 0 [5]

                     48cpus(100%)  24cpus(50%)  15cpus(31.25%)  12cpus(25%)
    algorithm3 [1]       9.218s      11.188s        17.045s       21.130s
    darktable  [2]       2.365s       3.900s         4.626s        5.664s
    easywave   [3]       9.187s      16.528s *      15.933s       16.991s
    stress-ng  [4]     21065.70     36417.65       27185.95      23141.87

    easywave (*) sched-ext cosmos appears to favor SMT siblings

---
[0] https://github.com/marioroy/cachymod
     the prefer-idle-core patch is 0280-prefer-prevcpu-for-wakeup.patch
     it is more about mindfulness for limited CPU saturation than about getting the patch accepted
     the surreal combination is prefcore + prefer-idle-core, improving many workloads

[1] https://github.com/marioroy/mce-sandbox
     ./algorithm3.pl 1e12 --threads=N
     algorithm3.pl is akin to a server/client application; chatty
     primesieve.pl is more CPU-bound; less chatty
     optionally, compare with the primesieve binary (fully CPU-bound, not chatty)
     https://github.com/kimwalisch/primesieve

[2] https://math.dartmouth.edu/~sarunas/darktable_bench.html
     OMP_NUM_THREADS=N darktable-cli setubal.orf setubal.orf.xmp test.jpg \
     --core --disable-opencl -d perf
     result: pixel pipeline processing took {...} secs

[3] https://openbenchmarking.org/test/pts/easywave
     OMP_NUM_THREADS=N ./src/easywave \
     -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
     -time 600
     result: Model time = 10:00:00,   elapsed: {...} msec

[4] https://openbenchmarking.org/test/pts/stress-ng
     stress-ng -t 30 --metrics-brief --sock N --no-rand-seed --sock-zerocopy
     result: bogo ops  real time  usr time  sys time  bogo ops/s    bogo ops/s
                        (secs)     (secs)    (secs)   (real time)   (usr+sys time)
                                                        {...}
     this involves 2x NCPUs due to { writer, reader } threads per sock
     hence also adding the 12cpus result (12 x 2 = 24 <= 50% saturation)

[5] https://github.com/sched-ext/scx
     cargo build --release -p scx_cosmos

On 1/27/26 10:17 AM, Peter Zijlstra wrote:
> On Tue, Jan 27, 2026 at 11:40:41AM +0100, Peter Zijlstra wrote:
>> On Fri, Jan 23, 2026 at 12:03:06PM +0100, Peter Zijlstra wrote:
>>> On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
>>>> On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
>>>>> The patch "Proportional newidle balance" introduced a regression
>>>>> with Linux 6.12.65 and 6.18.5. There is noticeable regression with
>>>>> easyWave testing. [1]
>>>>>
>>>>> The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
>>>>> to install easyWave [2]. That is fetching the two tar.gz archives.
>>>> What is the actual configuration of that chip? Is it like 3*8 or 4*6
>>>> (CCX wise). A quick google couldn't find me the answer :/
>>> Obviously I found it right after sending this. It's a 4x6 config.
>>> Meaning it needs newidle to balance between those 4 domains.
>> So with the below patch on top of my Xeon w7-2495X (which is 24-core
>> 48-thread) I too have 4 LLC :-)
>>
>> And I think I can see a slight difference, but nowhere near as terrible.
>>
>> Let me go stick some tracing on.
> Does this help some?
>
> Turns out, this easywave thing has a very low newidle rate, but then
> also a fairly low success rate. But since it doesn't do it that often,
> the cost isn't that significant, so we might as well always do it.
>
> This adds a second term to the ratio computation that takes time into
> account. For low-rate newidle this term will dominate, while at higher
> rates the success ratio is more important.
>
> Chris, afaict this still DTRT for schbench, but if this works for Mario,
> could you also re-run things at your end?
>
> [ the 4 'second' thing is a bit random, but looking at the timings
>    between easywave and schbench this seems to be a reasonable middle
>    ground. Although I think 8 'seconds' -- 23 shift -- would also work.
>
>    That would give:
>
>      1024 -  8  s -   64 Hz
>       512 -  4  s -  128 Hz
>       256 -  2  s -  256 Hz
>       128 -  1  s -  512 Hz
>        64 - .5  s - 1024 Hz
>        32 - .25 s - 2048 Hz
> ]
>
> ---
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 45c0022b91ce..a1e1032426dc 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -95,6 +95,7 @@ struct sched_domain {
>   	unsigned int newidle_call;
>   	unsigned int newidle_success;
>   	unsigned int newidle_ratio;
> +	u64 newidle_stamp;
>   	u64 max_newidle_lb_cost;
>   	unsigned long last_decay_max_lb_cost;
>   
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index eca642295c4b..ab9cf06c6a76 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12224,8 +12224,31 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
>   	sd->newidle_call++;
>   	sd->newidle_success += success;
>   
>   	if (sd->newidle_call >= 1024) {
> -		sd->newidle_ratio = sd->newidle_success;
> +		u64 now = sched_clock();
> +		s64 delta = now - sd->newidle_stamp;
> +		sd->newidle_stamp = now;
> +		int ratio = 0;
> +
> +		if (delta < 0)
> +			delta = 0;
> +
> +		if (sched_feat(NI_RATE)) {
> +			/*
> +			 * ratio  delta   freq
> +			 *
> +			 * 1024 -  4  s -  128 Hz
> +			 *  512 -  2  s -  256 Hz
> +			 *  256 -  1  s -  512 Hz
> +			 *  128 - .5  s - 1024 Hz
> +			 *   64 - .25 s - 2048 Hz
> +			 */
> +			ratio = delta >> 22;
> +		}
> +
> +		ratio += sd->newidle_success;
> +
> +		sd->newidle_ratio = min(1024, ratio);
>   		sd->newidle_call /= 2;
>   		sd->newidle_success /= 2;
>   	}
> @@ -12932,7 +12959,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
>   		if (sd->flags & SD_BALANCE_NEWIDLE) {
>   			unsigned int weight = 1;
>   
> -			if (sched_feat(NI_RANDOM)) {
> +			if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
>   				/*
>   				 * Throw a 1k sided dice; and only run
>   				 * newidle_balance according to the success
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 980d92bab8ab..7aba7523c6c1 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -126,3 +126,4 @@ SCHED_FEAT(LATENCY_WARN, false)
>    * Do newidle balancing proportional to its success rate using randomization.
>    */
>   SCHED_FEAT(NI_RANDOM, true)
> +SCHED_FEAT(NI_RATE, true)
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index cf643a5ddedd..05741f18f334 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -4,6 +4,7 @@
>    */
>   
>   #include <linux/sched/isolation.h>
> +#include <linux/sched/clock.h>
>   #include <linux/bsearch.h>
>   #include "sched.h"
>   
> @@ -1637,6 +1638,7 @@ sd_init(struct sched_domain_topology_level *tl,
>   	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
>   	int sd_id, sd_weight, sd_flags = 0;
>   	struct cpumask *sd_span;
> +	u64 now = sched_clock();
>   
>   	sd_weight = cpumask_weight(tl->mask(tl, cpu));
>   
> @@ -1674,6 +1676,7 @@ sd_init(struct sched_domain_topology_level *tl,
>   		.newidle_call		= 512,
>   		.newidle_success	= 256,
>   		.newidle_ratio		= 512,
> +		.newidle_stamp		= now,
>   
>   		.max_newidle_lb_cost	= 0,
>   		.last_decay_max_lb_cost	= jiffies,
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Mario Roy 1 week, 6 days ago
I tried the Stress-NG socket activity test, plus the prefer-idle-core patch.
The patch is about mindfulness for limited CPU saturation testing.

AMD Ryzen Threadripper 9960X CPU (24/48)

                     Bogo operations/second, More is better
                 A        B        C        D        E        F
SocketAct    12128.7  13907.6  12377.7  10551.7  12158.7  11842.2
SocketAct24  64553.3  20072.0  67018.7  62182.3  18133.5  66756.6
SocketAct15  49206.3  22170.7  57038.7  44077.6  19884.1  56727.5
SocketAct10  35263.5  20140.3  40092.1  33040.3  19701.6  41346.3

The kernels are built with clang without LTO/AutoFDO

A. 6.19-rc7 next_buddy ena with sched/fair: Proportional newidle balance
B. 6.19-rc7 next_buddy ena without sched/fair: Proportional newidle balance
C. 6.19-rc7 next_buddy ena without sched regression; with prefer-idle-core

D. 6.19-rc7 next_buddy dis with sched/fair: Proportional newidle balance
E. 6.19-rc7 next_buddy dis without sched/fair: Proportional newidle balance
F. 6.19-rc7 next_buddy dis without sched regression; with prefer-idle-core

Without sched regression:
   this is without sched/fair: Proportional newidle balance

With prefer-idle-core:
https://github.com/marioroy/cachymod/blob/main/linux-cachymod-6.18/
   0280-prefer-prevcpu-for-wakeup.patch

Stress-NG 0.20.00: SocketAct, SocketAct24, SocketAct15, SocketAct10
   stress-ng -t 30 --metrics-brief --sock -1 --no-rand-seed --sock-zerocopy
   stress-ng -t 30 --metrics-brief --sock 24 --no-rand-seed --sock-zerocopy
   stress-ng -t 30 --metrics-brief --sock 15 --no-rand-seed --sock-zerocopy
   stress-ng -t 30 --metrics-brief --sock 10 --no-rand-seed --sock-zerocopy

Basically 100%, 50%, and 31.25%, times 2 (writer, reader).
I also ran --sock 10 because 10 x 2 is less than 50% (24 threads).

Linux 6.18.7 results: granted, both are built with LTO + AutoFDO profile

              CachyOS 6.18.7-2  CachyMod 6.18.7-2 [1]
SocketAct    40799.2           46784.3
SocketAct24  61057.6           71414.5
SocketAct15  45056.4           61772.3
SocketAct10  32691.6           44244.6

[1] https://github.com/marioroy/cachymod
     the sched regression reverted (0040 patch)
     prefer-idle-core (0280 patch)


On 1/23/26 6:03 AM, Peter Zijlstra wrote:
> On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
>> On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
>>> The patch "Proportional newidle balance" introduced a regression
>>> with Linux 6.12.65 and 6.18.5. There is noticeable regression with
>>> easyWave testing. [1]
>>>
>>> The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
>>> to install easyWave [2]. That is fetching the two tar.gz archives.
>> What is the actual configuration of that chip? Is it like 3*8 or 4*6
>> (CCX wise). A quick google couldn't find me the answer :/
> Obviously I found it right after sending this. It's a 4x6 config.
> Meaning it needs newidle to balance between those 4 domains.
>
> Prateek -- are you guys still considering that SIS_NODE thing? That
> worked really well for workstation chips, but there were some issues on
> Epyc or so.
>
>>> #!/bin/bash
>>> # CXXFLAGS="-O3 $CXXFLAGS" ./configure
>>> # make -j8
>>>
>>> trap 'rm -f *.ssh *.idx *.log *.sshmax *.time' EXIT
>>>
>>> OMP_NUM_THREADS=48 ./src/easywave \
>>>    -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
>>>    -time 1200
>>>
>>>
>>> Before results with CachyOS 6.12.63-2 and 6.18.3-2 kernels.
>> So the problem is that 6.12 -> 6.18 is an enormous amount of kernel
>> releases :/ This patch in particular was an effort to fix a regression
>> caused by:
>>
>>    155213a2aed4 ("sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails")
>>
>> I'm thinking that if you revert all 4 patches of this series your
>> performance will be even worse?
>>
>> Anyway, my guess is that somehow this benchmark likes doing newidle even
>> if it is often not successful. I'll see if I can reproduce this on one
>> of my machines, but that might take a little while.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by K Prateek Nayak 2 weeks, 2 days ago
Hello Peter,

On 1/23/2026 4:33 PM, Peter Zijlstra wrote:
> On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
>> On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
>>> The patch "Proportional newidle balance" introduced a regression
>>> with Linux 6.12.65 and 6.18.5. There is noticeable regression with
>>> easyWave testing. [1]
>>>
>>> The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
>>> to install easyWave [2]. That is fetching the two tar.gz archives.
>>
>> What is the actual configuration of that chip? Is it like 3*8 or 4*6
>> (CCX wise). A quick google couldn't find me the answer :/
> 
> Obviously I found it right after sending this. It's a 4x6 config.
> Meaning it needs newidle to balance between those 4 domains.
> 
> Prateek -- are you guys still considering that SIS_NODE thing? That
> worked really well for workstation chips, but there were some issues on
> Epyc or so.

SIS_NODE really turned out to be a trade-off between search
time and search opportunity, especially when the system was heavily
overloaded.

Let me rebase those old patches and give it a spin over the weekend
on a couple of those large machines (128C/256T and 192C/384T per
socket) to see the damage. I'll update here by Tuesday or post out
a series if I see the situation having changed on the recent
kernels - some benchmarks had a completely different bottleneck
there when we looked closer last.

> 
>>> #!/bin/bash
>>> # CXXFLAGS="-O3 $CXXFLAGS" ./configure
>>> # make -j8
>>>
>>> trap 'rm -f *.ssh *.idx *.log *.sshmax *.time' EXIT
>>>
>>> OMP_NUM_THREADS=48 ./src/easywave \
>>>   -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
>>>   -time 1200
>>>
>>>
>>> Before results with CachyOS 6.12.63-2 and 6.18.3-2 kernels.

I'll go look at the benchmark too to see if I can reproduce on my end
and get some stats for these too. Thanks for bringing it to my notice.

-- 
Thanks and Regards,
Prateek
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by K Prateek Nayak 1 week, 5 days ago
On 1/23/2026 5:54 PM, K Prateek Nayak wrote:
> Hello Peter,
> 
> On 1/23/2026 4:33 PM, Peter Zijlstra wrote:
>> On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
>>> On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
>>>> The patch "Proportional newidle balance" introduced a regression
>>>> with Linux 6.12.65 and 6.18.5. There is noticeable regression with
>>>> easyWave testing. [1]
>>>>
>>>> The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
>>>> to install easyWave [2]. That is fetching the two tar.gz archives.
>>>
>>> What is the actual configuration of that chip? Is it like 3*8 or 4*6
>>> (CCX wise). A quick google couldn't find me the answer :/
>>
>> Obviously I found it right after sending this. It's a 4x6 config.
>> Meaning it needs newidle to balance between those 4 domains.
>>
>> Prateek -- are you guys still considering that SIS_NODE thing? That
>> worked really well for workstation chips, but there were some issues on
>> Epyc or so.
> 
> SIS_NODE really turned out to be a trade-off between search
> time and search opportunity, especially when the system was heavily
> overloaded.
> 
> Let me rebase those old patches and give it a spin over the weekend
> on a couple of those large machines (128C/256T and 192C/384T per
> socket) to see the damage. I'll update here by Tuesday or post out
> a series if I see the situation having changed on the recent
> kernels - some benchmarks had a completely different bottleneck
> there when we looked closer last.

So these are the results on tip:sched/core merged onto tip:sched/urgent
with SIS_NODE and SIS_NODE + SIS_UTIL [1] on a 512-CPU machine
(2 sockets x 16 CCXs (LLCs) x 8C/16T Zen4c cores):

tl;dr

(*) Consistent regressions, even with the SIS_UTIL bailout on the higher
    domain; the benchmarks are mainly measuring tail latency or have a
    thundering-herd behavior that SIS_UTIL with the default imbalance_pct
    isn't able to fully adjust to.

(#) Data has run-to-run variance but is still worse on average.

Note: Although "new-schbench-wakeup-latency" shows a regression, the
baseline is a few us, so adding a couple more us appears as a
~20%-30% regression.

I'm still fighting dependency hell to get some of the longer-running
benchmarks going on this system, but I expect a few pct regressions
like last time [2].

System:

- 2 x 128C/256T Zen4c system with 16CCXs per socket
- Boost on
- C2 disabled
- Each socket is a NUMA node

Kernels:

tip: tip:sched/core at commit 377521af0341 ("sched: remove
     task_struct->faults_disabled_mapping") merged onto
     tip:sched/urgent at commit 15257cc2f905 ("sched/fair: Revert
     force wakeup preemption")

sis_node: tip + sis_node patch + cpumask_and() moved to after
          SIS_UTIL bailout [3]

sis_node: Tree from [1] based on tip:sched/core merged onto
          tip:sched/urgent

Full results:

  ==================================================================
  Test          : hackbench
  Units         : Normalized time in seconds
  Interpretation: Lower is better
  Statistic     : AMean
  ==================================================================
  Case:           tip[pct imp](CV)       sis-node[pct imp](CV)    sis-node-w-sis-util[pct imp](CV)
   1-groups     1.00 [ -0.00](11.61)     0.76 [ 24.30]( 4.43)     0.76 [ 24.05]( 2.93)
   2-groups     1.00 [ -0.00]( 9.73)     0.86 [ 14.22](17.59)     0.80 [ 19.85](15.31)
   4-groups     1.00 [ -0.00]( 5.88)     0.78 [ 21.87](11.93)     0.78 [ 21.64](14.33)
   8-groups     1.00 [ -0.00]( 2.93)     0.92 [  8.44]( 3.99)     0.92 [  7.79]( 4.04)
  16-groups     1.00 [ -0.00]( 1.77)     0.90 [ 10.47]( 5.61)     0.94 [  5.92]( 5.65)


  ==================================================================
  Test          : tbench
  Units         : Normalized throughput
  Interpretation: Higher is better
  Statistic     : AMean
  ==================================================================
  Clients:    tip[pct imp](CV)       sis-node[pct imp](CV)    sis-node-w-sis-util[pct imp](CV)
      1     1.00 [  0.00]( 0.20)     1.00 [ -0.07]( 0.16)     1.01 [  0.53]( 0.23)
      2     1.00 [  0.00]( 0.35)     1.00 [ -0.03]( 0.58)     1.00 [  0.12]( 0.20)
      4     1.00 [  0.00]( 0.09)     1.01 [  0.60]( 0.60)     1.00 [  0.16]( 0.15)
      8     1.00 [  0.00]( 0.63)     1.00 [ -0.35]( 0.53)     1.00 [  0.26]( 0.19)
     16     1.00 [  0.00]( 0.97)     1.00 [  0.33]( 0.30)     1.01 [  1.16]( 0.50)
     32     1.00 [  0.00]( 0.98)     1.02 [  1.54]( 0.91)     1.01 [  1.10]( 0.26)
     64     1.00 [  0.00]( 3.45)     1.02 [  1.88]( 0.48)     1.02 [  1.78]( 1.29)
    128     1.00 [  0.00]( 2.49)     1.00 [ -0.01]( 1.63)     0.99 [ -0.68]( 1.88)
    256     1.00 [  0.00]( 0.57)     1.01 [  0.73]( 0.45)     1.01 [  0.92]( 0.35)
    512     1.00 [  0.00]( 3.92)     0.51 [-48.55]( 0.11)     0.80 [-19.59]( 6.31)	(*)
   1024     1.00 [  0.00]( 0.10)     0.98 [ -2.11]( 0.09)     0.97 [ -3.29]( 0.28)
   2048     1.00 [  0.00]( 0.09)     0.98 [ -2.08]( 0.28)     0.99 [ -0.75]( 0.48)


  ==================================================================
  Test          : stream-10
  Units         : Normalized Bandwidth, MB/s
  Interpretation: Higher is better
  Statistic     : HMean
  ==================================================================
  Test:       tip[pct imp](CV)       sis-node[pct imp](CV)    sis-node-w-sis-util[pct imp](CV)
   Copy     1.00 [  0.00]( 0.31)     0.99 [ -0.70]( 0.57)     1.00 [ -0.09]( 1.44)
  Scale     1.00 [  0.00]( 0.38)     0.99 [ -1.00]( 0.49)     1.00 [  0.32]( 1.41)
    Add     1.00 [  0.00]( 0.31)     0.99 [ -0.95]( 0.63)     1.00 [  0.43]( 1.16)
  Triad     1.00 [  0.00]( 0.18)     0.99 [ -0.84]( 0.68)     1.00 [  0.16]( 1.12)


  ==================================================================
  Test          : stream-100
  Units         : Normalized Bandwidth, MB/s
  Interpretation: Higher is better
  Statistic     : HMean
  ==================================================================
  Test:       tip[pct imp](CV)       sis-node[pct imp](CV)    sis-node-w-sis-util[pct imp](CV)
   Copy     1.00 [  0.00]( 1.46)     1.00 [  0.39]( 1.57)     1.01 [  0.82]( 0.52)
  Scale     1.00 [  0.00]( 1.45)     1.00 [  0.49]( 1.37)     1.01 [  1.20]( 0.55)
    Add     1.00 [  0.00]( 1.09)     1.00 [  0.31]( 0.94)     1.01 [  0.79]( 0.35)
  Triad     1.00 [  0.00]( 1.06)     1.00 [  0.22]( 1.02)     1.01 [  0.56]( 0.19)


  ==================================================================
  Test          : netperf
  Units         : Normalized Througput
  Interpretation: Higher is better
  Statistic     : AMean
  ==================================================================
  Clients:         tip[pct imp](CV)       sis-node[pct imp](CV)    sis-node-w-sis-util[pct imp](CV)
   1-clients     1.00 [  0.00]( 0.27)     0.99 [ -0.82]( 0.26)     0.99 [ -0.78]( 0.16)
   2-clients     1.00 [  0.00]( 0.28)     0.99 [ -0.87]( 0.19)     1.00 [ -0.17]( 0.67)
   4-clients     1.00 [  0.00]( 0.38)     1.00 [ -0.47]( 0.33)     0.99 [ -0.53]( 0.31)
   8-clients     1.00 [  0.00]( 0.34)     0.99 [ -0.55]( 0.18)     1.00 [ -0.33]( 0.24)
  16-clients     1.00 [  0.00]( 0.30)     1.00 [ -0.39]( 0.23)     1.00 [ -0.19]( 0.26)
  32-clients     1.00 [  0.00]( 0.43)     1.00 [ -0.40]( 0.57)     1.00 [ -0.24]( 0.68)
  64-clients     1.00 [  0.00]( 0.82)     1.00 [ -0.12]( 0.45)     1.00 [ -0.14]( 0.70)
  128-clients    1.00 [  0.00]( 1.21)     1.00 [  0.10]( 1.28)     1.00 [  0.08]( 1.19)
  256-clients    1.00 [  0.00]( 1.38)     1.01 [  0.65]( 0.89)     1.00 [  0.34]( 0.89)
  512-clients    1.00 [  0.00]( 8.76)     0.47 [-52.76]( 1.64)     0.77 [-23.10](10.06)	(*)
  768-clients    1.00 [  0.00](34.29)     0.83 [-16.89](30.45)     0.98 [ -2.16](36.19)
  1024-clients   1.00 [  0.00](47.96)     0.91 [ -9.29](36.02)     0.98 [ -1.93](46.36)


  ==================================================================
  Test          : schbench
  Units         : Normalized 99th percentile latency in us
  Interpretation: Lower is better
  Statistic     : Median
  ==================================================================
  #workers: tip[pct imp](CV)       sis-node[pct imp](CV)    sis-node-w-sis-util[pct imp](CV)
    1     1.00 [ -0.00](14.20)     1.72 [-72.00](15.01)     0.88 [ 12.00]( 4.55)
    2     1.00 [ -0.00]( 1.68)     1.09 [ -8.82]( 6.96)     0.97 [  2.94]( 9.90)
    4     1.00 [ -0.00]( 4.45)     1.18 [-17.65]( 5.29)     1.03 [ -2.94]( 3.24)
    8     1.00 [ -0.00]( 2.44)     1.12 [-12.20]( 4.35)     1.02 [ -2.44]( 2.38)
   16     1.00 [ -0.00]( 0.00)     1.04 [ -3.64]( 1.75)     0.98 [  1.82]( 1.85)
   32     1.00 [ -0.00]( 2.87)     1.03 [ -2.53]( 2.80)     0.99 [  1.27]( 1.47)
   64     1.00 [ -0.00]( 3.17)     1.02 [ -1.57]( 5.72)     0.98 [  2.36]( 2.30)
  128     1.00 [ -0.00]( 2.95)     1.01 [ -1.35]( 3.03)     1.00 [ -0.00]( 1.13)
  256     1.00 [ -0.00]( 1.17)     0.99 [  1.23]( 1.75)     0.99 [  1.43]( 1.56)
  512     1.00 [ -0.00]( 4.54)     1.14 [-13.60]( 2.41)     0.97 [  2.50]( 0.42)
  768     1.00 [ -0.00]( 2.24)     1.27 [-27.44]( 3.18)     1.12 [-11.54]( 5.64)	(*)
  1024    1.00 [ -0.00]( 0.28)     1.14 [-14.20]( 0.56)     1.13 [-13.00]( 1.01)	(*)


  ==================================================================
  Test          : new-schbench-requests-per-second
  Units         : Normalized Requests per second
  Interpretation: Higher is better
  Statistic     : Median
  ==================================================================
  #workers: tip[pct imp](CV)       sis-node[pct imp](CV)    sis-node-w-sis-util[pct imp](CV)
    1     1.00 [  0.00]( 0.00)     1.00 [  0.00]( 0.00)     1.00 [  0.00]( 0.15)
    2     1.00 [  0.00]( 0.00)     1.00 [  0.00]( 0.00)     1.00 [  0.00]( 0.15)
    4     1.00 [  0.00]( 0.00)     1.00 [  0.00]( 0.00)     1.00 [  0.29]( 0.15)
    8     1.00 [  0.00]( 0.00)     1.00 [  0.00]( 0.00)     1.00 [  0.29]( 0.00)
   16     1.00 [  0.00]( 0.15)     1.00 [ -0.29]( 0.15)     1.00 [  0.00]( 0.00)
   32     1.00 [  0.00]( 0.15)     1.00 [ -0.29]( 0.00)     1.00 [  0.00]( 0.15)
   64     1.00 [  0.00]( 0.00)     1.00 [  0.00]( 0.00)     1.00 [  0.29]( 0.00)
  128     1.00 [  0.00]( 0.27)     1.00 [  0.00](18.48)     0.65 [-34.50](24.12)	(#)
  256     1.00 [  0.00]( 0.00)     0.99 [ -0.58]( 0.00)     0.99 [ -0.58]( 0.00)
  512     1.00 [  0.00]( 1.05)     1.00 [  0.00]( 0.20)     1.00 [  0.39]( 0.87)
  768     1.00 [  0.00]( 0.95)     0.98 [ -1.88]( 0.93)     0.99 [ -0.71]( 0.53)
  1024    1.00 [  0.00]( 0.49)     0.99 [ -0.81]( 0.57)     1.00 [  0.00]( 0.74)


  ==================================================================
  Test          : new-schbench-wakeup-latency
  Units         : Normalized 99th percentile latency in us
  Interpretation: Lower is better
  Statistic     : Median
  ==================================================================
  #workers: tip[pct imp](CV)       sis-node[pct imp](CV)    sis-node-w-sis-util[pct imp](CV)
    1     1.00 [ -0.00]( 6.74)     2.38 [-137.50](29.34)    1.75 [-75.00]( 9.53)
    2     1.00 [ -0.00](12.06)     1.27 [-27.27]( 9.53)     1.36 [-36.36]( 6.59)
    4     1.00 [ -0.00](11.71)     1.33 [-33.33]( 3.30)     1.33 [-33.33]( 3.16)
    8     1.00 [ -0.00]( 0.00)     1.27 [-27.27](12.69)     1.09 [ -9.09]( 4.43)
   16     1.00 [ -0.00]( 4.84)     1.09 [ -9.09]( 4.43)     1.18 [-18.18](10.79)
   32     1.00 [ -0.00]( 0.00)     1.00 [ -0.00]( 0.00)     1.10 [-10.00]( 4.56)
   64     1.00 [ -0.00](13.22)     1.00 [ -0.00]( 5.00)     1.00 [ -0.00]( 9.68)
  128     1.00 [ -0.00]( 8.13)     1.00 [ -0.00]( 8.85)     1.18 [-18.18](13.76)
  256     1.00 [ -0.00]( 2.97)     1.02 [ -1.94]( 3.80)     1.08 [ -7.77]( 7.13)
  512     1.00 [ -0.00]( 1.25)     1.00 [  0.37]( 0.68)     1.00 [ -0.37]( 1.81)
  768     1.00 [ -0.00]( 0.00)     1.00 [ -0.00]( 0.00)     1.00 [ -0.00]( 0.00)
  1024    1.00 [ -0.00]( 0.63)     1.00 [ -0.11]( 4.06)     1.00 [ -0.11]( 3.13)


  ==================================================================
  Test          : new-schbench-request-latency
  Units         : Normalized 99th percentile latency in us
  Interpretation: Lower is better
  Statistic     : Median
  ==================================================================
  #workers: tip[pct imp](CV)       sis-node[pct imp](CV)    sis-node-w-sis-util[pct imp](CV)
    1     1.00 [ -0.00]( 0.14)     1.00 [ -0.26]( 0.14)     1.00 [ -0.00]( 0.14)
    2     1.00 [ -0.00]( 0.14)     1.00 [ -0.26]( 0.00)     1.00 [ -0.00]( 0.14)
    4     1.00 [ -0.00]( 0.00)     1.00 [ -0.00]( 0.00)     1.00 [  0.26]( 0.14)
    8     1.00 [ -0.00]( 0.00)     1.00 [ -0.00]( 0.00)     1.00 [  0.26]( 0.14)
   16     1.00 [ -0.00]( 0.00)     1.00 [ -0.00]( 0.00)     1.01 [ -0.53]( 1.18)
   32     1.00 [ -0.00]( 0.54)     1.01 [ -1.05]( 0.59)     0.99 [  0.53]( 0.27)
   64     1.00 [ -0.00]( 0.00)     1.00 [  0.26]( 1.08)     1.00 [  0.26](31.75)
  128     1.00 [ -0.00]( 0.61)     1.00 [ -0.00]( 4.19)     1.10 [-10.22]( 4.79)	(#)
  256     1.00 [ -0.00]( 0.43)     1.01 [ -1.39]( 0.74)     1.02 [ -1.63]( 0.66)
  512     1.00 [ -0.00]( 3.32)     1.00 [  0.23]( 1.62)     1.04 [ -3.72]( 3.79)
  768     1.00 [ -0.00]( 0.88)     0.95 [  4.52]( 0.63)     0.98 [  1.94]( 0.54)
  1024    1.00 [ -0.00]( 1.01)     0.98 [  1.54]( 0.91)     1.00 [  0.17]( 0.31)


Let me go play around with imbalance_pct for SIS_UTIL at the PKG/NODE
domain to see if there is a sweet spot that keeps everything happy while
things are happier on average.

I doubt Meta's workload will be happy with more aggressive SIS_UTIL
limits, since data from David's SHARED_RUNQ series [4] showed that
this specific workload requires aggressive search + aggressive newidle balance.

References:

[1] https://github.com/kudureranganath/linux/commits/kudure/sched/sis_node/
[2] https://lore.kernel.org/all/3de5c24f-6437-f21b-ed61-76b86a199e8c@amd.com/
[3] https://github.com/kudureranganath/linux/commit/7639cf7632853b91e6a5b449eee08d3399b10d31
[4] https://lore.kernel.org/lkml/20230809221218.163894-1-void@manifault.com/

-- 
Thanks and Regards,
Prateek
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Shrikanth Hegde 2 months, 4 weeks ago

On 11/7/25 9:36 PM, Peter Zijlstra wrote:
> Add a randomized algorithm that runs newidle balancing proportional to
> its success rate.
> 
> This improves schbench significantly:
> 
>   6.18-rc4:			2.22 Mrps/s
>   6.18-rc4+revert:		2.04 Mrps/s
>   6.18-rc4+revert+random:	2.18 Mrps/S
> 

Could you please share the schbench command?

I see a command like "schbench -t 90 -r 30 -i 30" regressing on a 60-core system.
Will do more iterations to confirm (to be sure it is not run-to-run variation).

> Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
> 
>   6.17:			-6%
>   6.17+revert:		 0%
>   6.17+revert+random:	-1%
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   include/linux/sched/topology.h |    3 ++
>   kernel/sched/core.c            |    3 ++
>   kernel/sched/fair.c            |   43 +++++++++++++++++++++++++++++++++++++----
>   kernel/sched/features.h        |    5 ++++
>   kernel/sched/sched.h           |    7 ++++++
>   kernel/sched/topology.c        |    6 +++++
>   6 files changed, 63 insertions(+), 4 deletions(-)
> 
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -92,6 +92,9 @@ struct sched_domain {
>   	unsigned int nr_balance_failed; /* initialise to 0 */
>   
>   	/* idle_balance() stats */
> +	unsigned int newidle_call;
> +	unsigned int newidle_success;
> +	unsigned int newidle_ratio;
>   	u64 max_newidle_lb_cost;
>   	unsigned long last_decay_max_lb_cost;
>   
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_updat
>   EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
>   
>   DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
>   
>   #ifdef CONFIG_SCHED_PROXY_EXEC
>   DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
> @@ -8589,6 +8590,8 @@ void __init sched_init_smp(void)
>   {
>   	sched_init_numa(NUMA_NO_NODE);
>   
> +	prandom_init_once(&sched_rnd_state);
> +
>   	/*
>   	 * There's no userspace yet to cause hotplug operations; hence all the
>   	 * CPU masks are stable and all blatant races in the below code cannot
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12146,11 +12146,26 @@ void update_max_interval(void)
>   	max_load_balance_interval = HZ*num_online_cpus()/10;
>   }
>   
> -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
> +static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
> +{
> +	sd->newidle_call++;
> +	sd->newidle_success += success;
> +
> +	if (sd->newidle_call >= 1024) {
> +		sd->newidle_ratio = sd->newidle_success;
> +		sd->newidle_call /= 2;
> +		sd->newidle_success /= 2;
> +	}

Would it be better to use >> 1, or does the compiler take care of it?

> +}
> +
> +static inline bool
> +update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
>   {
>   	unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
>   	unsigned long now = jiffies;
>   
> +	update_newidle_stats(sd, success);
> +
>   	if (cost > sd->max_newidle_lb_cost) {
>   		/*
>   		 * Track max cost of a domain to make sure to not delay the
> @@ -12198,7 +12213,7 @@ static void sched_balance_domains(struct
>   		 * Decay the newidle max times here because this is a regular
>   		 * visit to all the domains.
>   		 */
> -		need_decay = update_newidle_cost(sd, 0);
> +		need_decay = update_newidle_cost(sd, 0, 0);
>   		max_cost += sd->max_newidle_lb_cost;
>   
>   		/*
> @@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
>   			break;
>   
>   		if (sd->flags & SD_BALANCE_NEWIDLE) {
> +			unsigned int weight = 1;
> +
> +			if (sched_feat(NI_RANDOM)) {
> +				/*
> +				 * Throw a 1k sided dice; and only run
> +				 * newidle_balance according to the success
> +				 * rate.
> +				 */
> +				u32 d1k = sched_rng() % 1024;
> +				weight = 1 + sd->newidle_ratio;
> +				if (d1k > weight) {
> +					update_newidle_stats(sd, 0);
> +					continue;
> +				}
> +				weight = (1024 + weight/2) / weight;
> +			}
>   
>   			pulled_task = sched_balance_rq(this_cpu, this_rq,
>   						   sd, CPU_NEWLY_IDLE,
> @@ -12850,10 +12881,14 @@ static int sched_balance_newidle(struct
>   
>   			t1 = sched_clock_cpu(this_cpu);
>   			domain_cost = t1 - t0;
> -			update_newidle_cost(sd, domain_cost);
> -
>   			curr_cost += domain_cost;
>   			t0 = t1;
> +
> +			/*
> +			 * Track max cost of a domain to make sure to not delay the
> +			 * next wakeup on the CPU.
> +			 */
> +			update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
>   		}
>   
>   		/*
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
>   SCHED_FEAT(UTIL_EST, true)
>   
>   SCHED_FEAT(LATENCY_WARN, false)
> +
> +/*
> + * Do newidle balancing proportional to its success rate using randomization.
> + */
> +SCHED_FEAT(NI_RANDOM, true)
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -5,6 +5,7 @@
>   #ifndef _KERNEL_SCHED_SCHED_H
>   #define _KERNEL_SCHED_SCHED_H
>   
> +#include <linux/prandom.h>
>   #include <linux/sched/affinity.h>
>   #include <linux/sched/autogroup.h>
>   #include <linux/sched/cpufreq.h>
> @@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled
>   }
>   
>   DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
> +
> +static inline u32 sched_rng(void)
> +{
> +	return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
> +}
>   
>   #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
>   #define this_rq()		this_cpu_ptr(&runqueues)
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1662,6 +1662,12 @@ sd_init(struct sched_domain_topology_lev
>   
>   		.last_balance		= jiffies,
>   		.balance_interval	= sd_weight,
> +
> +		/* 50% success rate */
> +		.newidle_call		= 512,
> +		.newidle_success	= 256,
> +		.newidle_ratio		= 512,
> +
>   		.max_newidle_lb_cost	= 0,
>   		.last_decay_max_lb_cost	= jiffies,
>   		.child			= child,
> 
> 


Ran hackbench with it; looks like hackbench does better when utilization is very high.
Otherwise, it regresses slightly.

I compared the series applied vs. commit 65177ea9f64d. Let me know if I need to set anything different.
Will do numbers with more loops/iterations to iron out any run-to-run variations.
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 2 months, 3 weeks ago
On Wed, Nov 12, 2025 at 09:12:57PM +0530, Shrikanth Hegde wrote:
> 
> 
> On 11/7/25 9:36 PM, Peter Zijlstra wrote:
> > Add a randomized algorithm that runs newidle balancing proportional to
> > its success rate.
> > 
> > This improves schbench significantly:
> > 
> >   6.18-rc4:			2.22 Mrps/s
> >   6.18-rc4+revert:		2.04 Mrps/s
> >   6.18-rc4+revert+random:	2.18 Mrps/S
> > 
> 
> Could you please share the schbench command?
> 
> I see command like "schbench -t 90 -r 30 -i 30" running on 60 core regress.
> Will do more iterations to confirm it (to be sure it is not run/run variation)

This was:

 schbench -L -m 4 -M auto -t 256 -n 0 -r 60 -s 0

from the original thread:

  https://lkml.kernel.org/r/20250626144017.1510594-2-clm@fb.com

> > +	if (sd->newidle_call >= 1024) {
> > +		sd->newidle_ratio = sd->newidle_success;
> > +		sd->newidle_call /= 2;
> > +		sd->newidle_success /= 2;
> > +	}
> 
> Would it be better to use >> 1, or does the compiler take care of it?

I would be very disappointed if our compilers don't do this.
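
[ For illustration, a minimal check to feed a compiler -- assuming
  unsigned operands, which newidle_call and newidle_success are: recent
  GCC and Clang at -O1 lower both functions below to the same single
  right shift. It is signed division by two that needs extra fixup
  instructions, and that is not the case here.

	/* both halve an unsigned counter; compilers emit one shift for each */
	unsigned int half_div(unsigned int x)   { return x / 2;  }
	unsigned int half_shift(unsigned int x) { return x >> 1; }
]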
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Adam Li 2 months, 4 weeks ago
On 11/8/2025 12:06 AM, Peter Zijlstra wrote:
> Add a randomized algorithm that runs newidle balancing proportional to
> its success rate.
> 
> This improves schbench significantly:
> 
>  6.18-rc4:			2.22 Mrps/s
>  6.18-rc4+revert:		2.04 Mrps/s
>  6.18-rc4+revert+random:	2.18 Mrps/S
> 
> Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
> 
>  6.17:			-6%
>  6.17+revert:		 0%
>  6.17+revert+random:	-1%
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Tested-by: Adam Li <adamli@os.amperecomputing.com>

Please see the SpecJBB test results on the AmpereOne server below:
6.18-rc5:			0% (baseline)
6.18-rc5+patchset:		+5%
6.18-rc4+patchset+NO_NI_RANDOM:	+6%
6.18-rc5+revert-155213a2aed4:	+6%

Could you please explain a little of the math behind the success rate
(sd->newidle_ratio) calculation?

[...]
> @@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
>  			break;
>  
>  		if (sd->flags & SD_BALANCE_NEWIDLE) {
> +			unsigned int weight = 1;
> +
> +			if (sched_feat(NI_RANDOM)) {
> +				/*
> +				 * Throw a 1k sided dice; and only run
> +				 * newidle_balance according to the success
> +				 * rate.
> +				 */
> +				u32 d1k = sched_rng() % 1024;
> +				weight = 1 + sd->newidle_ratio;
> +				if (d1k > weight) {
> +					update_newidle_stats(sd, 0);
> +					continue;
> +				}
> +				weight = (1024 + weight/2) / weight;
> +			}
>  
e.g: Why 'weight = (1024 + weight/2) / weight'


Thanks,
-adam
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 2 months, 4 weeks ago
On Tue, Nov 11, 2025 at 05:07:45PM +0800, Adam Li wrote:
> > @@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
> >  			break;
> >  
> >  		if (sd->flags & SD_BALANCE_NEWIDLE) {
> > +			unsigned int weight = 1;
> > +
> > +			if (sched_feat(NI_RANDOM)) {
> > +				/*
> > +				 * Throw a 1k sided dice; and only run
> > +				 * newidle_balance according to the success
> > +				 * rate.
> > +				 */
> > +				u32 d1k = sched_rng() % 1024;
> > +				weight = 1 + sd->newidle_ratio;
> > +				if (d1k > weight) {
> > +					update_newidle_stats(sd, 0);
> > +					continue;
> > +				}
> > +				weight = (1024 + weight/2) / weight;
> > +			}
> >  
> e.g: Why 'weight = (1024 + weight/2) / weight'

Not sure what you're asking, so two answers:

That's a rounding divide. We have a helper for that, but I never can
remember what it's called.

The transformation as a whole here is from a ratio to a weight: suppose
our ratio is 256; this means we do 1-in-4 or 25% of the balance
calls. However, this also means that each success needs to be weighted as
4 (=1024/256), otherwise we under-account the successes and not even a
100% success rate can lift you out of the hole.

Now, I made it a rounding divide to make it a little easier to climb out
of said hole (I even considered ceiling divide).
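
[ A worked example of the above, as a userspace sketch -- local names,
  not kernel API; the rounding-divide helper alluded to is presumably
  DIV_ROUND_CLOSEST(). With newidle_ratio = 256 the dice passes for
  roughly 1-in-4 calls, and each success then earns
  (1024 + 257/2) / 257 = 4 credits:

	#include <stdio.h>

	/*
	 * ratio stands in for sd->newidle_ratio (0..1024); the result
	 * is the credit one successful balance adds to newidle_success.
	 */
	static unsigned int success_weight(unsigned int ratio)
	{
		unsigned int w = 1 + ratio;	/* dice threshold, never zero */

		return (1024 + w / 2) / w;	/* rounding divide */
	}

	int main(void)
	{
		printf("%u\n", success_weight(256));	/*    4: ~1-in-4 calls, 4x credit */
		printf("%u\n", success_weight(1023));	/*    1: always balance, 1x credit */
		printf("%u\n", success_weight(0));	/* 1024: full credit for a rare success */
		return 0;
	}
]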
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Adam Li 2 months, 4 weeks ago
On 11/11/2025 5:20 PM, Peter Zijlstra wrote:
> On Tue, Nov 11, 2025 at 05:07:45PM +0800, Adam Li wrote:
>>> @@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
>>>  			break;
>>>  
>>>  		if (sd->flags & SD_BALANCE_NEWIDLE) {
>>> +			unsigned int weight = 1;
>>> +
>>> +			if (sched_feat(NI_RANDOM)) {
>>> +				/*
>>> +				 * Throw a 1k sided dice; and only run
>>> +				 * newidle_balance according to the success
>>> +				 * rate.
>>> +				 */
>>> +				u32 d1k = sched_rng() % 1024;
>>> +				weight = 1 + sd->newidle_ratio;
>>> +				if (d1k > weight) {
>>> +					update_newidle_stats(sd, 0);
>>> +					continue;
>>> +				}
>>> +				weight = (1024 + weight/2) / weight;
>>> +			}
>>>  
>> e.g.: Why 'weight = (1024 + weight/2) / weight'?
> 
> Not sure what you're asking, so two answers:
> 
> That's a rounding divide. We have a helper for that, but I never can
> remember what it's called.
> 
> The transformation as a whole here is from a ratio to a weight: suppose
> our ratio is 256; this means that we do 1-in-4, or 25%, of the balance
> calls. However, this also means that each success needs to be weighted as
> 4 (=1024/256), otherwise we under-account the successes and not even a
> 100% success rate can lift you out of the hole.
> 
> Now, I made it a rounding divide to make it a little easier to climb out
> of said hole (I even considered a ceiling divide).
> 
> 
Thanks for the clarification.

If I understand correctly, (sd->newidle_ratio / 1024) is close to
(sd->newidle_success / sd->newidle_call), i.e. 'sd->newidle_ratio'
represents the success rate of newidle balance.
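
[Editorial sketch, not part of the thread: a userspace restatement of the
estimator, using the field names and the 1024-call window from the patch.
Once 1024 calls have been observed, newidle_ratio snapshots the success
count and both counters are halved, so each older window's contribution
decays geometrically:]

	struct ni_stats { unsigned int call, success, ratio; };

	static void ni_update(struct ni_stats *s, unsigned int success)
	{
		s->call++;
		s->success += success;
		if (s->call >= 1024) {
			/*
			 * ratio is successes per ~1024 calls; in steady
			 * state ratio/1024 tracks success/call at
			 * snapshot time.
			 */
			s->ratio = s->success;
			s->call /= 2;
			s->success /= 2;
		}
	}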

Shall we update the newidle stats only from sched_balance_newidle(), as in
the patch below, so that sched_balance_domains() will not update
sd->newidle_call?

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12171,7 +12171,8 @@ update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
        unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
        unsigned long now = jiffies;

-       update_newidle_stats(sd, success);
+       if (cost)
+               update_newidle_stats(sd, success);

        if (cost > sd->max_newidle_lb_cost) {
                /*
 
I tested this change; SpecJBB performance is similar to that with your patch.

Thanks,
-adam
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Peter Zijlstra 2 months, 4 weeks ago
On Wed, Nov 12, 2025 at 08:04:05PM +0800, Adam Li wrote:
> On 11/11/2025 5:20 PM, Peter Zijlstra wrote:
> > On Tue, Nov 11, 2025 at 05:07:45PM +0800, Adam Li wrote:
> >>> @@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
> >>>  			break;
> >>>  
> >>>  		if (sd->flags & SD_BALANCE_NEWIDLE) {
> >>> +			unsigned int weight = 1;
> >>> +
> >>> +			if (sched_feat(NI_RANDOM)) {
> >>> +				/*
> >>> +				 * Throw a 1k sided dice; and only run
> >>> +				 * newidle_balance according to the success
> >>> +				 * rate.
> >>> +				 */
> >>> +				u32 d1k = sched_rng() % 1024;
> >>> +				weight = 1 + sd->newidle_ratio;
> >>> +				if (d1k > weight) {
> >>> +					update_newidle_stats(sd, 0);
> >>> +					continue;
> >>> +				}
> >>> +				weight = (1024 + weight/2) / weight;
> >>> +			}
> >>>  
> >> e.g.: Why 'weight = (1024 + weight/2) / weight'?
> > 
> > Not sure what you're asking, so two answers:
> > 
> > That's a rounding divide. We have a helper for that, but I never can
> > remember what it's called.
> > 
> > The transformation as a whole here is from a ratio to a weight: suppose
> > our ratio is 256; this means that we do 1-in-4, or 25%, of the balance
> > calls. However, this also means that each success needs to be weighted as
> > 4 (=1024/256), otherwise we under-account the successes and not even a
> > 100% success rate can lift you out of the hole.
> > 
> > Now, I made it a rounding divide to make it a little easier to climb out
> > of said hole (I even considered a ceiling divide).
> > 
> > 
> Thanks for the clarification.
> 
> If I understand correctly, (sd->newidle_ratio / 1024) is close to
> (sd->newidle_success / sd->newidle_call), i.e. 'sd->newidle_ratio'
> represents the success rate of newidle balance.
> 
> Shall we update the newidle stats only from sched_balance_newidle(), as in
> the patch below, so that sched_balance_domains() will not update
> sd->newidle_call?
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12171,7 +12171,8 @@ update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
>         unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
>         unsigned long now = jiffies;
> 
> -       update_newidle_stats(sd, success);
> +       if (cost)
> +               update_newidle_stats(sd, success);
> 
>         if (cost > sd->max_newidle_lb_cost) {
>                 /*
>  
> I tested this change; SpecJBB performance is similar to that with your patch.

Ah yes, that makes sense. Let me make that change.

Thanks!
Re: [PATCH 4/4] sched/fair: Proportional newidle balance
Posted by Dietmar Eggemann 3 months ago
On 07.11.25 17:06, Peter Zijlstra wrote:
> Add a randomized algorithm that runs newidle balancing proportional to
> its success rate.
> 
> This improves schbench significantly:
> 
>  6.18-rc4:			2.22 Mrps/s
>  6.18-rc4+revert:		2.04 Mrps/s
>  6.18-rc4+revert+random:	2.18 Mrps/s
> 
> Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
> 
>  6.17:			-6%
>  6.17+revert:		 0%
>  6.17+revert+random:	-1%
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Results with OLTP 'hammerdb - mysqld' on Arm64 VMs:

				NOPM		P50 latency
6.18-rc4			baseline	baseline
6.18-rc4+revert-155213a2aed4	+13%		-8.8%
6.18-rc4+patchset		+11%		-8.2%
6.18-rc4+patchset+NO_NI_RANDOM	+13%		-8.6%

Pretty consistent with the results on the previous version, although I
hadn't tested NI_TARGET+NI_RANDOM back then.

http://lkml.kernel.org/r/f6379aa6-459d-4205-96ea-9848e55d7f9c@arm.com

In case (pure wakeup) schbench configs are the only workloads profiting
from NI_RANDOM, should NO_NI_RANDOM be made the default?

Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>

[...]
[tip: sched/core] sched/fair: Proportional newidle balance
Posted by tip-bot2 for Peter Zijlstra 2 months, 3 weeks ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     33cf66d88306663d16e4759e9d24766b0aaa2e17
Gitweb:        https://git.kernel.org/tip/33cf66d88306663d16e4759e9d24766b0aaa2e17
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 07 Nov 2025 17:01:31 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 17 Nov 2025 17:13:16 +01:00

sched/fair: Proportional newidle balance

Add a randomized algorithm that runs newidle balancing proportional to
its success rate.

This improves schbench significantly:

 6.18-rc4:			2.22 Mrps/s
 6.18-rc4+revert:		2.04 Mrps/s
 6.18-rc4+revert+random:	2.18 Mrps/s

Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:

 6.17:			-6%
 6.17+revert:		 0%
 6.17+revert+random:	-1%

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
---
 include/linux/sched/topology.h |  3 ++-
 kernel/sched/core.c            |  3 ++-
 kernel/sched/fair.c            | 44 ++++++++++++++++++++++++++++++---
 kernel/sched/features.h        |  5 ++++-
 kernel/sched/sched.h           |  7 +++++-
 kernel/sched/topology.c        |  6 +++++-
 6 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index bbcfdf1..45c0022 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -92,6 +92,9 @@ struct sched_domain {
 	unsigned int nr_balance_failed; /* initialise to 0 */
 
 	/* idle_balance() stats */
+	unsigned int newidle_call;
+	unsigned int newidle_success;
+	unsigned int newidle_ratio;
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 699db3f..9f10cfb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
 
 #ifdef CONFIG_SCHED_PROXY_EXEC
 DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8489,6 +8490,8 @@ void __init sched_init_smp(void)
 {
 	sched_init_numa(NUMA_NO_NODE);
 
+	prandom_init_once(&sched_rnd_state);
+
 	/*
 	 * There's no userspace yet to cause hotplug operations; hence all the
 	 * CPU masks are stable and all blatant races in the below code cannot
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index abcbb67..1855975 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12224,11 +12224,27 @@ void update_max_interval(void)
 	max_load_balance_interval = HZ*num_online_cpus()/10;
 }
 
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+	sd->newidle_call++;
+	sd->newidle_success += success;
+
+	if (sd->newidle_call >= 1024) {
+		sd->newidle_ratio = sd->newidle_success;
+		sd->newidle_call /= 2;
+		sd->newidle_success /= 2;
+	}
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
 {
 	unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
 	unsigned long now = jiffies;
 
+	if (cost)
+		update_newidle_stats(sd, success);
+
 	if (cost > sd->max_newidle_lb_cost) {
 		/*
 		 * Track max cost of a domain to make sure to not delay the
@@ -12276,7 +12292,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains.
 		 */
-		need_decay = update_newidle_cost(sd, 0);
+		need_decay = update_newidle_cost(sd, 0, 0);
 		max_cost += sd->max_newidle_lb_cost;
 
 		/*
@@ -12912,6 +12928,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 			break;
 
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			unsigned int weight = 1;
+
+			if (sched_feat(NI_RANDOM)) {
+				/*
+				 * Throw a 1k sided dice; and only run
+				 * newidle_balance according to the success
+				 * rate.
+				 */
+				u32 d1k = sched_rng() % 1024;
+				weight = 1 + sd->newidle_ratio;
+				if (d1k > weight) {
+					update_newidle_stats(sd, 0);
+					continue;
+				}
+				weight = (1024 + weight/2) / weight;
+			}
 
 			pulled_task = sched_balance_rq(this_cpu, this_rq,
 						   sd, CPU_NEWLY_IDLE,
@@ -12919,10 +12951,14 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 
 			t1 = sched_clock_cpu(this_cpu);
 			domain_cost = t1 - t0;
-			update_newidle_cost(sd, domain_cost);
-
 			curr_cost += domain_cost;
 			t0 = t1;
+
+			/*
+			 * Track max cost of a domain to make sure to not delay the
+			 * next wakeup on the CPU.
+			 */
+			update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
 		}
 
 		/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 0607def..980d92b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
 SCHED_FEAT(UTIL_EST, true)
 
 SCHED_FEAT(LATENCY_WARN, false)
+
+/*
+ * Do newidle balancing proportional to its success rate using randomization.
+ */
+SCHED_FEAT(NI_RANDOM, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index def9ab7..b419a4d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
 #ifndef _KERNEL_SCHED_SCHED_H
 #define _KERNEL_SCHED_SCHED_H
 
+#include <linux/prandom.h>
 #include <linux/sched/affinity.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/cpufreq.h>
@@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p)
 }
 
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+	return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		this_cpu_ptr(&runqueues)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 711076a..cf643a5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1669,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl,
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
+
+		/* 50% success rate */
+		.newidle_call		= 512,
+		.newidle_success	= 256,
+		.newidle_ratio		= 512,
+
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,
 		.child			= child,
[tip: sched/core] sched/fair: Proportional newidle balance
Posted by tip-bot2 for Peter Zijlstra 2 months, 3 weeks ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     7c983640e4db0c1fd8ce6c6cd921c19954a8d479
Gitweb:        https://git.kernel.org/tip/7c983640e4db0c1fd8ce6c6cd921c19954a8d479
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 07 Nov 2025 17:01:31 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 14 Nov 2025 13:03:08 +01:00

sched/fair: Proportional newidle balance

Add a randomized algorithm that runs newidle balancing proportional to
its success rate.

This improves schbench significantly:

 6.18-rc4:			2.22 Mrps/s
 6.18-rc4+revert:		2.04 Mrps/s
 6.18-rc4+revert+random:	2.18 Mrps/s

Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:

 6.17:			-6%
 6.17+revert:		 0%
 6.17+revert+random:	-1%

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
---
 include/linux/sched/topology.h |  3 ++-
 kernel/sched/core.c            |  3 ++-
 kernel/sched/fair.c            | 44 ++++++++++++++++++++++++++++++---
 kernel/sched/features.h        |  5 ++++-
 kernel/sched/sched.h           |  7 +++++-
 kernel/sched/topology.c        |  6 +++++-
 6 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index bbcfdf1..45c0022 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -92,6 +92,9 @@ struct sched_domain {
 	unsigned int nr_balance_failed; /* initialise to 0 */
 
 	/* idle_balance() stats */
+	unsigned int newidle_call;
+	unsigned int newidle_success;
+	unsigned int newidle_ratio;
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 699db3f..9f10cfb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
 
 #ifdef CONFIG_SCHED_PROXY_EXEC
 DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8489,6 +8490,8 @@ void __init sched_init_smp(void)
 {
 	sched_init_numa(NUMA_NO_NODE);
 
+	prandom_init_once(&sched_rnd_state);
+
 	/*
 	 * There's no userspace yet to cause hotplug operations; hence all the
 	 * CPU masks are stable and all blatant races in the below code cannot
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 50461c9..aaa47ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12223,11 +12223,27 @@ void update_max_interval(void)
 	max_load_balance_interval = HZ*num_online_cpus()/10;
 }
 
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+	sd->newidle_call++;
+	sd->newidle_success += success;
+
+	if (sd->newidle_call >= 1024) {
+		sd->newidle_ratio = sd->newidle_success;
+		sd->newidle_call /= 2;
+		sd->newidle_success /= 2;
+	}
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
 {
 	unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
 	unsigned long now = jiffies;
 
+	if (cost)
+		update_newidle_stats(sd, success);
+
 	if (cost > sd->max_newidle_lb_cost) {
 		/*
 		 * Track max cost of a domain to make sure to not delay the
@@ -12275,7 +12291,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains.
 		 */
-		need_decay = update_newidle_cost(sd, 0);
+		need_decay = update_newidle_cost(sd, 0, 0);
 		max_cost += sd->max_newidle_lb_cost;
 
 		/*
@@ -12911,6 +12927,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 			break;
 
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			unsigned int weight = 1;
+
+			if (sched_feat(NI_RANDOM)) {
+				/*
+				 * Throw a 1k sided dice; and only run
+				 * newidle_balance according to the success
+				 * rate.
+				 */
+				u32 d1k = sched_rng() % 1024;
+				weight = 1 + sd->newidle_ratio;
+				if (d1k > weight) {
+					update_newidle_stats(sd, 0);
+					continue;
+				}
+				weight = (1024 + weight/2) / weight;
+			}
 
 			pulled_task = sched_balance_rq(this_cpu, this_rq,
 						   sd, CPU_NEWLY_IDLE,
@@ -12918,10 +12950,14 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 
 			t1 = sched_clock_cpu(this_cpu);
 			domain_cost = t1 - t0;
-			update_newidle_cost(sd, domain_cost);
-
 			curr_cost += domain_cost;
 			t0 = t1;
+
+			/*
+			 * Track max cost of a domain to make sure to not delay the
+			 * next wakeup on the CPU.
+			 */
+			update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
 		}
 
 		/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 0607def..980d92b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
 SCHED_FEAT(UTIL_EST, true)
 
 SCHED_FEAT(LATENCY_WARN, false)
+
+/*
+ * Do newidle balancing proportional to its success rate using randomization.
+ */
+SCHED_FEAT(NI_RANDOM, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index def9ab7..b419a4d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
 #ifndef _KERNEL_SCHED_SCHED_H
 #define _KERNEL_SCHED_SCHED_H
 
+#include <linux/prandom.h>
 #include <linux/sched/affinity.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/cpufreq.h>
@@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p)
 }
 
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+	return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		this_cpu_ptr(&runqueues)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 711076a..cf643a5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1669,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl,
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
+
+		/* 50% success rate */
+		.newidle_call		= 512,
+		.newidle_success	= 256,
+		.newidle_ratio		= 512,
+
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,
 		.child			= child,