Add a randomized algorithm that runs newidle balancing proportional to
its success rate.
This improves schbench significantly:
6.18-rc4: 2.22 Mrps/s
6.18-rc4+revert: 2.04 Mrps/s
6.18-rc4+revert+random: 2.18 Mrps/s
Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
6.17: -6%
6.17+revert: 0%
6.17+revert+random: -1%
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched/topology.h | 3 ++
kernel/sched/core.c | 3 ++
kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++----
kernel/sched/features.h | 5 ++++
kernel/sched/sched.h | 7 ++++++
kernel/sched/topology.c | 6 +++++
6 files changed, 63 insertions(+), 4 deletions(-)
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -92,6 +92,9 @@ struct sched_domain {
unsigned int nr_balance_failed; /* initialise to 0 */
/* idle_balance() stats */
+ unsigned int newidle_call;
+ unsigned int newidle_success;
+ unsigned int newidle_ratio;
u64 max_newidle_lb_cost;
unsigned long last_decay_max_lb_cost;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_updat
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
#ifdef CONFIG_SCHED_PROXY_EXEC
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8589,6 +8590,8 @@ void __init sched_init_smp(void)
{
sched_init_numa(NUMA_NO_NODE);
+ prandom_init_once(&sched_rnd_state);
+
/*
* There's no userspace yet to cause hotplug operations; hence all the
* CPU masks are stable and all blatant races in the below code cannot
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12146,11 +12146,26 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+ sd->newidle_call++;
+ sd->newidle_success += success;
+
+ if (sd->newidle_call >= 1024) {
+ sd->newidle_ratio = sd->newidle_success;
+ sd->newidle_call /= 2;
+ sd->newidle_success /= 2;
+ }
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
{
unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
unsigned long now = jiffies;
+ update_newidle_stats(sd, success);
+
if (cost > sd->max_newidle_lb_cost) {
/*
* Track max cost of a domain to make sure to not delay the
@@ -12198,7 +12213,7 @@ static void sched_balance_domains(struct
* Decay the newidle max times here because this is a regular
* visit to all the domains.
*/
- need_decay = update_newidle_cost(sd, 0);
+ need_decay = update_newidle_cost(sd, 0, 0);
max_cost += sd->max_newidle_lb_cost;
/*
@@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
break;
if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned int weight = 1;
+
+ if (sched_feat(NI_RANDOM)) {
+ /*
+ * Throw a 1k sided dice; and only run
+ * newidle_balance according to the success
+ * rate.
+ */
+ u32 d1k = sched_rng() % 1024;
+ weight = 1 + sd->newidle_ratio;
+ if (d1k > weight) {
+ update_newidle_stats(sd, 0);
+ continue;
+ }
+ weight = (1024 + weight/2) / weight;
+ }
pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
@@ -12850,10 +12881,14 @@ static int sched_balance_newidle(struct
t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
- update_newidle_cost(sd, domain_cost);
-
curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
}
/*
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(LATENCY_WARN, false)
+
+/*
+ * Do newidle balancing proportional to its success rate using randomization.
+ */
+SCHED_FEAT(NI_RANDOM, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
#ifndef _KERNEL_SCHED_SCHED_H
#define _KERNEL_SCHED_SCHED_H
+#include <linux/prandom.h>
#include <linux/sched/affinity.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/cpufreq.h>
@@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled
}
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+ return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() this_cpu_ptr(&runqueues)
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1662,6 +1662,12 @@ sd_init(struct sched_domain_topology_lev
.last_balance = jiffies,
.balance_interval = sd_weight,
+
+ /* 50% success rate */
+ .newidle_call = 512,
+ .newidle_success = 256,
+ .newidle_ratio = 512,
+
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
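
For readers skimming the diff: update_newidle_stats() keeps a sliding
success ratio out of 1024 (once 1024 calls accumulate, the success count
is published as the ratio and both counters are halved), the NI_RANDOM
dice roll then skips newidle balancing with probability proportional to
the observed failure rate, and the (1024 + weight/2) / weight step is a
rounded reciprocal that re-weights the successes that do get recorded,
so the skipped attempts don't bias the estimate. A standalone C sketch
of that bookkeeping, with rand() standing in for the kernel's per-CPU
prandom state (an illustration of the scheme, not kernel code):

#include <stdio.h>
#include <stdlib.h>

/* Userspace stand-ins for the sched_domain fields added by the patch. */
struct stats {
	unsigned int call;	/* sd->newidle_call */
	unsigned int success;	/* sd->newidle_success */
	unsigned int ratio;	/* sd->newidle_ratio: successes per 1024 calls */
};

/* Mirrors update_newidle_stats(): publish and halve at 1024 calls. */
static void update_stats(struct stats *s, unsigned int success)
{
	s->call++;
	s->success += success;
	if (s->call >= 1024) {
		s->ratio = s->success;
		s->call /= 2;
		s->success /= 2;
	}
}

/* Mirrors the NI_RANDOM block: balance with probability ~(1 + ratio)/1024;
 * when it does run, a success counts for ~1024/(1 + ratio) so the expected
 * contribution per call stays equal to the true success rate. */
static int maybe_balance(struct stats *s, int (*balance)(void))
{
	unsigned int weight = 1 + s->ratio;
	unsigned int d1k = rand() % 1024;

	if (d1k > weight) {
		update_stats(s, 0);	/* skipped: recorded as a failure */
		return 0;
	}
	weight = (1024 + weight / 2) / weight;	/* rounded reciprocal */

	int pulled = balance();
	update_stats(s, weight * !!pulled);
	return pulled;
}

static int fake_balance(void)
{
	return rand() % 4 == 0;	/* pretend ~25% of attempts pull a task */
}

int main(void)
{
	/* same 50% starting point as sd_init() in the patch */
	struct stats s = { .call = 512, .success = 256, .ratio = 512 };

	for (int i = 0; i < 1000000; i++)
		maybe_balance(&s, fake_balance);

	printf("published ratio: %u/1024\n", s.ratio);
	return 0;
}

With the fake 25% success rate, the published ratio should settle around
256/1024, i.e. the skip probability tracks how often balancing actually
pays off.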
The patch "Proportional newidle balance" introduced a regression
with Linux 6.12.65 and 6.18.5. There is noticeable regression with
easyWave testing. [1]
The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
to install easyWave [2]. That is fetching the two tar.gz archives.
#!/bin/bash
# CXXFLAGS="-O3 $CXXFLAGS" ./configure
# make -j8
trap 'rm -f *.ssh *.idx *.log *.sshmax *.time' EXIT
OMP_NUM_THREADS=48 ./src/easywave \
-grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
-time 1200
Before: results with the CachyOS 6.12.63-2 and 6.18.3-2 kernels.
easyWave ver.2013-04-11
Model time = 00:00:00, elapsed: 0 msec
Model time = 00:10:00, elapsed: 5 msec
Model time = 00:20:00, elapsed: 10 msec
Model time = 00:30:00, elapsed: 19 msec
...
Model time = 05:00:00, elapsed: 2908 msec
Model time = 05:10:00, elapsed: 3079 msec
Model time = 05:20:00, elapsed: 3307 msec
Model time = 05:30:00, elapsed: 3503 msec
...
After: results with the CachyOS 6.12.66-2 and 6.18.6-2 kernels.
easyWave ver.2013-04-11
Model time = 00:00:00, elapsed: 0 msec
Model time = 00:10:00, elapsed: 5 msec
Model time = 00:20:00, elapsed: 10 msec
Model time = 00:30:00, elapsed: 18 msec
...
Model time = 05:00:00, elapsed: 13057 msec (normal is < 3.0s)
Model time = 05:10:00, elapsed: 13512 msec
Model time = 05:20:00, elapsed: 13833 msec
Model time = 05:30:00, elapsed: 14206 msec
...
Reverting the patch "sched/fair: Proportional newidle balance" restores
the prior performance.

[1] https://openbenchmarking.org/test/pts/easywave
[2] https://openbenchmarking.org/innhold/da7f1cf159033fdfbb925102284aea8a83e8afdc
On 11/7/25 11:06 AM, Peter Zijlstra wrote:
> Add a randomized algorithm that runs newidle balancing proportional to
> its success rate.
> [...]
On 18/01/2026 20:46, Mario Roy wrote:
> The patch "Proportional newidle balance" introduced a regression with
> Linux 6.12.65 and 6.18.5. There is a noticeable regression in easyWave
> testing. [1]
> [...]
Hi All,

I can confirm that we are seeing a 4-11% performance regression in
v6.12.66 on multiple benchmarks running on c7a.4xlarge AWS EC2 instances,
which are powered by an AMD EPYC 9R14-series CPU (code-named Genoa), and
on c7i.4xlarge, which is powered by a 4th-Generation Intel Xeon Scalable
processor (code-named Sapphire Rapids). The regression is caused by
commit 33cf66d88306 ("sched/fair: Proportional newidle balance"). We were
able to reclaim the performance by reverting this commit. We also noticed
that the impact is higher on AMD than on Intel.
Benchmark Name | Description | Unit
postgresql | HammerDB workload (TPC-C-like benchmark) | NOPM
nginx_lb | Testing NGINX as a load balancer | RPS
memcached | Testing using Lancet load generator | QPS
**Results on v6.12.66**
Benchmark name | SUT EC2 Instance | Regression percentage
postgresql | c7a.4xlarge | -4.0%
postgresql | c7i.4xlarge | -4.0%
nginx_lb | c7a.4xlarge | -5.0%
memcached | c7a.4xlarge | -11.0%
We have also seen a smaller impact on v6.1.161, which also contains the mentioned commit.
**Results on v6.1.161**
Benchmark name | SUT EC2 Instance | Regression percentage
nginx_lb | c7a.4xlarge | -3.0%
nginx_lb | c7i.4xlarge | -4.0%
memcached | c7a.4xlarge | -5.0%
On Sun, Jan 25, 2026 at 12:22:21PM +0000, Mohamed Abuelfotoh, Hazem wrote:
> [...]
> **Results on v6.12.66**
>
> Benchmark name | SUT EC2 Instance | Regression percentage
> postgresql | c7a.4xlarge | -4.0%
> postgresql | c7i.4xlarge | -4.0%
> nginx_lb | c7a.4xlarge | -5.0%
> memcached | c7a.4xlarge | -11.0%
So only postgres has a regression on Intel? Memcached doesn't show
anything?
On 27/01/2026 08:50, Peter Zijlstra wrote:
> [...]
> So only postgres has a regression on Intel? Memcached doesn't show
> anything?
True, memcached performance on Intel is exactly the same with and
without commit 33cf66d88306 ("sched/fair: Proportional newidle
balance"). The memcached regression is only visible on AMD. I also
tested on arm64 VMs using Graviton3 (based on Arm Neoverse V1) and
Graviton4 (based on Arm Neoverse V2) and I don't see any memcached
regression there.
On Tue, Jan 27, 2026 at 09:50:25AM +0100, Peter Zijlstra wrote:
> [...]
> So only postgres has a regression on Intel? Memcached doesn't show
> anything?
And just to be sure, v6.12.43-v6.12.65 have no problem?
That is, afaict those are the kernels that have:
fc4289233e4b sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails
But not yet have:
1b9c118fe318 sched/fair: Proportional newidle balance
c6ae271bc5fd sched/fair: Small cleanup to update_newidle_cost()
52aa889c6f57 sched/fair: Small cleanup to sched_balance_newidle()
81343616e712 sched/fair: Revert max_newidle_lb_cost bump
Because fc4289233e4b was also causing a ton of regressions (but also
improving some workloads). 81343616e712 then reverts this and
1b9c118fe318 is supposed to be a compromise between these two.
So if your workloads are not affected by fc4289233e4b and 81343616e712,
but somehow 1b9c118fe318 is causing fail, then I'm a little puzzled.
On 27/01/2026 09:13, Peter Zijlstra wrote:
> [...]
> And just to be sure, v6.12.43-v6.12.65 have no problem?
>
> That is, afaict those are the kernels that have:
>
> fc4289233e4b sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails
>
> But not yet have:
>
> 1b9c118fe318 sched/fair: Proportional newidle balance
> c6ae271bc5fd sched/fair: Small cleanup to update_newidle_cost()
> 52aa889c6f57 sched/fair: Small cleanup to sched_balance_newidle()
> 81343616e712 sched/fair: Revert max_newidle_lb_cost bump
>
> Because fc4289233e4b was also causing a ton of regressions (but also
> improving some workloads). 81343616e712 then reverts this and
> 1b9c118fe318 is supposed to be a compromise between these two.
>
> So if your workloads are not affected by fc4289233e4b and 81343616e712,
> but somehow 1b9c118fe318 is causing fail, then I'm a little puzzled.
>
We have definitely seen a significant performance regression, specifically
on DB workloads, because of fc4289233e4b ("sched/fair: Bump
sd->max_newidle_lb_cost when newidle balance fails"), which we reported
in [1]. We were able to reclaim that performance with 81343616e712
("sched/fair: Revert max_newidle_lb_cost bump"), before we started seeing
the negative impact from 1b9c118fe318 ("sched/fair: Proportional newidle
balance").

[1] https://lore.kernel.org/all/006c9df2-b691-47f1-82e6-e233c3f91faf@oracle.com/T/#mb96105e4a320659b5aa68ec112bbeafaae37e769
On Sun, Jan 25, 2026 at 12:22:21PM +0000, Mohamed Abuelfotoh, Hazem wrote:

> I can confirm that we are seeing a 4-11% performance regression in v6.12.66
> on multiple benchmarks running on c7a.4xlarge AWS EC2 instances that are
> powered by AMD EPYC 9R14-series CPU (code-named Genoa) and c7i.4xlarge which
> is powered by 4th-Generation Intel Xeon Scalable processor (code-named
> Sapphire Rapids). [...]

For those not speaking Amazon; what actual system setup is that Xeon? Is
that single socket or multi-socket?

Going by the name, the 4x would suggest a quad-socket Xeon, which are
somewhat beastly, but if I google this 'c7i.4xlarge' identifier, I get a
puny single socket 16cpu thing.

What is it?
On 27/01/2026 08:44, Peter Zijlstra wrote:

> For those not speaking Amazon; what actual system setup is that Xeon? Is
> that single socket or multi-socket?
>
> Going by the name, the 4x would suggest a quad-socket Xeon, which are
> somewhat beastly, but if I google this 'c7i.4xlarge' identifier, I get a
> puny single socket 16cpu thing.
>
> What is it?

Hi Peter,

Apologies for the confusion, the "4x" is just Amazon naming for EC2
instance sizing, basically the number of CPUs, memory and network
bandwidth. The naming has nothing to do with the exact number of sockets
within the VM.

Below are the hardware specs for both c7i.4xlarge & c7a.4xlarge.

c7i.4xlarge

CPU Model: Intel(R) Xeon(R) Platinum 8488C
Number of CPUs: 16
Memory: 32 GB
Number of sockets: 1

-------------------------------------------------------------------------

c7a.4xlarge

CPU Model: AMD EPYC 9R14
Number of CPUs: 16
Memory: 32 GB
Number of sockets: 1
On Wed, Jan 28, 2026 at 03:48:13PM +0000, Mohamed Abuelfotoh, Hazem wrote:

> Below are the hardware specs for both c7i.4xlarge & c7a.4xlarge.
>
> c7i.4xlarge
>
> CPU Model: Intel(R) Xeon(R) Platinum 8488C
> Number of CPUs: 16
> Memory: 32 GB
> Number of sockets: 1

But the 8488C is a 56 core part, with 112 threads. So you're handing out
8 core partitions of that thing, for 7 such instances on one machine?

(Also, calling anything 16 core with 32GB 'large' is laughable, that's
laptop territory.)

> -------------------------------------------------------------------------
>
> c7a.4xlarge
>
> CPU Model: AMD EPYC 9R14
> Number of CPUs: 16
> Memory: 32 GB
> Number of sockets: 1

And that 9r14 is a 96 core part, 12 CCDs, 8 cores each. So you're again
handing out partitions of that.

For both cases, are these partitions fixed? Specifically in the AMD case,
are you handing out exactly 1 CCD per partition?

Because if so, I'm mighty confused by the results. 8 cores, 16 threads
is exactly one CCD worth of Zen4 and should therefore be a single L3 and
behave exactly like the Intel thing.

Something is missing here.
On 29/01/2026 09:19, Peter Zijlstra wrote:
> On Wed, Jan 28, 2026 at 03:48:13PM +0000, Mohamed Abuelfotoh, Hazem wrote:
>
>> Below are the hardware specs for both c7i.4xlarge & c7a.4xlarge.
>>
>> c7i.4xlarge
>>
>> CPU Model: Intel(R) Xeon(R) Platinum 8488C
>> Number of CPUs: 16
>> Memory: 32 GB
>> Number of sockets: 1
>
> But the 8488C is a 56 core part, with 112 threads. So you're handing out
> 8 core partitions of that thing, for 7 such instances on one machine?
>
c7i.4xlarge is an EC2 instance, which is basically a virtual machine
running on the Nitro KVM based hypervisor. The VM shares the host with
other VMs, which explains why Amazon doesn't allocate all the host CPU
resources to a single VM. There are larger EC2 instance sizes where a
single VM occupies the whole host, for example c7i.48xlarge, which has
192 vCPUs. Your conclusion is right: c7i.4xlarge has 8 physical cores
with HT enabled, which adds up to 16 vCPUs.
> [...]
> For both cases, are these partitions fixed? Specifically in the AMD case,
> are you handing out exactly 1 CCD per partition?
>
> Because if so, I'm mighty confused by the results. 8 cores, 16 threads
> is exactly one CCD worth of Zen4 and should therefore be a single L3 and
> behave exactly like the Intel thing.
>
> Something is missing here.
The main difference between the Intel based c7i.4xlarge and the AMD
based c7a.4xlarge is that on Intel we have HT enabled, so the instance
has 16 vCPUs which are really 8 physical cores with HT. On AMD the VM
comes with 16 physical cores and no HT, so it spans 2 CCDs (two L3
caches), while on Intel we have a single L3 cache. I am also adding the
output of lscpu on both instances to clarify the architectural
differences between the two.
**c7i.4xlarge**
# lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Address sizes: 46 bits physical, 48 bits virtual
Byte Order: Little Endian
CPU(s): 16
On-line CPU(s) list: 0-15
Vendor ID: GenuineIntel
BIOS Vendor ID: Intel(R) Corporation
Model name: Intel(R) Xeon(R) Platinum 8488C
BIOS Model name: Intel(R) Xeon(R) Platinum 8488C
CPU family: 6
Model: 143
Thread(s) per core: 2
Core(s) per socket: 8
Socket(s): 1
Stepping: 8
BogoMIPS: 4800.00
Flags: fpu vme de pse tsc msr pae mce cx8 apic
sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall
nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl xtopology
nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3
fma cx16 pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer
aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch
cpuid_fault invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase
tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx
smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl
xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 wbnoinvd ida arat
avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq
avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid cldemote movdiri
movdir64b md_clear serialize amx_bf16 avx512_fp16 amx_tile amx_int8
flush_l1d arch_capabilities
Virtualization features:
Hypervisor vendor: KVM
Virtualization type: full
Caches (sum of all):
L1d: 384 KiB (8 instances)
L1i: 256 KiB (8 instances)
L2: 16 MiB (8 instances)
L3: 105 MiB (1 instance)
NUMA:
NUMA node(s): 1
NUMA node0 CPU(s): 0-15
-------------------------------------------------------------------------
**c7a.4xlarge**
# lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Address sizes: 48 bits physical, 48 bits virtual
Byte Order: Little Endian
CPU(s): 16
On-line CPU(s) list: 0-15
Vendor ID: AuthenticAMD
BIOS Vendor ID: Advanced Micro Devices, Inc.
Model name: AMD EPYC 9R14
BIOS Model name: AMD EPYC 9R14
CPU family: 25
Model: 17
Thread(s) per core: 1
Core(s) per socket: 16
Socket(s): 1
Stepping: 1
BogoMIPS: 5199.99
Flags: fpu vme de pse tsc msr pae mce cx8 apic
sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx
mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc
cpuid extd_apicid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3
fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c
rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse
3dnowprefetch topoext perfctr_core invpcid_single ssbd perfmon_v2 ibrs
ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid
avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd
sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx512_bf16
clzero xsaveerptr rdpru wbnoinvd arat avx512vbmi pku ospke avx512_vbmi2
gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid
flush_l1d
Virtualization features:
Hypervisor vendor: KVM
Virtualization type: full
Caches (sum of all):
L1d: 512 KiB (16 instances)
L1i: 512 KiB (16 instances)
L2: 16 MiB (16 instances)
L3: 64 MiB (2 instances)
NUMA:
NUMA node(s): 1
NUMA node0 CPU(s): 0-15
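
As an aside, the L3 grouping that lscpu summarizes ("L3: 64 MiB (2
instances)" vs "105 MiB (1 instance)") can also be read per CPU from the
sysfs cache topology. A minimal sketch, assuming the usual layout where
index3 is the L3 on both parts:

#include <stdio.h>

int main(void)
{
	char path[128], buf[256];

	/* shared_cpu_list names every CPU that shares this cache level */
	for (int cpu = 0; ; cpu++) {
		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/cache/index3/shared_cpu_list",
			 cpu);
		FILE *f = fopen(path, "r");
		if (!f)
			break;	/* no such CPU (or no L3 entry): stop */
		if (fgets(buf, sizeof(buf), f))
			printf("cpu%-2d shares L3 with: %s", cpu, buf);
		fclose(f);
	}
	return 0;
}

On c7a.4xlarge this should print two groups (presumably 0-7 and 8-15,
one per CCD), on c7i.4xlarge a single 0-15 group.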
On Fri, Jan 30, 2026 at 01:16:52PM +0000, Mohamed Abuelfotoh, Hazem wrote:

> The main difference between the Intel based c7i.4xlarge and the AMD
> based c7a.4xlarge is that on Intel we have HT enabled [...]

OK, that clarifies.

How does the NI_RATE patch work for you?

https://lkml.kernel.org/r/20260127151748.GA1079264@noisy.programming.kicks-ass.net
On 02/02/2026 10:51, Peter Zijlstra wrote:

> OK, that clarifies.
>
> How does the NI_RATE patch work for you?
>
> https://lkml.kernel.org/r/20260127151748.GA1079264@noisy.programming.kicks-ass.net

Sure, I am going to apply this patch and report the results.
On 02/02/2026 11:07, Mohamed Abuelfotoh, Hazem wrote:
> [...]
> Sure, I am going to apply this patch and report the results.
I have tested the patch proposed in [1] on top of v6.12.66 and
unfortunately I haven't seen any improvement. I mainly compared the
results between v6.12.66, v6.12.66_revert_1b9c118fe318 (which reverts
1b9c118fe318 ("sched/fair: Proportional newidle balance")) and
v6.12.66_proposed (v6.12.66 plus the patch proposed in [1]). I mainly
focused on the AMD based c7a.4xlarge, which was the most impacted as
previously discussed in this thread. The baseline is
v6.12.66_revert_1b9c118fe318, as it showed the best performance among
the available kernels.
Version | Benchmark name | SUT EC2 Instance | diff %
v6.12.66 | postgresql | c7a.4xlarge | -4.0%
v6.12.66 | nginx_lb | c7a.4xlarge | -5.0%
v6.12.66 | memcached | c7a.4xlarge | -11.0%
v6.12.66_proposed | postgresql | c7a.4xlarge | -4.0%
v6.12.66_proposed | nginx_lb | c7a.4xlarge | -5.0%
v6.12.66_proposed | memcached | c7a.4xlarge | -13.0%
Furthermore, we have also seen an around 10-20% randwrite fio
performance regression on v6.18.5, only on AMD based VMs. We confirmed
that this regression is caused by 1b9c118fe318 ("sched/fair:
Proportional newidle balance"). We are currently testing the patch
proposed in [1] to see if it helps with this regression, and will share
the results, reproduction steps and environment in the next update.

[1] https://lkml.kernel.org/r/20260127151748.GA1079264@noisy.programming.kicks-ass.net
On Wed, Feb 04, 2026 at 12:45:41PM +0000, Mohamed Abuelfotoh, Hazem wrote:

> Version | Benchmark name | SUT EC2 Instance | diff %
> v6.12.66 | postgresql | c7a.4xlarge | -4.0%
> v6.12.66 | nginx_lb | c7a.4xlarge | -5.0%
> v6.12.66 | memcached | c7a.4xlarge | -11.0%
> v6.12.66_proposed | postgresql | c7a.4xlarge | -4.0%
> v6.12.66_proposed | nginx_lb | c7a.4xlarge | -5.0%
> v6.12.66_proposed | memcached | c7a.4xlarge | -13.0%

*sigh*, that actually made it worse for memcached :/ I'm not familiar
with the memcached benchmark, is this something I can run on a single
machine, or does it require high speed networking and 2 machines?
On 04/02/2026 13:27, Peter Zijlstra wrote:

> *sigh*, that actually made it worse for memcached :/ I'm not familiar
> with the memcached benchmark, is this something I can run on a single
> machine, or does it require high speed networking and 2 machines?

Yup, that's true, it's slightly worse on memcached with the proposed
fix :( The memcached benchmark is a kind of multi-layer test where you
need at least 2 client machines, 1 server machine and 1 machine as a
test coordinator. The server VM is able to achieve 12.5 Gbps of network
bandwidth and the clients are each able to achieve 30 Gbps, so I think
it will be tricky and likely impossible to reproduce this on a single
machine. I will try to come up with standalone reproduction steps that
can be used to investigate this memcached regression. Meanwhile we will
share the fio regression reproduction steps that I mentioned in my
previous update. This should be much simpler in steps and can be done
on a single machine.
On Wed, Feb 04, 2026 at 01:59:24PM +0000, Mohamed Abuelfotoh, Hazem wrote:

> [...] The server VM is able to achieve 12.5 Gbps of network bandwidth
> and the clients are each able to achieve 30 Gbps, so I think it will be
> tricky and likely impossible to reproduce this on a single machine.

Urgh, yeah, while I have multiple machines, no two of them are the same
and I can only offer 1 Gbps of networking, that's not going to keep
anything busy.

> I will try to come up with standalone reproduction steps that can be
> used to investigate this memcached regression. Meanwhile we will share
> the fio regression reproduction steps that I mentioned in my previous
> update. This should be much simpler in steps and can be done on a
> single machine.

Thanks! I have a few machines with a 'spare' nvme drive to run things
on, hopefully that is sufficient.
On 04/02/2026 14:05, Peter Zijlstra wrote:
> Thanks! I have a few machines with a 'spare' nvme drive to run things
> on, hopefully that is sufficient.
It looks like the previously reported fio regression has been fully
mitigated by the proposed patch [1]. I verified this on both 6.18.5 &
6.12.66. I will try to come up with a standalone reproduction for the
memcached regression to make it easier to debug.
**fio regression reproduction environment**
AWS EC2 instance: c5ad.24xlarge
96 vCPUs = 48 Cores with HT
12 CCDs
Memory : 192 GiB
SSD Disk space: 1900 GiB
SSD Disk Max write IOPS: 180K
SSD Disk Max Write B.W: 760 MB/sec
Below are the results of three different runs:

6.18.5          stock v6.18.5
6.18.5_revert   6.18.5 with the revert of 1b9c118fe318 ("sched/fair:
                Proportional newidle balance")
6.18.5_proposed 6.18.5 with patch [1]
---------------------------------------------------------------
Version 6.18.5
# sudo fio --time_based --name=benchmark --size=50G --runtime=60
--filename=/dev/nvme1n1 --ioengine=psync --randrepeat=0 --iodepth=1
--fsync=64 --invalidate=1 --verify=0 --verify_fatal=0 --blocksize=4k
--group_reporting --rw=randwrite --numjobs=4
Run status group 0 (all jobs):
WRITE: bw=478MiB/s (501MB/s), 478MiB/s-478MiB/s (501MB/s-501MB/s),
io=28.0GiB (30.1GB), run=60003-60003msec
----------------------------------------------------------------
Version 6.18.5_revert
# sudo fio --time_based --name=benchmark --size=50G --runtime=60
--filename=/dev/nvme1n1 --ioengine=psync --randrepeat=0 --iodepth=1
--fsync=64 --invalidate=1 --verify=0 --verify_fatal=0 --blocksize=4k
--group_reporting --rw=randwrite --numjobs=4
Run status group 0 (all jobs):
WRITE: bw=549MiB/s (575MB/s), 549MiB/s-549MiB/s (575MB/s-575MB/s),
io=32.2GiB (34.5GB), run=60002-60002msec
-----------------------------------------------------------------
Version 6.18.5_proposed
# sudo fio --time_based --name=benchmark --size=50G --runtime=60
--filename=/dev/nvme1n1 --ioengine=psync --randrepeat=0 --iodepth=1
--fsync=64 --invalidate=1 --verify=0 --verify_fatal=0 --blocksize=4k
--group_reporting --rw=randwrite --numjobs=4
Run status group 0 (all jobs):
WRITE: bw=551MiB/s (578MB/s), 551MiB/s-551MiB/s (578MB/s-578MB/s),
io=32.3GiB (34.7GB), run=60003-60003msec
[1] https://lore.kernel.org/all/20260127151748.GA1079264@noisy.programming.kicks-ass.net/T/#u
On Thu, Jan 29, 2026 at 10:19:37AM +0100, Peter Zijlstra wrote:

> But the 8488C is a 56 core part, with 112 threads. So you're handing out
> 8 core partitions of that thing, for 7 such instances on one machine?
> [...]

Also, are you employing Intel-CAT on these partitions to separate the
L3s?

(Not immediately relevant I suppose, but I was curious)
On 29/01/2026 09:24, Peter Zijlstra wrote:

> Also, are you employing Intel-CAT on these partitions to separate the
> L3s?
>
> (Not immediately relevant I suppose, but I was curious)

We don't enable Intel-CAT to partition L3 cache between VMs.
On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
> The patch "Proportional newidle balance" introduced a regression
> with Linux 6.12.65 and 6.18.5. There is noticeable regression with
> easyWave testing. [1]
>
> The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
> to install easyWave [2]. That is fetching the two tar.gz archives.
What is the actual configuration of that chip? Is it like 3*8 or 4*6
(CCX wise). A quick google couldn't find me the answer :/
> [...]
> Before results with CachyOS 6.12.63-2 and 6.18.3-2 kernels.
So the problem is that 6.12 -> 6.18 is an enormous amount of kernel
releases :/ This patch in particular was an effort to fix a regression
caused by:
155213a2aed4 ("sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails")
I'm thinking that if you revert all 4 patches of this series your
performance will be even worse?
Anyway, my guess is that somehow this benchmark likes doing newidle even
if it is often not successful. I'll see if I can reproduce this on one
of my machines, but that might take a little while.
On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
> [...]
> What is the actual configuration of that chip? Is it like 3*8 or 4*6
> (CCX wise). A quick google couldn't find me the answer :/
Obviously I found it right after sending this. It's a 4x6 config.
Meaning it needs newidle to balance between those 4 domains.
Pratheek -- are you guys still considering that SIS_NODE thing? That
worked really well for workstation chips, but there were some issues on
Epyc or so.
On Fri, Jan 23, 2026 at 12:03:06PM +0100, Peter Zijlstra wrote:
> [...]
> Obviously I found it right after sending this. It's a 4x6 config.
> Meaning it needs newidle to balance between those 4 domains.
So with the below patch on top of my Xeon w7-2495X (which is 24-core
48-thread) I too have 4 LLC :-)
And I think I can see a slight difference, but nowhere near as terrible.
Let me go stick some tracing on.
cpu0 0 0 0 0 0 0 199480591279 9327118209 21136
domain0 SMT 0000,01000001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 MC 1111,11111111 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 PKG ffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 205007928818 2654503460 14772
domain0 SMT 0000,02000002 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 MC 2222,22222222 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 PKG ffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu2 0 0 0 0 0 0 190458000839 2361863044 13265
domain0 SMT 0000,04000004 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 MC 4444,44444444 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 PKG ffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu3 0 0 0 0 0 0 193040171114 2769182152 16215
domain0 SMT 0000,08000008 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 MC 8888,88888888 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 PKG ffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
...
easywave# echo NI_RANDOM > /debug/sched/features; OMP_NUM_THREADS=48 ./src/easywave -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt -time 300
easyWave ver.2013-04-11
Model time = 00:00:00, elapsed: 2 msec
Model time = 00:10:00, elapsed: 6 msec
Model time = 00:20:00, elapsed: 13 msec
Model time = 00:30:00, elapsed: 21 msec
Model time = 00:40:00, elapsed: 33 msec
Model time = 00:50:00, elapsed: 59 msec
Model time = 01:00:00, elapsed: 136 msec
Model time = 01:10:00, elapsed: 160 msec
Model time = 01:20:00, elapsed: 189 msec
Model time = 01:30:00, elapsed: 266 msec
Model time = 01:40:00, elapsed: 321 msec
Model time = 01:50:00, elapsed: 401 msec
Model time = 02:00:00, elapsed: 482 msec
Model time = 02:10:00, elapsed: 619 msec
Model time = 02:20:00, elapsed: 731 msec
Model time = 02:30:00, elapsed: 856 msec
Model time = 02:40:00, elapsed: 1013 msec
Model time = 02:50:00, elapsed: 1204 msec
Model time = 03:00:00, elapsed: 1437 msec
Model time = 03:10:00, elapsed: 1715 msec
Model time = 03:20:00, elapsed: 1952 msec
Model time = 03:30:00, elapsed: 2713 msec
Model time = 03:40:00, elapsed: 3090 msec
Model time = 03:50:00, elapsed: 3644 msec
Model time = 04:00:00, elapsed: 4157 msec
Model time = 04:10:00, elapsed: 4632 msec
Model time = 04:20:00, elapsed: 5131 msec
Model time = 04:30:00, elapsed: 5685 msec
Model time = 04:40:00, elapsed: 6404 msec
Model time = 04:50:00, elapsed: 7154 msec
Model time = 05:00:00, elapsed: 8143 msec
easywave# echo NO_NI_RANDOM > /debug/sched/features; OMP_NUM_THREADS=48 ./src/easywave -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt -time 300
easyWave ver.2013-04-11
Model time = 00:00:00, elapsed: 1 msec
Model time = 00:10:00, elapsed: 6 msec
Model time = 00:20:00, elapsed: 12 msec
Model time = 00:30:00, elapsed: 21 msec
Model time = 00:40:00, elapsed: 33 msec
Model time = 00:50:00, elapsed: 94 msec
Model time = 01:00:00, elapsed: 114 msec
Model time = 01:10:00, elapsed: 138 msec
Model time = 01:20:00, elapsed: 191 msec
Model time = 01:30:00, elapsed: 227 msec
Model time = 01:40:00, elapsed: 272 msec
Model time = 01:50:00, elapsed: 322 msec
Model time = 02:00:00, elapsed: 381 msec
Model time = 02:10:00, elapsed: 458 msec
Model time = 02:20:00, elapsed: 634 msec
Model time = 02:30:00, elapsed: 861 msec
Model time = 02:40:00, elapsed: 1050 msec
Model time = 02:50:00, elapsed: 1265 msec
Model time = 03:00:00, elapsed: 1463 msec
Model time = 03:10:00, elapsed: 1658 msec
Model time = 03:20:00, elapsed: 1892 msec
Model time = 03:30:00, elapsed: 2243 msec
Model time = 03:40:00, elapsed: 2672 msec
Model time = 03:50:00, elapsed: 3038 msec
Model time = 04:00:00, elapsed: 3462 msec
Model time = 04:10:00, elapsed: 3961 msec
Model time = 04:20:00, elapsed: 4455 msec
Model time = 04:30:00, elapsed: 5040 msec
Model time = 04:40:00, elapsed: 5594 msec
Model time = 04:50:00, elapsed: 6190 msec
Model time = 05:00:00, elapsed: 7065 msec
---
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a24c7805acdb..d0d7cefb6cd3 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -699,6 +699,11 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu)
return per_cpu(cpu_info.topo.l2c_id, cpu);
}
+static inline u32 per_cpu_core_id(unsigned int cpu)
+{
+ return per_cpu(cpu_info.topo.core_id, cpu);
+}
+
#ifdef CONFIG_CPU_SUP_AMD
/*
* Issue a DIV 0/1 insn to clear any division data from previous DIV
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5cd6950ab672..5e7349c0f6ed 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -438,6 +438,9 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
return false;
+ if ((per_cpu_core_id(cpu1) % 4) != (per_cpu_core_id(cpu2) % 4))
+ return false;
+
/*
* Allow the SNC topology without warning. Return of false
* means 'c' does not share the LLC of 'o'. This will be
On Tue, Jan 27, 2026 at 11:40:41AM +0100, Peter Zijlstra wrote:
> On Fri, Jan 23, 2026 at 12:03:06PM +0100, Peter Zijlstra wrote:
> > On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
> > > On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
> > > > The patch "Proportional newidle balance" introduced a regression
> > > > with Linux 6.12.65 and 6.18.5. There is noticeable regression with
> > > > easyWave testing. [1]
> > > >
> > > > The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
> > > > to install easyWave [2]. That is fetching the two tar.gz archives.
> > >
> > > What is the actual configuration of that chip? Is it like 3*8 or 4*6
> > > (CCX wise). A quick google couldn't find me the answer :/
> >
> > Obviously I found it right after sending this. It's a 4x6 config.
> > Meaning it needs newidle to balance between those 4 domains.
>
> So with the below patch on top of my Xeon w7-2495X (which is 24-core
> 48-thread) I too have 4 LLC :-)
>
> And I think I can see a slight difference, but nowhere near as terrible.
>
> Let me go stick some tracing on.
Does this help some?
Turns out, this easywave thing has a very low newidle rate, but then
also a fairly low success rate. But since it doesn't do it that often,
the cost isn't that significant, so we might as well always do it.
This adds a second term to the ratio computation that takes time into
account. For low-rate newidle this term will dominate, while for higher
rates the success ratio is more important.
Chris, afaict this still DTRT for schbench, but if this works for Mario,
could you also re-run things at your end?
[ the 4 'second' thing is a bit random, but looking at the timings
between easywave and schbench this seems to be a reasonable middle
ground. Although I think 8 'seconds' -- 23 shift -- would also work.
That would give:
1024 - 8 s - 64 Hz
512 - 4 s - 128 Hz
256 - 2 s - 256 Hz
128 - 1 s - 512 Hz
64 - .5 s - 1024 Hz
32 - .25 s - 2048 Hz
]
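As a quick sanity check of the shift math (my arithmetic, not part of
the patch): delta is in nanoseconds, so a 22 shift divides by 2^22 ~=
4.19M ns, and because newidle_call is halved at every fold, each window
spans roughly 512 fresh calls. A throwaway userspace sketch:

	#include <stdio.h>

	int main(void)
	{
		/* fold-window lengths from the comment table in the patch below, in ns */
		unsigned long long deltas[] = {
			4000000000ULL, 2000000000ULL, 1000000000ULL,
			 500000000ULL,  250000000ULL,
		};

		for (int i = 0; i < 5; i++) {
			/* the patch shifts by 22; 23 would be the 8 s variant */
			unsigned long long ratio = deltas[i] >> 22;
			/* ~512 new newidle calls per fold window */
			double freq = 512.0 / (deltas[i] / 1e9);
			printf("delta %4.2f s -> ratio ~%4llu, ~%4.0f Hz\n",
			       deltas[i] / 1e9, ratio, freq);
		}
		return 0;
	}

This reproduces the patch's comment table: a 4 s window gives ratio
~953 (close to the 1024 cap) at ~128 Hz, and each halving of the window
halves the ratio and doubles the frequency.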
---
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 45c0022b91ce..a1e1032426dc 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -95,6 +95,7 @@ struct sched_domain {
unsigned int newidle_call;
unsigned int newidle_success;
unsigned int newidle_ratio;
+ u64 newidle_stamp;
u64 max_newidle_lb_cost;
unsigned long last_decay_max_lb_cost;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eca642295c4b..ab9cf06c6a76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12224,8 +12224,31 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
sd->newidle_call++;
sd->newidle_success += success;
if (sd->newidle_call >= 1024) {
- sd->newidle_ratio = sd->newidle_success;
+ u64 now = sched_clock();
+ s64 delta = now - sd->newidle_stamp;
+ int ratio = 0;
+ sd->newidle_stamp = now;
+
+ if (delta < 0)
+ delta = 0;
+
+ if (sched_feat(NI_RATE)) {
+ /*
+ * ratio delta freq
+ *
+ * 1024 - 4 s - 128 Hz
+ * 512 - 2 s - 256 Hz
+ * 256 - 1 s - 512 Hz
+ * 128 - .5 s - 1024 Hz
+ * 64 - .25 s - 2048 Hz
+ */
+ ratio = delta >> 22;
+ }
+
+ ratio += sd->newidle_success;
+
+ sd->newidle_ratio = min(1024, ratio);
sd->newidle_call /= 2;
sd->newidle_success /= 2;
}
@@ -12932,7 +12959,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (sd->flags & SD_BALANCE_NEWIDLE) {
unsigned int weight = 1;
- if (sched_feat(NI_RANDOM)) {
+ if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
/*
* Throw a 1k sided dice; and only run
* newidle_balance according to the success
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 980d92bab8ab..7aba7523c6c1 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -126,3 +126,4 @@ SCHED_FEAT(LATENCY_WARN, false)
* Do newidle balancing proportional to its success rate using randomization.
*/
SCHED_FEAT(NI_RANDOM, true)
+SCHED_FEAT(NI_RATE, true)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..05741f18f334 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
#include <linux/bsearch.h>
#include "sched.h"
@@ -1637,6 +1638,7 @@ sd_init(struct sched_domain_topology_level *tl,
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
+ u64 now = sched_clock();
sd_weight = cpumask_weight(tl->mask(tl, cpu));
@@ -1674,6 +1676,7 @@ sd_init(struct sched_domain_topology_level *tl,
.newidle_call = 512,
.newidle_success = 256,
.newidle_ratio = 512,
+ .newidle_stamp = now,
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
I missed stating that higher is better for the stress-ng socket tests.

For clarity, I find it difficult to know for certain whether a scheduler
patch is good or bad without the prefer-idle-core results. A fix may
resolve one issue only to introduce another, and because of the limited
CPU saturation anomaly with EEVDF, I'm unqualified to say for certain
whether the fix in question does so.

EEVDF turns out to be amazing. However, the folks in my circle, myself
included, are constantly worried about the ups and downs with EEVDF,
mainly with the stable kernels. We consider varied testing one way to be
certain, including limited CPU saturation testing. So, a wish request
for the test machines to include limited CPU saturation, e.g. 100%, 50%,
31.25%, and 25%. The 25% case is helpful when a test runs 2x the given
parameter, and keeps the load at/below the number of physical cores.

Thank you for your efforts with EEVDF.
Peter, thank you for your fix to improve EEVDF.
Cc'd Andrea Righi
Thank you for the is_idle_core() function and help. [0]
Cc'd Shubhang Kaushik
Your patch inspired me to perform trial-and-error testing, which has
now become the 0280 patch in the CachyMod GitHub repo. [0]
Together with the help of CachyOS community members, we concluded
prefcore + prefer-idle-core to be surreal. I enjoy the EEVDF
scheduler a lot more now, since it favors the SMT siblings less.
For comparison, I added results for sched-ext cosmos.
Limited CPU saturation can be revealing of potential scheduler issues.
Testing includes 100%, 50%, 31.25%, and 25% CPU saturation.
All kernels built with GCC to factor out CLANG/AutoFDO.
A) 6.18.8-rc1
with sched/fair: Proportional newidle balance
48cpus(100%) 24cpus(50%) 15cpus(31.25%) 12cpus(25%)
algorithm3 [1] 9.462s 14.181s 20.311s 24.498s
darktable [2] 2.811s 3.715s 5.315s 6.434s
easywave [3] 19.747s 10.804s 20.207s 21.571s
stress-ng [4] 37632.06 56220.21 41694.50 34740.58
B) 6.18.8-rc1
Peter Z's fix for sched/fair: Proportional newidle balance
48cpus(100%) 24cpus(50%) 15cpus(31.25%) 12cpus(25%)
algorithm3 [1] 9.340s 14.733s 21.339s 25.069s
darktable [2] 2.493s 3.616s 5.148s 5.968s
easywave [3] 11.357s 13.312s * 18.483s 20.741s
stress-ng [4] 37533.24 55419.85 39452.17 32217.55
algorithm3 and stress-ng regressed, possibly the limited CPU saturation
anomaly
easywave (*) weird result; repeatable but all over the place
C) 6.18.8-rc1
Revert sched/fair: Proportional newidle balance
48cpus(100%) 24cpus(50%) 15cpus(31.25%) 12cpus(25%)
algorithm3 [1] 9.286s 15.101s 21.417s 25.126s
darktable [2] 2.484s 3.531s 5.185s 6.002s
easywave [3] 11.517s 12.300s 18.466s 20.428s
stress-ng [4] 42231.92 47306.18 * 32438.03 * 28820.83 *
stress-ng (*) lackluster with limited CPU saturation
D) 6.18.8-rc1
Revert sched/fair: Proportional newidle balance
Plus apply the prefer-idle-core patch [0]
48cpus(100%) 24cpus(50%) 15cpus(31.25%) 12cpus(25%)
algorithm3 [1] 9.312s 11.292s 17.243s 21.811s
darktable [2] 2.418s 3.711s * 5.499s * 6.510s *
easywave [3] 10.035s 9.832s 15.738s 18.805s
stress-ng [4] 44837.41 63364.56 55646.26 48202.58
darktable (*) lower performance with limited CPU saturation;
noticeably better performance otherwise
E) scx_cosmos -m 0-5 -s 800 -l 8000 -f -c 1 -p 0 [5]
48cpus(100%) 24cpus(50%) 15cpus(31.25%) 12cpus(25%)
algorithm3 [1] 9.218s 11.188s 17.045s 21.130s
darktable [2] 2.365s 3.900s 4.626s 5.664s
easywave [3] 9.187s 16.528s * 15.933s 16.991s
stress-ng [4] 21065.70 36417.65 27185.95 23141.87
easywave (*) sched-ext cosmos appears to favor SMT siblings
---
[0] https://github.com/marioroy/cachymod
the prefer-idle-core is 0280-prefer-prevcpu-for-wakeup.patch
more about mindfulness for limited CPU saturation than about accepting
the patch
"surreal" refers to prefcore + prefer-idle-core, improving many workloads
[1] https://github.com/marioroy/mce-sandbox
./algorithm3.pl 1e12 --threads=N
algorithm3.pl is akin to a server/client application; chatty
primesieve.pl is more CPU-bound; less chatty
optionally, compare with the primesieve binary (fully CPU-bound, not chatty)
https://github.com/kimwalisch/primesieve
[2] https://math.dartmouth.edu/~sarunas/darktable_bench.html
OMP_NUM_THREADS=N darktable-cli setubal.orf setubal.orf.xmp test.jpg \
--core --disable-opencl -d perf
result: pixel pipeline processing took {...} secs
[3] https://openbenchmarking.org/test/pts/easywave
OMP_NUM_THREADS=N ./src/easywave \
-grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
-time 600
result: Model time = 10:00:00, elapsed: {...} msec
[4] https://openbenchmarking.org/test/pts/stress-ng
stress-ng -t 30 --metrics-brief --sock N --no-rand-seed --sock-zerocopy
result: bogo ops  real time  usr time  sys time  bogo ops/s   bogo ops/s
                  (secs)     (secs)    (secs)    (real time)  (usr+sys time)
        {...}
this involves 2x NCPUs due to { writer, reader } threads per sock,
hence the added 12cpus result (12 x 2 = 24 <= 50% saturation)
[5] https://github.com/sched-ext/scx
cargo build --release -p scx_cosmos
On 1/27/26 10:17 AM, Peter Zijlstra wrote:
> On Tue, Jan 27, 2026 at 11:40:41AM +0100, Peter Zijlstra wrote:
>> On Fri, Jan 23, 2026 at 12:03:06PM +0100, Peter Zijlstra wrote:
>>> On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
>>>> On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
>>>>> The patch "Proportional newidle balance" introduced a regression
>>>>> with Linux 6.12.65 and 6.18.5. There is noticeable regression with
>>>>> easyWave testing. [1]
>>>>>
>>>>> The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
>>>>> to install easyWave [2]. That is fetching the two tar.gz archives.
>>>> What is the actual configuration of that chip? Is it like 3*8 or 4*6
>>>> (CCX wise). A quick google couldn't find me the answer :/
>>> Obviously I found it right after sending this. It's a 4x6 config.
>>> Meaning it needs newidle to balance between those 4 domains.
>> So with the below patch on top of my Xeon w7-2495X (which is 24-core
>> 48-thread) I too have 4 LLC :-)
>>
>> And I think I can see a slight difference, but nowhere near as terrible.
>>
>> Let me go stick some tracing on.
> Does this help some?
>
> Turns out, this easywave thing has a very low newidle rate, but then
> also a fairly low success rate. But since it doesn't do it that often,
> the cost isn't that significant, so we might as well always do it.
>
> This adds a second term to the ratio computation that takes time into
> account. For low-rate newidle this term will dominate, while for higher
> rates the success ratio is more important.
>
> Chris, afaict this still DTRT for schbench, but if this works for Mario,
> could you also re-run things at your end?
>
> [ the 4 'second' thing is a bit random, but looking at the timings
> between easywave and schbench this seems to be a reasonable middle
> ground. Although I think 8 'seconds' -- 23 shift -- would also work.
>
> That would give:
>
> 1024 - 8 s - 64 Hz
> 512 - 4 s - 128 Hz
> 256 - 2 s - 256 Hz
> 128 - 1 s - 512 Hz
> 64 - .5 s - 1024 Hz
> 32 - .25 s - 2048 Hz
> ]
>
> ---
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 45c0022b91ce..a1e1032426dc 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -95,6 +95,7 @@ struct sched_domain {
> unsigned int newidle_call;
> unsigned int newidle_success;
> unsigned int newidle_ratio;
> + u64 newidle_stamp;
> u64 max_newidle_lb_cost;
> unsigned long last_decay_max_lb_cost;
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index eca642295c4b..ab9cf06c6a76 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12224,8 +12224,31 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
> sd->newidle_call++;
> sd->newidle_success += success;
>
> if (sd->newidle_call >= 1024) {
> - sd->newidle_ratio = sd->newidle_success;
> + u64 now = sched_clock();
> + s64 delta = now - sd->newidle_stamp;
> + int ratio = 0;
> + sd->newidle_stamp = now;
> +
> + if (delta < 0)
> + delta = 0;
> +
> + if (sched_feat(NI_RATE)) {
> + /*
> + * ratio delta freq
> + *
> + * 1024 - 4 s - 128 Hz
> + * 512 - 2 s - 256 Hz
> + * 256 - 1 s - 512 Hz
> + * 128 - .5 s - 1024 Hz
> + * 64 - .25 s - 2048 Hz
> + */
> + ratio = delta >> 22;
> + }
> +
> + ratio += sd->newidle_success;
> +
> + sd->newidle_ratio = min(1024, ratio);
> sd->newidle_call /= 2;
> sd->newidle_success /= 2;
> }
> @@ -12932,7 +12959,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
> if (sd->flags & SD_BALANCE_NEWIDLE) {
> unsigned int weight = 1;
>
> - if (sched_feat(NI_RANDOM)) {
> + if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
> /*
> * Throw a 1k sided dice; and only run
> * newidle_balance according to the success
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 980d92bab8ab..7aba7523c6c1 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -126,3 +126,4 @@ SCHED_FEAT(LATENCY_WARN, false)
> * Do newidle balancing proportional to its success rate using randomization.
> */
> SCHED_FEAT(NI_RANDOM, true)
> +SCHED_FEAT(NI_RATE, true)
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index cf643a5ddedd..05741f18f334 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -4,6 +4,7 @@
> */
>
> #include <linux/sched/isolation.h>
> +#include <linux/sched/clock.h>
> #include <linux/bsearch.h>
> #include "sched.h"
>
> @@ -1637,6 +1638,7 @@ sd_init(struct sched_domain_topology_level *tl,
> struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
> int sd_id, sd_weight, sd_flags = 0;
> struct cpumask *sd_span;
> + u64 now = sched_clock();
>
> sd_weight = cpumask_weight(tl->mask(tl, cpu));
>
> @@ -1674,6 +1676,7 @@ sd_init(struct sched_domain_topology_level *tl,
> .newidle_call = 512,
> .newidle_success = 256,
> .newidle_ratio = 512,
> + .newidle_stamp = now,
>
> .max_newidle_lb_cost = 0,
> .last_decay_max_lb_cost = jiffies,
I tried the Stress-NG socket activity test, plus the prefer-idle-core patch.
This is about mindfulness for limited CPU saturation testing.
AMD Ryzen Threadripper 9960X CPU (24/48)
Bogo operations/second, More is better
A B C D E F
SocketAct 12128.7 13907.6 12377.7 10551.7 12158.7 11842.2
SocketAct24 64553.3 20072.0 67018.7 62182.3 18133.5 66756.6
SocketAct15 49206.3 22170.7 57038.7 44077.6 19884.1 56727.5
SocketAct10 35263.5 20140.3 40092.1 33040.3 19701.6 41346.3
The kernels are built with clang without LTO/AutoFDO
A. 6.19-rc7 next_buddy ena with sched/fair: Proportional newidle balance
B. 6.19-rc7 next_buddy ena without sched/fair: Proportional newidle balance
C. 6.19-rc7 next_buddy ena without sched regression; with prefer-idle-core
D. 6.19-rc7 next_buddy dis with sched/fair: Proportional newidle balance
E. 6.19-rc7 next_buddy dis without sched/fair: Proportional newidle balance
F. 6.19-rc7 next_buddy dis without sched regression; with prefer-idle-core
Without sched regression:
this is without sched/fair: Proportional newidle balance
With prefer-idle-core:
https://github.com/marioroy/cachymod/blob/main/linux-cachymod-6.18/
0280-prefer-prevcpu-for-wakeup.patch
Stress-NG 0.20.00: SocketAct, SocketAct24, SocketAct15, SocketAct10
stress-ng -t 30 --metrics-brief --sock -1 --no-rand-seed --sock-zerocopy
stress-ng -t 30 --metrics-brief --sock 24 --no-rand-seed --sock-zerocopy
stress-ng -t 30 --metrics-brief --sock 15 --no-rand-seed --sock-zerocopy
stress-ng -t 30 --metrics-brief --sock 10 --no-rand-seed --sock-zerocopy
Basically 100%, 50%, and 31.25%, times 2 (writer, reader).
I also ran --sock 10 because 10 x 2 is less than 50% (24 threads).
Linux 6.18.7 results: granted, both are built with LTO + AutoFDO profile
CachyOS 6.18.7-2 CachyMod 6.18.7-2 [1]
SocketAct 40799.2 46784.3
SocketAct24 61057.6 71414.5
SocketAct15 45056.4 61772.3
SocketAct10 32691.6 44244.6
[1] https://github.com/marioroy/cachymod
the sched regression reverted (0040 patch)
prefer-idle-core (0280 patch)
On 1/23/26 6:03 AM, Peter Zijlstra wrote:
> On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
>> On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
>>> The patch "Proportional newidle balance" introduced a regression
>>> with Linux 6.12.65 and 6.18.5. There is noticeable regression with
>>> easyWave testing. [1]
>>>
>>> The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
>>> to install easyWave [2]. That is fetching the two tar.gz archives.
>> What is the actual configuration of that chip? Is it like 3*8 or 4*6
>> (CCX wise). A quick google couldn't find me the answer :/
> Obviously I found it right after sending this. It's a 4x6 config.
> Meaning it needs newidle to balance between those 4 domains.
>
> Pratheek -- are you guys still considering that SIS_NODE thing? That
> worked really well for workstation chips, but there were some issues on
> Epyc or so.
>
>>> #!/bin/bash
>>> # CXXFLAGS="-O3 $CXXFLAGS" ./configure
>>> # make -j8
>>>
>>> trap 'rm -f *.ssh *.idx *.log *.sshmax *.time' EXIT
>>>
>>> OMP_NUM_THREADS=48 ./src/easywave \
>>> -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
>>> -time 1200
>>>
>>>
>>> Before results with CachyOS 6.12.63-2 and 6.18.3-2 kernels.
>> So the problem is that 6.12 -> 6.18 is an enormous amount of kernel
>> releases :/ This patch in particular was an effort to fix a regression
>> caused by:
>>
>> 155213a2aed4 ("sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails")
>>
>> I'm thinking that if you revert all 4 patches of this series your
>> performance will be even worse?
>>
>> Anyway, my guess is that somehow this benchmark likes doing newidle even
>> if it is often not successful. I'll see if I can reproduce this on one
>> of my machine, but that might take a little while.
Hello Peter,

On 1/23/2026 4:33 PM, Peter Zijlstra wrote:
> On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
>> On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
>>> The patch "Proportional newidle balance" introduced a regression
>>> with Linux 6.12.65 and 6.18.5. There is noticeable regression with
>>> easyWave testing. [1]
>>>
>>> The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
>>> to install easyWave [2]. That is fetching the two tar.gz archives.
>>
>> What is the actual configuration of that chip? Is it like 3*8 or 4*6
>> (CCX wise). A quick google couldn't find me the answer :/
>
> Obviously I found it right after sending this. It's a 4x6 config.
> Meaning it needs newidle to balance between those 4 domains.
>
> Pratheek -- are you guys still considering that SIS_NODE thing? That
> worked really well for workstation chips, but there were some issues on
> Epyc or so.

SIS_NODE really turned out to be a trade-off between search time vs
search opportunity, especially when the system was heavily overloaded.

Let me rebase those old patches and give it a spin over the weekend
on a couple of those large machines (128C/256T and 192C/384T per
socket) to see the damage. I'll update here by Tuesday or post out
a series if I see the situation having changed on the recent
kernels - some benchmarks had a completely different bottleneck
there when we looked closer last.

>
>>> #!/bin/bash
>>> # CXXFLAGS="-O3 $CXXFLAGS" ./configure
>>> # make -j8
>>>
>>> trap 'rm -f *.ssh *.idx *.log *.sshmax *.time' EXIT
>>>
>>> OMP_NUM_THREADS=48 ./src/easywave \
>>> -grid examples/e2Asean.grd -source examples/BengkuluSept2007.flt \
>>> -time 1200
>>>
>>>
>>> Before results with CachyOS 6.12.63-2 and 6.18.3-2 kernels.

I'll go look at the benchmark too to see if I can reproduce on my end
and get some stats for these too. Thanks for bringing it to my notice.

--
Thanks and Regards,
Prateek
On 1/23/2026 5:54 PM, K Prateek Nayak wrote:
> Hello Peter,
>
> On 1/23/2026 4:33 PM, Peter Zijlstra wrote:
>> On Fri, Jan 23, 2026 at 11:50:46AM +0100, Peter Zijlstra wrote:
>>> On Sun, Jan 18, 2026 at 03:46:22PM -0500, Mario Roy wrote:
>>>> The patch "Proportional newidle balance" introduced a regression
>>>> with Linux 6.12.65 and 6.18.5. There is noticeable regression with
>>>> easyWave testing. [1]
>>>>
>>>> The CPU is AMD Threadripper 9960X CPU (24/48). I followed the source
>>>> to install easyWave [2]. That is fetching the two tar.gz archives.
>>>
>>> What is the actual configuration of that chip? Is it like 3*8 or 4*6
>>> (CCX wise). A quick google couldn't find me the answer :/
>>
>> Obviously I found it right after sending this. It's a 4x6 config.
>> Meaning it needs newidle to balance between those 4 domains.
>>
>> Pratheek -- are you guys still considering that SIS_NODE thing? That
>> worked really well for workstation chips, but there were some issues on
>> Epyc or so.
>
> SIS_NODE really turned out to be a trade-off between search
> time vs search opportunity, especially when the system was heavily
> overloaded.
>
> Let me rebase those old patches and give it a spin over the weekend
> on a couple of those large machines (128C/256T and 192C/384T per
> socket) to see the damage. I'll update here by Tuesday or post out
> a series if I see the situation having changed on the recent
> kernels - some benchmarks had a completely different bottleneck
> there when we looked closer last.
So these are the results on tip:sched/core merged onto tip:sched/urgent
with SIS_NODE and SIS_NODE + SIS_UTIL [1] on a 512-CPU machine
(2 sockets x 16 CCXs (LLCs) x 8C/16T Zen4c cores):
tl;dr
(*) Consistent regressions, even with the SIS_UTIL bailout on the higher
domain; the benchmarks are mainly measuring tail latency or have a
thundering herd behavior that SIS_UTIL with the default imbalance_pct
isn't able to fully adjust to.
(#) Data has run-to-run variance but is still worse on average.
Note: Although "new-schbench-wakeup-latency" shows a regression, the
baseline is a few "us", and a couple more "us" shows up as a
~20%-30% regression.
I'm still fighting dependency hell to get some of the longer running
benchmarks running on this system but I expect a few pct regressions
like last time [2].
System:
- 2 x 128C/256T Zen4c system with 16CCXs per socket
- Boost on
- C2 disabled
- Each socket is a NUMA node
Kernels:
tip: tip:sched/core at commit 377521af0341 ("sched: remove
task_struct->faults_disabled_mapping") merged onto
tip:sched/urgent at commit 15257cc2f905 ("sched/fair: Revert
force wakeup preemption")
sis_node: tip + sis_node patch + cpumask_and() moved to after
SIS_UTIL bailout [3]
sis_node: Tree from [1] based on tip:sched/core merged onto
tip:sched/urgent
Full results:
==================================================================
Test : hackbench
Units : Normalized time in seconds
Interpretation: Lower is better
Statistic : AMean
==================================================================
Case: tip[pct imp](CV) sis-node[pct imp](CV) sis-node-w-sis-util[pct imp](CV)
1-groups 1.00 [ -0.00](11.61) 0.76 [ 24.30]( 4.43) 0.76 [ 24.05]( 2.93)
2-groups 1.00 [ -0.00]( 9.73) 0.86 [ 14.22](17.59) 0.80 [ 19.85](15.31)
4-groups 1.00 [ -0.00]( 5.88) 0.78 [ 21.87](11.93) 0.78 [ 21.64](14.33)
8-groups 1.00 [ -0.00]( 2.93) 0.92 [ 8.44]( 3.99) 0.92 [ 7.79]( 4.04)
16-groups 1.00 [ -0.00]( 1.77) 0.90 [ 10.47]( 5.61) 0.94 [ 5.92]( 5.65)
==================================================================
Test : tbench
Units : Normalized throughput
Interpretation: Higher is better
Statistic : AMean
==================================================================
Clients: tip[pct imp](CV) sis-node[pct imp](CV) sis-node-w-sis-util[pct imp](CV)
1 1.00 [ 0.00]( 0.20) 1.00 [ -0.07]( 0.16) 1.01 [ 0.53]( 0.23)
2 1.00 [ 0.00]( 0.35) 1.00 [ -0.03]( 0.58) 1.00 [ 0.12]( 0.20)
4 1.00 [ 0.00]( 0.09) 1.01 [ 0.60]( 0.60) 1.00 [ 0.16]( 0.15)
8 1.00 [ 0.00]( 0.63) 1.00 [ -0.35]( 0.53) 1.00 [ 0.26]( 0.19)
16 1.00 [ 0.00]( 0.97) 1.00 [ 0.33]( 0.30) 1.01 [ 1.16]( 0.50)
32 1.00 [ 0.00]( 0.98) 1.02 [ 1.54]( 0.91) 1.01 [ 1.10]( 0.26)
64 1.00 [ 0.00]( 3.45) 1.02 [ 1.88]( 0.48) 1.02 [ 1.78]( 1.29)
128 1.00 [ 0.00]( 2.49) 1.00 [ -0.01]( 1.63) 0.99 [ -0.68]( 1.88)
256 1.00 [ 0.00]( 0.57) 1.01 [ 0.73]( 0.45) 1.01 [ 0.92]( 0.35)
512 1.00 [ 0.00]( 3.92) 0.51 [-48.55]( 0.11) 0.80 [-19.59]( 6.31) (*)
1024 1.00 [ 0.00]( 0.10) 0.98 [ -2.11]( 0.09) 0.97 [ -3.29]( 0.28)
2048 1.00 [ 0.00]( 0.09) 0.98 [ -2.08]( 0.28) 0.99 [ -0.75]( 0.48)
==================================================================
Test : stream-10
Units : Normalized Bandwidth, MB/s
Interpretation: Higher is better
Statistic : HMean
==================================================================
Test: tip[pct imp](CV) sis-node[pct imp](CV) sis-node-w-sis-util[pct imp](CV)
Copy 1.00 [ 0.00]( 0.31) 0.99 [ -0.70]( 0.57) 1.00 [ -0.09]( 1.44)
Scale 1.00 [ 0.00]( 0.38) 0.99 [ -1.00]( 0.49) 1.00 [ 0.32]( 1.41)
Add 1.00 [ 0.00]( 0.31) 0.99 [ -0.95]( 0.63) 1.00 [ 0.43]( 1.16)
Triad 1.00 [ 0.00]( 0.18) 0.99 [ -0.84]( 0.68) 1.00 [ 0.16]( 1.12)
==================================================================
Test : stream-100
Units : Normalized Bandwidth, MB/s
Interpretation: Higher is better
Statistic : HMean
==================================================================
Test: tip[pct imp](CV) sis-node[pct imp](CV) sis-node-w-sis-util[pct imp](CV)
Copy 1.00 [ 0.00]( 1.46) 1.00 [ 0.39]( 1.57) 1.01 [ 0.82]( 0.52)
Scale 1.00 [ 0.00]( 1.45) 1.00 [ 0.49]( 1.37) 1.01 [ 1.20]( 0.55)
Add 1.00 [ 0.00]( 1.09) 1.00 [ 0.31]( 0.94) 1.01 [ 0.79]( 0.35)
Triad 1.00 [ 0.00]( 1.06) 1.00 [ 0.22]( 1.02) 1.01 [ 0.56]( 0.19)
==================================================================
Test : netperf
Units : Normalized Througput
Interpretation: Higher is better
Statistic : AMean
==================================================================
Clients: tip[pct imp](CV) sis-node[pct imp](CV) sis-node-w-sis-util[pct imp](CV)
1-clients 1.00 [ 0.00]( 0.27) 0.99 [ -0.82]( 0.26) 0.99 [ -0.78]( 0.16)
2-clients 1.00 [ 0.00]( 0.28) 0.99 [ -0.87]( 0.19) 1.00 [ -0.17]( 0.67)
4-clients 1.00 [ 0.00]( 0.38) 1.00 [ -0.47]( 0.33) 0.99 [ -0.53]( 0.31)
8-clients 1.00 [ 0.00]( 0.34) 0.99 [ -0.55]( 0.18) 1.00 [ -0.33]( 0.24)
16-clients 1.00 [ 0.00]( 0.30) 1.00 [ -0.39]( 0.23) 1.00 [ -0.19]( 0.26)
32-clients 1.00 [ 0.00]( 0.43) 1.00 [ -0.40]( 0.57) 1.00 [ -0.24]( 0.68)
64-clients 1.00 [ 0.00]( 0.82) 1.00 [ -0.12]( 0.45) 1.00 [ -0.14]( 0.70)
128-clients 1.00 [ 0.00]( 1.21) 1.00 [ 0.10]( 1.28) 1.00 [ 0.08]( 1.19)
256-clients 1.00 [ 0.00]( 1.38) 1.01 [ 0.65]( 0.89) 1.00 [ 0.34]( 0.89)
512-clients 1.00 [ 0.00]( 8.76) 0.47 [-52.76]( 1.64) 0.77 [-23.10](10.06) (*)
768-clients 1.00 [ 0.00](34.29) 0.83 [-16.89](30.45) 0.98 [ -2.16](36.19)
1024-clients 1.00 [ 0.00](47.96) 0.91 [ -9.29](36.02) 0.98 [ -1.93](46.36)
==================================================================
Test : schbench
Units : Normalized 99th percentile latency in us
Interpretation: Lower is better
Statistic : Median
==================================================================
#workers: tip[pct imp](CV) sis-node[pct imp](CV) sis-node-w-sis-util[pct imp](CV)
1 1.00 [ -0.00](14.20) 1.72 [-72.00](15.01) 0.88 [ 12.00]( 4.55)
2 1.00 [ -0.00]( 1.68) 1.09 [ -8.82]( 6.96) 0.97 [ 2.94]( 9.90)
4 1.00 [ -0.00]( 4.45) 1.18 [-17.65]( 5.29) 1.03 [ -2.94]( 3.24)
8 1.00 [ -0.00]( 2.44) 1.12 [-12.20]( 4.35) 1.02 [ -2.44]( 2.38)
16 1.00 [ -0.00]( 0.00) 1.04 [ -3.64]( 1.75) 0.98 [ 1.82]( 1.85)
32 1.00 [ -0.00]( 2.87) 1.03 [ -2.53]( 2.80) 0.99 [ 1.27]( 1.47)
64 1.00 [ -0.00]( 3.17) 1.02 [ -1.57]( 5.72) 0.98 [ 2.36]( 2.30)
128 1.00 [ -0.00]( 2.95) 1.01 [ -1.35]( 3.03) 1.00 [ -0.00]( 1.13)
256 1.00 [ -0.00]( 1.17) 0.99 [ 1.23]( 1.75) 0.99 [ 1.43]( 1.56)
512 1.00 [ -0.00]( 4.54) 1.14 [-13.60]( 2.41) 0.97 [ 2.50]( 0.42)
768 1.00 [ -0.00]( 2.24) 1.27 [-27.44]( 3.18) 1.12 [-11.54]( 5.64) (*)
1024 1.00 [ -0.00]( 0.28) 1.14 [-14.20]( 0.56) 1.13 [-13.00]( 1.01) (*)
==================================================================
Test : new-schbench-requests-per-second
Units : Normalized Requests per second
Interpretation: Higher is better
Statistic : Median
==================================================================
#workers: tip[pct imp](CV) sis-node[pct imp](CV) sis-node-w-sis-util[pct imp](CV)
1 1.00 [ 0.00]( 0.00) 1.00 [ 0.00]( 0.00) 1.00 [ 0.00]( 0.15)
2 1.00 [ 0.00]( 0.00) 1.00 [ 0.00]( 0.00) 1.00 [ 0.00]( 0.15)
4 1.00 [ 0.00]( 0.00) 1.00 [ 0.00]( 0.00) 1.00 [ 0.29]( 0.15)
8 1.00 [ 0.00]( 0.00) 1.00 [ 0.00]( 0.00) 1.00 [ 0.29]( 0.00)
16 1.00 [ 0.00]( 0.15) 1.00 [ -0.29]( 0.15) 1.00 [ 0.00]( 0.00)
32 1.00 [ 0.00]( 0.15) 1.00 [ -0.29]( 0.00) 1.00 [ 0.00]( 0.15)
64 1.00 [ 0.00]( 0.00) 1.00 [ 0.00]( 0.00) 1.00 [ 0.29]( 0.00)
128 1.00 [ 0.00]( 0.27) 1.00 [ 0.00](18.48) 0.65 [-34.50](24.12) (#)
256 1.00 [ 0.00]( 0.00) 0.99 [ -0.58]( 0.00) 0.99 [ -0.58]( 0.00)
512 1.00 [ 0.00]( 1.05) 1.00 [ 0.00]( 0.20) 1.00 [ 0.39]( 0.87)
768 1.00 [ 0.00]( 0.95) 0.98 [ -1.88]( 0.93) 0.99 [ -0.71]( 0.53)
1024 1.00 [ 0.00]( 0.49) 0.99 [ -0.81]( 0.57) 1.00 [ 0.00]( 0.74)
==================================================================
Test : new-schbench-wakeup-latency
Units : Normalized 99th percentile latency in us
Interpretation: Lower is better
Statistic : Median
==================================================================
#workers: tip[pct imp](CV) sis-node[pct imp](CV) sis-node-w-sis-util[pct imp](CV)
1 1.00 [ -0.00]( 6.74) 2.38 [-137.50](29.34) 1.75 [-75.00]( 9.53)
2 1.00 [ -0.00](12.06) 1.27 [-27.27]( 9.53) 1.36 [-36.36]( 6.59)
4 1.00 [ -0.00](11.71) 1.33 [-33.33]( 3.30) 1.33 [-33.33]( 3.16)
8 1.00 [ -0.00]( 0.00) 1.27 [-27.27](12.69) 1.09 [ -9.09]( 4.43)
16 1.00 [ -0.00]( 4.84) 1.09 [ -9.09]( 4.43) 1.18 [-18.18](10.79)
32 1.00 [ -0.00]( 0.00) 1.00 [ -0.00]( 0.00) 1.10 [-10.00]( 4.56)
64 1.00 [ -0.00](13.22) 1.00 [ -0.00]( 5.00) 1.00 [ -0.00]( 9.68)
128 1.00 [ -0.00]( 8.13) 1.00 [ -0.00]( 8.85) 1.18 [-18.18](13.76)
256 1.00 [ -0.00]( 2.97) 1.02 [ -1.94]( 3.80) 1.08 [ -7.77]( 7.13)
512 1.00 [ -0.00]( 1.25) 1.00 [ 0.37]( 0.68) 1.00 [ -0.37]( 1.81)
768 1.00 [ -0.00]( 0.00) 1.00 [ -0.00]( 0.00) 1.00 [ -0.00]( 0.00)
1024 1.00 [ -0.00]( 0.63) 1.00 [ -0.11]( 4.06) 1.00 [ -0.11]( 3.13)
==================================================================
Test : new-schbench-request-latency
Units : Normalized 99th percentile latency in us
Interpretation: Lower is better
Statistic : Median
==================================================================
#workers: tip[pct imp](CV) sis-node[pct imp](CV) sis-node-w-sis-util[pct imp](CV)
1 1.00 [ -0.00]( 0.14) 1.00 [ -0.26]( 0.14) 1.00 [ -0.00]( 0.14)
2 1.00 [ -0.00]( 0.14) 1.00 [ -0.26]( 0.00) 1.00 [ -0.00]( 0.14)
4 1.00 [ -0.00]( 0.00) 1.00 [ -0.00]( 0.00) 1.00 [ 0.26]( 0.14)
8 1.00 [ -0.00]( 0.00) 1.00 [ -0.00]( 0.00) 1.00 [ 0.26]( 0.14)
16 1.00 [ -0.00]( 0.00) 1.00 [ -0.00]( 0.00) 1.01 [ -0.53]( 1.18)
32 1.00 [ -0.00]( 0.54) 1.01 [ -1.05]( 0.59) 0.99 [ 0.53]( 0.27)
64 1.00 [ -0.00]( 0.00) 1.00 [ 0.26]( 1.08) 1.00 [ 0.26](31.75)
128 1.00 [ -0.00]( 0.61) 1.00 [ -0.00]( 4.19) 1.10 [-10.22]( 4.79) (#)
256 1.00 [ -0.00]( 0.43) 1.01 [ -1.39]( 0.74) 1.02 [ -1.63]( 0.66)
512 1.00 [ -0.00]( 3.32) 1.00 [ 0.23]( 1.62) 1.04 [ -3.72]( 3.79)
768 1.00 [ -0.00]( 0.88) 0.95 [ 4.52]( 0.63) 0.98 [ 1.94]( 0.54)
1024 1.00 [ -0.00]( 1.01) 0.98 [ 1.54]( 0.91) 1.00 [ 0.17]( 0.31)
Let me go play around with imbalance_pct for SIS_UTIL at the PKG/NODE
domain to see if there is a sweet spot that keeps everything happy, or
at least happier on average.
I doubt if Meta's workload will be happy with more aggressive SIS_UTIL
limits since data from David's SHARED_RUNQ series [4] showed that
specific workload requires aggressive search + aggressive newidle balance.
References:
[1] https://github.com/kudureranganath/linux/commits/kudure/sched/sis_node/
[2] https://lore.kernel.org/all/3de5c24f-6437-f21b-ed61-76b86a199e8c@amd.com/
[3] https://github.com/kudureranganath/linux/commit/7639cf7632853b91e6a5b449eee08d3399b10d31
[4] https://lore.kernel.org/lkml/20230809221218.163894-1-void@manifault.com/
--
Thanks and Regards,
Prateek
On 11/7/25 9:36 PM, Peter Zijlstra wrote:
> Add a randomized algorithm that runs newidle balancing proportional to
> its success rate.
>
> This improves schbench significantly:
>
> 6.18-rc4: 2.22 Mrps/s
> 6.18-rc4+revert: 2.04 Mrps/s
> 6.18-rc4+revert+random: 2.18 Mrps/S
>
Could you please share the schbench command?
I see a command like "schbench -t 90 -r 30 -i 30" regress when running
on 60 cores. Will do more iterations to confirm (to be sure it is not
run/run variation).
> Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
>
> 6.17: -6%
> 6.17+revert: 0%
> 6.17+revert+random: -1%
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> include/linux/sched/topology.h | 3 ++
> kernel/sched/core.c | 3 ++
> kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++----
> kernel/sched/features.h | 5 ++++
> kernel/sched/sched.h | 7 ++++++
> kernel/sched/topology.c | 6 +++++
> 6 files changed, 63 insertions(+), 4 deletions(-)
>
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -92,6 +92,9 @@ struct sched_domain {
> unsigned int nr_balance_failed; /* initialise to 0 */
>
> /* idle_balance() stats */
> + unsigned int newidle_call;
> + unsigned int newidle_success;
> + unsigned int newidle_ratio;
> u64 max_newidle_lb_cost;
> unsigned long last_decay_max_lb_cost;
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_updat
> EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
>
> DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
>
> #ifdef CONFIG_SCHED_PROXY_EXEC
> DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
> @@ -8589,6 +8590,8 @@ void __init sched_init_smp(void)
> {
> sched_init_numa(NUMA_NO_NODE);
>
> + prandom_init_once(&sched_rnd_state);
> +
> /*
> * There's no userspace yet to cause hotplug operations; hence all the
> * CPU masks are stable and all blatant races in the below code cannot
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12146,11 +12146,26 @@ void update_max_interval(void)
> max_load_balance_interval = HZ*num_online_cpus()/10;
> }
>
> -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
> +static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
> +{
> + sd->newidle_call++;
> + sd->newidle_success += success;
> +
> + if (sd->newidle_call >= 1024) {
> + sd->newidle_ratio = sd->newidle_success;
> + sd->newidle_call /= 2;
> + sd->newidle_success /= 2;
> + }
Would it be better to use >> 1? Or does the compiler take care of it?
> +}
> +
> +static inline bool
> +update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
> {
> unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
> unsigned long now = jiffies;
>
> + update_newidle_stats(sd, success);
> +
> if (cost > sd->max_newidle_lb_cost) {
> /*
> * Track max cost of a domain to make sure to not delay the
> @@ -12198,7 +12213,7 @@ static void sched_balance_domains(struct
> * Decay the newidle max times here because this is a regular
> * visit to all the domains.
> */
> - need_decay = update_newidle_cost(sd, 0);
> + need_decay = update_newidle_cost(sd, 0, 0);
> max_cost += sd->max_newidle_lb_cost;
>
> /*
> @@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
> break;
>
> if (sd->flags & SD_BALANCE_NEWIDLE) {
> + unsigned int weight = 1;
> +
> + if (sched_feat(NI_RANDOM)) {
> + /*
> + * Throw a 1k sided dice; and only run
> + * newidle_balance according to the success
> + * rate.
> + */
> + u32 d1k = sched_rng() % 1024;
> + weight = 1 + sd->newidle_ratio;
> + if (d1k > weight) {
> + update_newidle_stats(sd, 0);
> + continue;
> + }
> + weight = (1024 + weight/2) / weight;
> + }
>
> pulled_task = sched_balance_rq(this_cpu, this_rq,
> sd, CPU_NEWLY_IDLE,
> @@ -12850,10 +12881,14 @@ static int sched_balance_newidle(struct
>
> t1 = sched_clock_cpu(this_cpu);
> domain_cost = t1 - t0;
> - update_newidle_cost(sd, domain_cost);
> -
> curr_cost += domain_cost;
> t0 = t1;
> +
> + /*
> + * Track max cost of a domain to make sure to not delay the
> + * next wakeup on the CPU.
> + */
> + update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
> }
>
> /*
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
> SCHED_FEAT(UTIL_EST, true)
>
> SCHED_FEAT(LATENCY_WARN, false)
> +
> +/*
> + * Do newidle balancing proportional to its success rate using randomization.
> + */
> +SCHED_FEAT(NI_RANDOM, true)
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -5,6 +5,7 @@
> #ifndef _KERNEL_SCHED_SCHED_H
> #define _KERNEL_SCHED_SCHED_H
>
> +#include <linux/prandom.h>
> #include <linux/sched/affinity.h>
> #include <linux/sched/autogroup.h>
> #include <linux/sched/cpufreq.h>
> @@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled
> }
>
> DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
> +
> +static inline u32 sched_rng(void)
> +{
> + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
> +}
>
> #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
> #define this_rq() this_cpu_ptr(&runqueues)
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1662,6 +1662,12 @@ sd_init(struct sched_domain_topology_lev
>
> .last_balance = jiffies,
> .balance_interval = sd_weight,
> +
> + /* 50% success rate */
> + .newidle_call = 512,
> + .newidle_success = 256,
> + .newidle_ratio = 512,
> +
> .max_newidle_lb_cost = 0,
> .last_decay_max_lb_cost = jiffies,
> .child = child,
>
>
Ran hackbench with it. Looks like hackbench does better when utilization
is very high; otherwise, it regresses slightly.
I compared the series applied vs. base commit 65177ea9f64d. Let me know
if I need to set anything different.
Will do numbers with more loops/iterations to iron out any run/run
variation.
On Wed, Nov 12, 2025 at 09:12:57PM +0530, Shrikanth Hegde wrote:
>
>
> On 11/7/25 9:36 PM, Peter Zijlstra wrote:
> > Add a randomized algorithm that runs newidle balancing proportional to
> > its success rate.
> >
> > This improves schbench significantly:
> >
> > 6.18-rc4: 2.22 Mrps/s
> > 6.18-rc4+revert: 2.04 Mrps/s
> > 6.18-rc4+revert+random: 2.18 Mrps/S
> >
>
> Could you please share the schbench command?
>
> I see command like "schbench -t 90 -r 30 -i 30" running on 60 core regress.
> Will do more iterations to confirm it (to be sure it is not run/run variation)
This was:
schbench -L -m 4 -M auto -t 256 -n 0 -r 60 -s 0
from the original thread:
https://lkml.kernel.org/r/20250626144017.1510594-2-clm@fb.com
> > + if (sd->newidle_call >= 1024) {
> > + sd->newidle_ratio = sd->newidle_success;
> > + sd->newidle_call /= 2;
> > + sd->newidle_success /= 2;
> > + }
>
> Would it be better to use >> 1? Or does the compiler take care of it?
I would be very disappointed if our compilers don't do this.
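For the record (my quick check, not from the thread): for unsigned
operands both GCC and Clang turn a divide by two into a logical shift
right, e.g.:

	static inline unsigned int half(unsigned int x)
	{
		return x / 2;	/* shr on x86 / lsr on arm64 at -O1 and up */
	}

The >> 1 spelling only matters for signed operands, where /2 needs a
round-toward-zero fixup and is no longer a plain shift; newidle_call is
an unsigned int, so the two are equivalent here.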
On 11/8/2025 12:06 AM, Peter Zijlstra wrote:
> Add a randomized algorithm that runs newidle balancing proportional to
> its success rate.
>
> This improves schbench significantly:
>
> 6.18-rc4: 2.22 Mrps/s
> 6.18-rc4+revert: 2.04 Mrps/s
> 6.18-rc4+revert+random: 2.18 Mrps/S
>
> Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
>
> 6.17: -6%
> 6.17+revert: 0%
> 6.17+revert+random: -1%
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Adam Li <adamli@os.amperecomputing.com>
Please see the SpecJBB test result on the AmpereOne server below:
6.18-rc5: 0% (baseline)
6.18-rc5+patchset: +5%
6.18-rc4+patchset+NO_NI_RANDOM: +6%
6.18-rc5+revert-155213a2aed4: +6%
Could you please explain a little the math behind the success rate
(sd->newidle_ratio) calculation?
[...]
> @@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
> break;
>
> if (sd->flags & SD_BALANCE_NEWIDLE) {
> + unsigned int weight = 1;
> +
> + if (sched_feat(NI_RANDOM)) {
> + /*
> + * Throw a 1k sided dice; and only run
> + * newidle_balance according to the success
> + * rate.
> + */
> + u32 d1k = sched_rng() % 1024;
> + weight = 1 + sd->newidle_ratio;
> + if (d1k > weight) {
> + update_newidle_stats(sd, 0);
> + continue;
> + }
> + weight = (1024 + weight/2) / weight;
> + }
>
e.g: Why 'weight = (1024 + weight/2) / weight'
Thanks,
-adam
On Tue, Nov 11, 2025 at 05:07:45PM +0800, Adam Li wrote:
> > @@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
> > break;
> >
> > if (sd->flags & SD_BALANCE_NEWIDLE) {
> > + unsigned int weight = 1;
> > +
> > + if (sched_feat(NI_RANDOM)) {
> > + /*
> > + * Throw a 1k sided dice; and only run
> > + * newidle_balance according to the success
> > + * rate.
> > + */
> > + u32 d1k = sched_rng() % 1024;
> > + weight = 1 + sd->newidle_ratio;
> > + if (d1k > weight) {
> > + update_newidle_stats(sd, 0);
> > + continue;
> > + }
> > + weight = (1024 + weight/2) / weight;
> > + }
> >
> e.g: Why 'weight = (1024 + weight/2) / weight'
Not sure what you're asking, so two answers:
That's a rounding divide. We have a helper for that, but I never can
remember what its called.
The transformation as a whole here is from a ratio to a weight, suppose
our ratio is 256, this means that we do 1-in-4 or 25% of the balance
calls. However this also means that each success needs to be weighted as
4 (=1024/256), otherwise we under-account the successes and not even a
100% success rate can lift you out the hole.
Now, I made it a rounding divide to make it a little easier to climb out
of said hole (I even considered ceiling divide).
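To make that concrete, a minimal userspace sketch of the mapping
(illustrative only; the kernel helper alluded to above is, I believe,
DIV_ROUND_CLOSEST()):

	#include <stdio.h>

	int main(void)
	{
		unsigned int ratio  = 256;                    /* run 1-in-4 balance calls */
		unsigned int weight = 1 + ratio;              /* 257 */
		unsigned int w = (1024 + weight/2) / weight;  /* rounding divide: 4 */

		printf("ratio %u -> success weight %u\n", ratio, w);
		return 0;
	}

So a success observed while running only ~25% of the calls is accounted
as 4 successes, keeping the expected success count unbiased against the
skipped calls.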
On 11/11/2025 5:20 PM, Peter Zijlstra wrote:
> On Tue, Nov 11, 2025 at 05:07:45PM +0800, Adam Li wrote:
>>> @@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
>>> break;
>>>
>>> if (sd->flags & SD_BALANCE_NEWIDLE) {
>>> + unsigned int weight = 1;
>>> +
>>> + if (sched_feat(NI_RANDOM)) {
>>> + /*
>>> + * Throw a 1k sided dice; and only run
>>> + * newidle_balance according to the success
>>> + * rate.
>>> + */
>>> + u32 d1k = sched_rng() % 1024;
>>> + weight = 1 + sd->newidle_ratio;
>>> + if (d1k > weight) {
>>> + update_newidle_stats(sd, 0);
>>> + continue;
>>> + }
>>> + weight = (1024 + weight/2) / weight;
>>> + }
>>>
>> e.g: Why 'weight = (1024 + weight/2) / weight'
>
> Not sure what you're asking, so two answers:
>
> That's a rounding divide. We have a helper for that, but I never can
> remember what its called.
>
> The transformation as a whole here is from a ratio to a weight, suppose
> our ratio is 256, this means that we do 1-in-4 or 25% of the balance
> calls. However this also means that each success needs to be weighted as
> 4 (=1024/256), otherwise we under-account the successes and not even a
> 100% success rate can lift you out the hole.
>
> Now, I made it a rounding divide to make it a little easier to climb out
> of said hole (I even considered ceiling divide).
>
>
Thanks for the clarification.
If I understand correctly, (sd->newidle_ratio / 1024) is close to
(sd->newidle_success / sd->newidle_call). 'sd->newidle_ratio' means
success rate of newidle balance.
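A quick sanity check of that reading (my arithmetic, assuming a steady
per-call success probability p and a success weight of ~1): both
counters are halved at every fold, so in steady state the success count
S satisfies

	S = S/2 + 512*p  =>  S = 1024*p

and newidle_ratio is set to S, i.e. newidle_ratio/1024 converges to the
success rate, with older windows decaying geometrically.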
Shall we update newidle stats only from sched_balance_newidle(),
as in the patch below? So that sched_balance_domains() will not update
sd->newidle_call.
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12171,7 +12171,8 @@ update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
unsigned long now = jiffies;
- update_newidle_stats(sd, success);
+ if (cost)
+ update_newidle_stats(sd, success);
if (cost > sd->max_newidle_lb_cost) {
/*
I tested this change, Specjbb performance is similar with your patch.
Thanks,
-adam
On Wed, Nov 12, 2025 at 08:04:05PM +0800, Adam Li wrote:
> On 11/11/2025 5:20 PM, Peter Zijlstra wrote:
> > On Tue, Nov 11, 2025 at 05:07:45PM +0800, Adam Li wrote:
> >>> @@ -12843,6 +12858,22 @@ static int sched_balance_newidle(struct
> >>> break;
> >>>
> >>> if (sd->flags & SD_BALANCE_NEWIDLE) {
> >>> + unsigned int weight = 1;
> >>> +
> >>> + if (sched_feat(NI_RANDOM)) {
> >>> + /*
> >>> + * Throw a 1k sided dice; and only run
> >>> + * newidle_balance according to the success
> >>> + * rate.
> >>> + */
> >>> + u32 d1k = sched_rng() % 1024;
> >>> + weight = 1 + sd->newidle_ratio;
> >>> + if (d1k > weight) {
> >>> + update_newidle_stats(sd, 0);
> >>> + continue;
> >>> + }
> >>> + weight = (1024 + weight/2) / weight;
> >>> + }
> >>>
> >> e.g: Why 'weight = (1024 + weight/2) / weight'
> >
> > Not sure what you're asking, so two answers:
> >
> > That's a rounding divide. We have a helper for that, but I never can
> > remember what its called.
> >
> > The transformation as a whole here is from a ratio to a weight, suppose
> > our ratio is 256, this means that we do 1-in-4 or 25% of the balance
> > calls. However this also means that each success needs to be weighted as
> > 4 (=1024/256), otherwise we under-account the successes and not even a
> > 100% success rate can lift you out the hole.
> >
> > Now, I made it a rounding divide to make it a little easier to climb out
> > of said hole (I even considered ceiling divide).
> >
> >
> Thanks for clarification.
>
> If I understand correctly, (sd->newidle_ratio / 1024) is close to
> (sd->newidle_success / sd->newidle_call). 'sd->newidle_ratio' means
> success rate of newidle balance.
>
> Shall we update newidle stats only from sched_balance_newidle(),
> as in the patch below? So that sched_balance_domains() will not update
> sd->newidle_call.
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -12171,7 +12171,8 @@ update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
> unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
> unsigned long now = jiffies;
>
> - update_newidle_stats(sd, success);
> + if (cost)
> + update_newidle_stats(sd, success);
>
> if (cost > sd->max_newidle_lb_cost) {
> /*
>
> I tested this change, Specjbb performance is similar with your patch.
Ah yes, that makes sense. Let me make that change.
Thanks!
On 07.11.25 17:06, Peter Zijlstra wrote:
> Add a randomized algorithm that runs newidle balancing proportional to
> its success rate.
>
> This improves schbench significantly:
>
> 6.18-rc4: 2.22 Mrps/s
> 6.18-rc4+revert: 2.04 Mrps/s
> 6.18-rc4+revert+random: 2.18 Mrps/S
>
> Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
>
> 6.17: -6%
> 6.17+revert: 0%
> 6.17+revert+random: -1%
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Results with OLTP 'hammerdb - mysqld' on Arm64 VMs:

                                  NOPM      P50 latency
6.18-rc4                          baseline  baseline
6.18-rc4+revert-155213a2aed4      +13%      -8.8%
6.18-rc4+patchset                 +11%      -8.2%
6.18-rc4+patchset+NO_NI_RANDOM    +13%      -8.6%

Pretty consistent with the results on the previous version. Although I
hadn't tested NI_TARGET+NI_RANDOM back then.

http://lkml.kernel.org/r/f6379aa6-459d-4205-96ea-9848e55d7f9c@arm.com

In case (pure wakeup) schbench configs are the only workloads profiting
from NI_RANDOM, make NO_NI_RANDOM the default?

Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>

[...]
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 33cf66d88306663d16e4759e9d24766b0aaa2e17
Gitweb: https://git.kernel.org/tip/33cf66d88306663d16e4759e9d24766b0aaa2e17
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 07 Nov 2025 17:01:31 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 17 Nov 2025 17:13:16 +01:00
sched/fair: Proportional newidle balance
Add a randomized algorithm that runs newidle balancing proportional to
its success rate.
This improves schbench significantly:
6.18-rc4: 2.22 Mrps/s
6.18-rc4+revert: 2.04 Mrps/s
6.18-rc4+revert+random: 2.18 Mrps/S
Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
6.17: -6%
6.17+revert: 0%
6.17+revert+random: -1%
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
---
include/linux/sched/topology.h | 3 ++-
kernel/sched/core.c | 3 ++-
kernel/sched/fair.c | 44 ++++++++++++++++++++++++++++++---
kernel/sched/features.h | 5 ++++-
kernel/sched/sched.h | 7 +++++-
kernel/sched/topology.c | 6 +++++-
6 files changed, 64 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index bbcfdf1..45c0022 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -92,6 +92,9 @@ struct sched_domain {
unsigned int nr_balance_failed; /* initialise to 0 */
/* idle_balance() stats */
+ unsigned int newidle_call;
+ unsigned int newidle_success;
+ unsigned int newidle_ratio;
u64 max_newidle_lb_cost;
unsigned long last_decay_max_lb_cost;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 699db3f..9f10cfb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
#ifdef CONFIG_SCHED_PROXY_EXEC
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8489,6 +8490,8 @@ void __init sched_init_smp(void)
{
sched_init_numa(NUMA_NO_NODE);
+ prandom_init_once(&sched_rnd_state);
+
/*
* There's no userspace yet to cause hotplug operations; hence all the
* CPU masks are stable and all blatant races in the below code cannot
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index abcbb67..1855975 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12224,11 +12224,27 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+ sd->newidle_call++;
+ sd->newidle_success += success;
+
+ if (sd->newidle_call >= 1024) {
+ sd->newidle_ratio = sd->newidle_success;
+ sd->newidle_call /= 2;
+ sd->newidle_success /= 2;
+ }
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
{
unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
unsigned long now = jiffies;
+ if (cost)
+ update_newidle_stats(sd, success);
+
if (cost > sd->max_newidle_lb_cost) {
/*
* Track max cost of a domain to make sure to not delay the
@@ -12276,7 +12292,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* Decay the newidle max times here because this is a regular
* visit to all the domains.
*/
- need_decay = update_newidle_cost(sd, 0);
+ need_decay = update_newidle_cost(sd, 0, 0);
max_cost += sd->max_newidle_lb_cost;
/*
@@ -12912,6 +12928,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
break;
if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned int weight = 1;
+
+ if (sched_feat(NI_RANDOM)) {
+ /*
+ * Throw a 1k sided dice; and only run
+ * newidle_balance according to the success
+ * rate.
+ */
+ u32 d1k = sched_rng() % 1024;
+ weight = 1 + sd->newidle_ratio;
+ if (d1k > weight) {
+ update_newidle_stats(sd, 0);
+ continue;
+ }
+ weight = (1024 + weight/2) / weight;
+ }
pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
@@ -12919,10 +12951,14 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
- update_newidle_cost(sd, domain_cost);
-
curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
}
/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 0607def..980d92b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(LATENCY_WARN, false)
+
+/*
+ * Do newidle balancing proportional to its success rate using randomization.
+ */
+SCHED_FEAT(NI_RANDOM, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index def9ab7..b419a4d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
#ifndef _KERNEL_SCHED_SCHED_H
#define _KERNEL_SCHED_SCHED_H
+#include <linux/prandom.h>
#include <linux/sched/affinity.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/cpufreq.h>
@@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p)
}
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+ return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() this_cpu_ptr(&runqueues)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 711076a..cf643a5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1669,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
+
+ /* 50% success rate */
+ .newidle_call = 512,
+ .newidle_success = 256,
+ .newidle_ratio = 512,
+
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 7c983640e4db0c1fd8ce6c6cd921c19954a8d479
Gitweb: https://git.kernel.org/tip/7c983640e4db0c1fd8ce6c6cd921c19954a8d479
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 07 Nov 2025 17:01:31 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 14 Nov 2025 13:03:08 +01:00
sched/fair: Proportional newidle balance
Add a randomized algorithm that runs newidle balancing proportional to
its success rate.
This improves schbench significantly:
6.18-rc4: 2.22 Mrps/s
6.18-rc4+revert: 2.04 Mrps/s
6.18-rc4+revert+random: 2.18 Mrps/S
Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:
6.17: -6%
6.17+revert: 0%
6.17+revert+random: -1%
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
---
include/linux/sched/topology.h | 3 +++
kernel/sched/core.c | 3 +++
kernel/sched/fair.c | 44 ++++++++++++++++++++++++++++++++++++++++----
kernel/sched/features.h | 5 +++++
kernel/sched/sched.h | 7 +++++++
kernel/sched/topology.c | 6 ++++++
6 files changed, 64 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index bbcfdf1..45c0022 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -92,6 +92,9 @@ struct sched_domain {
unsigned int nr_balance_failed; /* initialise to 0 */
/* idle_balance() stats */
+ unsigned int newidle_call;
+ unsigned int newidle_success;
+ unsigned int newidle_ratio;
u64 max_newidle_lb_cost;
unsigned long last_decay_max_lb_cost;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 699db3f..9f10cfb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
#ifdef CONFIG_SCHED_PROXY_EXEC
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8489,6 +8490,8 @@ void __init sched_init_smp(void)
{
sched_init_numa(NUMA_NO_NODE);
+ prandom_init_once(&sched_rnd_state);
+
/*
* There's no userspace yet to cause hotplug operations; hence all the
* CPU masks are stable and all blatant races in the below code cannot
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 50461c9..aaa47ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12223,11 +12223,27 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+ sd->newidle_call++;
+ sd->newidle_success += success;
+
+ if (sd->newidle_call >= 1024) {
+ sd->newidle_ratio = sd->newidle_success;
+ sd->newidle_call /= 2;
+ sd->newidle_success /= 2;
+ }
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
{
unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
unsigned long now = jiffies;
+ if (cost)
+ update_newidle_stats(sd, success);
+
if (cost > sd->max_newidle_lb_cost) {
/*
* Track max cost of a domain to make sure to not delay the
@@ -12275,7 +12291,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* Decay the newidle max times here because this is a regular
* visit to all the domains.
*/
- need_decay = update_newidle_cost(sd, 0);
+ need_decay = update_newidle_cost(sd, 0, 0);
max_cost += sd->max_newidle_lb_cost;
/*
@@ -12911,6 +12927,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
break;
if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned int weight = 1;
+
+ if (sched_feat(NI_RANDOM)) {
+ /*
+ * Throw a 1k-sided die; only run
+ * newidle_balance according to the success
+ * rate.
+ */
+ u32 d1k = sched_rng() % 1024;
+ weight = 1 + sd->newidle_ratio;
+ if (d1k > weight) {
+ update_newidle_stats(sd, 0);
+ continue;
+ }
+ weight = (1024 + weight/2) / weight;
+ }
pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
@@ -12918,10 +12950,14 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
- update_newidle_cost(sd, domain_cost);
-
curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
}
/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 0607def..980d92b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(LATENCY_WARN, false)
+
+/*
+ * Do newidle balancing proportional to its success rate using randomization.
+ */
+SCHED_FEAT(NI_RANDOM, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index def9ab7..b419a4d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
#ifndef _KERNEL_SCHED_SCHED_H
#define _KERNEL_SCHED_SCHED_H
+#include <linux/prandom.h>
#include <linux/sched/affinity.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/cpufreq.h>
@@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p)
}
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+ return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() this_cpu_ptr(&runqueues)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 711076a..cf643a5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1669,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
+
+ /* 50% success rate */
+ .newidle_call = 512,
+ .newidle_success = 256,
+ .newidle_ratio = 512,
+
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
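Worth noting in the merged version: update_newidle_cost() only feeds
update_newidle_stats() when cost is non-zero, so the periodic decay
calls from sched_balance_domains() (which pass cost == 0) leave the
call/success counters untouched; skipped dice rolls are instead
recorded directly via update_newidle_stats(sd, 0).

The sd_init() seed values also make the behaviour easy to check by
hand; a throwaway user-space sketch, assuming those defaults:

#include <stdio.h>

int main(void)
{
	/* sd_init() seeds every domain at a 50% success rate. */
	unsigned int ratio  = 512;
	unsigned int weight = 1 + ratio;		/* 513 */

	/* d1k = sched_rng() % 1024; balance runs iff d1k <= weight. */
	printf("run probability: %u/1024 (~%.1f%%)\n",
	       weight + 1, 100.0 * (weight + 1) / 1024);

	/* A pull is credited ~1024/weight, rounded to nearest. */
	printf("success credit:  %u\n", (1024 + weight / 2) / weight);
	return 0;
}

At the seeded 50% ratio the balance pass runs roughly half the time and
each observed pull is credited twice, keeping the expected per-call
credit equal to the unsampled success rate.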