When a CPU is about to enter idle, it invokes newidle_balance()
to pull some tasks from other runqueues. Although there is a
per-domain max_newidle_lb_cost to throttle newidle_balance(), it
would be good to further limit the scan based on overall system
utilization. The reason is that nothing prevents multiple CPUs
from launching newidle_balance() simultaneously. Since each
newidle_balance() has to traverse all the groups to calculate
the statistics one by one, the total time spent in
newidle_balance() could be O(n^2), where n is the number of
groups. This issue is more severe if there are many groups
within one domain, for example, on a system with a large number
of cores in an LLC domain. This is not good for performance or
power saving.
sqlite spends quite some time in newidle_balance() on Intel
Sapphire Rapids, which has 2 x 56C/112T = 224 CPUs:
6.69% 0.09% sqlite3 [kernel.kallsyms] [k] newidle_balance
5.39% 4.71% sqlite3 [kernel.kallsyms] [k] update_sd_lb_stats
Based on this observation, limit the scan depth of newidle_balance()
by considering the utilization of the sched domain. Let the number of
scanned groups be a linear function of the utilization ratio:
nr_groups_to_scan = nr_groups * (1 - util_ratio)
Suggested-by: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
include/linux/sched/topology.h | 1 +
kernel/sched/fair.c | 30 ++++++++++++++++++++++++++++++
kernel/sched/features.h | 1 +
3 files changed, 32 insertions(+)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d6a64a2c92aa..af2261308529 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -84,6 +84,7 @@ struct sched_domain_shared {
int nr_idle_scan;
unsigned long total_load;
unsigned long total_capacity;
+ int nr_sg_scan;
};
struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index edcfee9965cd..6925813db59b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10153,6 +10153,35 @@ static void ilb_save_stats(struct lb_env *env,
WRITE_ONCE(sd_share->total_capacity, sds->total_capacity);
}
+static void update_ilb_group_scan(struct lb_env *env,
+ unsigned long sum_util,
+ struct sched_domain_shared *sd_share)
+{
+ u64 tmp, nr_scan;
+
+ if (!sched_feat(ILB_UTIL))
+ return;
+
+ if (!sd_share)
+ return;
+
+ if (env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ /*
+ * Limit the newidle balance scan depth based on overall system
+ * utilization:
+ * nr_groups_scan = nr_groups * (1 - util_ratio)
+ * and util_ratio = sum_util / (sd_weight * SCHED_CAPACITY_SCALE)
+ */
+ nr_scan = env->sd->nr_groups * sum_util;
+ tmp = env->sd->span_weight * SCHED_CAPACITY_SCALE;
+ do_div(nr_scan, tmp);
+ nr_scan = env->sd->nr_groups - nr_scan;
+ if ((int)nr_scan != sd_share->nr_sg_scan)
+ WRITE_ONCE(sd_share->nr_sg_scan, (int)nr_scan);
+}
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -10231,6 +10260,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
update_idle_cpu_scan(env, sum_util);
+ update_ilb_group_scan(env, sum_util, sd_share);
/* save a snapshot of stats during periodic load balance */
ilb_save_stats(env, sd_share, sds);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 3cb71c8cddc0..30f6d1a2f235 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -103,3 +103,4 @@ SCHED_FEAT(ALT_PERIOD, true)
SCHED_FEAT(BASE_SLICE, true)
SCHED_FEAT(ILB_SNAPSHOT, true)
+SCHED_FEAT(ILB_UTIL, true)
--
2.25.1
On 7/27/23 8:05 PM, Chen Yu wrote:
> When the CPU is about to enter idle, it invokes newidle_balance()
> to pull some tasks from other runqueues. Although there is per
> domain max_newidle_lb_cost to throttle the newidle_balance(), it
> would be good to further limit the scan based on overall system
> utilization. The reason is that there is no limitation for
> newidle_balance() to launch this balance simultaneously on
> multiple CPUs. Since each newidle_balance() has to traverse all
> the groups to calculate the statistics one by one, this total
> time cost on newidle_balance() could be O(n^2). n is the number
> of groups. This issue is more severe if there are many groups
> within 1 domain, for example, a system with a large number of
> Cores in a LLC domain. This is not good for performance or
> power saving.
>
> sqlite has spent quite some time on newidle balance() on Intel
> Sapphire Rapids, which has 2 x 56C/112T = 224 CPUs:
> 6.69% 0.09% sqlite3 [kernel.kallsyms] [k] newidle_balance
> 5.39% 4.71% sqlite3 [kernel.kallsyms] [k] update_sd_lb_stats
>
> Based on this observation, limit the scan depth of newidle_balance()
> by considering the utilization of the sched domain. Let the number of
> scanned groups be a linear function of the utilization ratio:
>
> nr_groups_to_scan = nr_groups * (1 - util_ratio)
>
> Suggested-by: Tim Chen <tim.c.chen@intel.com>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
> include/linux/sched/topology.h | 1 +
> kernel/sched/fair.c | 30 ++++++++++++++++++++++++++++++
> kernel/sched/features.h | 1 +
> 3 files changed, 32 insertions(+)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index d6a64a2c92aa..af2261308529 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -84,6 +84,7 @@ struct sched_domain_shared {
> int nr_idle_scan;
> unsigned long total_load;
> unsigned long total_capacity;
> + int nr_sg_scan;
> };
>
> struct sched_domain {
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index edcfee9965cd..6925813db59b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10153,6 +10153,35 @@ static void ilb_save_stats(struct lb_env *env,
> WRITE_ONCE(sd_share->total_capacity, sds->total_capacity);
> }
>
> +static void update_ilb_group_scan(struct lb_env *env,
> + unsigned long sum_util,
> + struct sched_domain_shared *sd_share)
> +{
> + u64 tmp, nr_scan;
> +
> + if (!sched_feat(ILB_UTIL))
> + return;
> +
> + if (!sd_share)
> + return;
> +
> + if (env->idle == CPU_NEWLY_IDLE)
> + return;
Suggestion for a small improvement:
The first if condition here could be the check for newidle. As that case occurs very often, we could save a few cycles of checking
the sched feature.
> + if (env->idle == CPU_NEWLY_IDLE)
> + return;
> +
> + /*
> + * Limit the newidle balance scan depth based on overall system
> + * utilization:
> + * nr_groups_scan = nr_groups * (1 - util_ratio)
> + * and util_ratio = sum_util / (sd_weight * SCHED_CAPACITY_SCALE)
> + */
> + nr_scan = env->sd->nr_groups * sum_util;
> + tmp = env->sd->span_weight * SCHED_CAPACITY_SCALE;
> + do_div(nr_scan, tmp);
> + nr_scan = env->sd->nr_groups - nr_scan;
> + if ((int)nr_scan != sd_share->nr_sg_scan)
> + WRITE_ONCE(sd_share->nr_sg_scan, (int)nr_scan);
> +}
> +
> /**
> * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
> * @env: The load balancing environment.
> @@ -10231,6 +10260,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> }
>
> update_idle_cpu_scan(env, sum_util);
> + update_ilb_group_scan(env, sum_util, sd_share);
>
> /* save a snapshot of stats during periodic load balance */
> ilb_save_stats(env, sd_share, sds);
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 3cb71c8cddc0..30f6d1a2f235 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -103,3 +103,4 @@ SCHED_FEAT(ALT_PERIOD, true)
> SCHED_FEAT(BASE_SLICE, true)
>
> SCHED_FEAT(ILB_SNAPSHOT, true)
> +SCHED_FEAT(ILB_UTIL, true)
On 2023-08-25 at 11:32:01 +0530, Shrikanth Hegde wrote:
>
>
> On 7/27/23 8:05 PM, Chen Yu wrote:
> > When the CPU is about to enter idle, it invokes newidle_balance()
> > to pull some tasks from other runqueues. Although there is per
> > domain max_newidle_lb_cost to throttle the newidle_balance(), it
> > would be good to further limit the scan based on overall system
> > utilization. The reason is that there is no limitation for
> > newidle_balance() to launch this balance simultaneously on
> > multiple CPUs. Since each newidle_balance() has to traverse all
> > the groups to calculate the statistics one by one, this total
> > time cost on newidle_balance() could be O(n^2). n is the number
> > of groups. This issue is more severe if there are many groups
> > within 1 domain, for example, a system with a large number of
> > Cores in a LLC domain. This is not good for performance or
> > power saving.
> >
> > sqlite has spent quite some time on newidle balance() on Intel
> > Sapphire Rapids, which has 2 x 56C/112T = 224 CPUs:
> > 6.69% 0.09% sqlite3 [kernel.kallsyms] [k] newidle_balance
> > 5.39% 4.71% sqlite3 [kernel.kallsyms] [k] update_sd_lb_stats
> >
> > Based on this observation, limit the scan depth of newidle_balance()
> > by considering the utilization of the sched domain. Let the number of
> > scanned groups be a linear function of the utilization ratio:
> >
> > nr_groups_to_scan = nr_groups * (1 - util_ratio)
> >
> > Suggested-by: Tim Chen <tim.c.chen@intel.com>
> > Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> > ---
> > include/linux/sched/topology.h | 1 +
> > kernel/sched/fair.c | 30 ++++++++++++++++++++++++++++++
> > kernel/sched/features.h | 1 +
> > 3 files changed, 32 insertions(+)
> >
> > diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> > index d6a64a2c92aa..af2261308529 100644
> > --- a/include/linux/sched/topology.h
> > +++ b/include/linux/sched/topology.h
> > @@ -84,6 +84,7 @@ struct sched_domain_shared {
> > int nr_idle_scan;
> > unsigned long total_load;
> > unsigned long total_capacity;
> > + int nr_sg_scan;
> > };
> >
> > struct sched_domain {
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index edcfee9965cd..6925813db59b 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -10153,6 +10153,35 @@ static void ilb_save_stats(struct lb_env *env,
> > WRITE_ONCE(sd_share->total_capacity, sds->total_capacity);
> > }
> >
> > +static void update_ilb_group_scan(struct lb_env *env,
> > + unsigned long sum_util,
> > + struct sched_domain_shared *sd_share)
> > +{
> > + u64 tmp, nr_scan;
> > +
> > + if (!sched_feat(ILB_UTIL))
> > + return;
> > +
> > + if (!sd_share)
> > + return;
> > +
> > + if (env->idle == CPU_NEWLY_IDLE)
> > + return;
>
>
> Suggestion for small improvement:
>
> The first if condition here could be the check for newidle. As that case occurs very often, we could save a few cycles of checking
> the sched feature.
>
Yes, this makes sense, I'll change it.
thanks,
Chenyu
© 2016 - 2025 Red Hat, Inc.