sched: cpufreq: Remove magic margins

[RFC PATCH 4/7] sched: cpufreq: Remove magic 1.25 headroom from apply_dvfs_headroom()

Posted by Qais Yousef 2 years, 5 months ago

Instead of the magical 1.25 headroom, use the new approximate_util_avg()
to provide headroom based on the dvfs_update_delay; which is the period
at which the cpufreq governor will send DVFS updates to the hardware.

Add a new percpu dvfs_update_delay that can be cheaply accessed whenever
apply_dvfs_headroom() is called. We expect cpufreq governors that rely
on util to drive its DVFS logic/algorithm to populate these percpu
variables. schedutil is the only such governor at the moment.

Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
---
 kernel/sched/core.c              |  3 ++-
 kernel/sched/cpufreq_schedutil.c | 10 +++++++++-
 kernel/sched/sched.h             | 25 ++++++++++++++-----------
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 602e369753a3..f56eb44745a8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -116,6 +116,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU_SHARED_ALIGNED(u64, dvfs_update_delay);
 
 #ifdef CONFIG_SCHED_DEBUG
 /*
@@ -7439,7 +7440,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 	 * frequency will be gracefully reduced with the utilization decay.
 	 */
 	if (type == FREQUENCY_UTIL) {
-		util = apply_dvfs_headroom(util_cfs) + cpu_util_rt(rq);
+		util = apply_dvfs_headroom(util_cfs, cpu) + cpu_util_rt(rq);
 		util = uclamp_rq_util_with(rq, util, p);
 	} else {
 		util = util_cfs + cpu_util_rt(rq);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 0c7565ac31fb..04aa06846f31 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -519,15 +519,21 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count
 	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 	struct sugov_policy *sg_policy;
 	unsigned int rate_limit_us;
+	int cpu;
 
 	if (kstrtouint(buf, 10, &rate_limit_us))
 		return -EINVAL;
 
 	tunables->rate_limit_us = rate_limit_us;
 
-	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
+	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+
 		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
 
+		for_each_cpu(cpu, sg_policy->policy->cpus)
+			per_cpu(dvfs_update_delay, cpu) = rate_limit_us;
+	}
+
 	return count;
 }
 
@@ -772,6 +778,8 @@ static int sugov_start(struct cpufreq_policy *policy)
 		memset(sg_cpu, 0, sizeof(*sg_cpu));
 		sg_cpu->cpu			= cpu;
 		sg_cpu->sg_policy		= sg_policy;
+
+		per_cpu(dvfs_update_delay, cpu) = sg_policy->tunables->rate_limit_us;
 	}
 
 	if (policy_is_shared(policy))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2b889ad399de..e06e512af192 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3001,6 +3001,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 unsigned long approximate_util_avg(unsigned long util, u64 delta);
 u64 approximate_runtime(unsigned long util);
 
+/*
+ * Any governor that relies on util signal to drive DVFS, must populate these
+ * percpu dvfs_update_delay variables.
+ *
+ * It should describe the rate/delay at which the governor sends DVFS freq
+ * update to the hardware in us.
+ */
+DECLARE_PER_CPU_SHARED_ALIGNED(u64, dvfs_update_delay);
+
 /*
  * DVFS decision are made at discrete points. If CPU stays busy, the util will
  * continue to grow, which means it could need to run at a higher frequency
@@ -3010,20 +3019,14 @@ u64 approximate_runtime(unsigned long util);
  * to run at adequate performance point.
  *
  * This function provides enough headroom to provide adequate performance
- * assuming the CPU continues to be busy.
- *
- * At the moment it is a constant multiplication with 1.25.
+ * assuming the CPU continues to be busy. This headroom is based on the
+ * dvfs_update_delay of the cpufreq governor.
  *
- * TODO: The headroom should be a function of the delay. 25% is too high
- * especially on powerful systems. For example, if the delay is 500us, it makes
- * more sense to give a small headroom as the next decision point is not far
- * away and will follow the util if it continues to rise. On the other hand if
- * the delay is 10ms, then we need a bigger headroom so the CPU won't struggle
- * at a lower frequency if it never goes to idle until then.
+ * XXX: Should we provide headroom when the util is decaying?
  */
-static inline unsigned long apply_dvfs_headroom(unsigned long util)
+static inline unsigned long apply_dvfs_headroom(unsigned long util, int cpu)
 {
-	return util + (util >> 2);
+	return approximate_util_avg(util, per_cpu(dvfs_update_delay, cpu));
 }
 
 /*
-- 
2.34.1

Re: [RFC PATCH 4/7] sched: cpufreq: Remove magic 1.25 headroom from apply_dvfs_headroom()

Posted by Peter Zijlstra 2 years, 5 months ago

On Mon, Aug 28, 2023 at 12:32:00AM +0100, Qais Yousef wrote:
> Instead of the magical 1.25 headroom, use the new approximate_util_avg()
> to provide headroom based on the dvfs_update_delay; which is the period
> at which the cpufreq governor will send DVFS updates to the hardware.
> 
> Add a new percpu dvfs_update_delay that can be cheaply accessed whenever
> apply_dvfs_headroom() is called. We expect cpufreq governors that rely
> on util to drive its DVFS logic/algorithm to populate these percpu
> variables. schedutil is the only such governor at the moment.
> 
> Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
> ---
>  kernel/sched/core.c              |  3 ++-
>  kernel/sched/cpufreq_schedutil.c | 10 +++++++++-
>  kernel/sched/sched.h             | 25 ++++++++++++++-----------
>  3 files changed, 25 insertions(+), 13 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 602e369753a3..f56eb44745a8 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -116,6 +116,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
>  
>  DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> +DEFINE_PER_CPU_SHARED_ALIGNED(u64, dvfs_update_delay);

This makes no sense, why are you using SHARED_ALIGNED and thus wasting
an entire cacheline for the one variable?

Re: [RFC PATCH 4/7] sched: cpufreq: Remove magic 1.25 headroom from apply_dvfs_headroom()

Posted by Qais Yousef 2 years, 5 months ago

On 09/07/23 13:34, Peter Zijlstra wrote:
> On Mon, Aug 28, 2023 at 12:32:00AM +0100, Qais Yousef wrote:
> > Instead of the magical 1.25 headroom, use the new approximate_util_avg()
> > to provide headroom based on the dvfs_update_delay; which is the period
> > at which the cpufreq governor will send DVFS updates to the hardware.
> > 
> > Add a new percpu dvfs_update_delay that can be cheaply accessed whenever
> > apply_dvfs_headroom() is called. We expect cpufreq governors that rely
> > on util to drive its DVFS logic/algorithm to populate these percpu
> > variables. schedutil is the only such governor at the moment.
> > 
> > Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
> > ---
> >  kernel/sched/core.c              |  3 ++-
> >  kernel/sched/cpufreq_schedutil.c | 10 +++++++++-
> >  kernel/sched/sched.h             | 25 ++++++++++++++-----------
> >  3 files changed, 25 insertions(+), 13 deletions(-)
> > 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 602e369753a3..f56eb44745a8 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -116,6 +116,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
> >  EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
> >  
> >  DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> > +DEFINE_PER_CPU_SHARED_ALIGNED(u64, dvfs_update_delay);
> 
> This makes no sense, why are you using SHARED_ALIGNED and thus wasting
> an entire cacheline for the one variable?

Err, brain fart. Sorry.


Thanks

--
Qais Yousef

[RFC PATCH 1/7] sched/pelt: Add a new function to approximate the future util_avg value
[RFC PATCH 2/7] sched/pelt: Add a new function to approximate runtime to reach given util
[RFC PATCH 3/7] sched/fair: Remove magic margin in fits_capacity()
[RFC PATCH 4/7] sched: cpufreq: Remove magic 1.25 headroom from apply_dvfs_headroom()
[RFC PATCH 5/7] sched/schedutil: Add a new tunable to dictate response time
[RFC PATCH 6/7] sched/pelt: Introduce PELT multiplier
[RFC PATCH 7/7] cpufreq: Change default transition delay to 2ms