Instead of the magical 1.25 headroom, use the new approximate_util_avg()
to provide headroom based on the dvfs_update_delay; which is the period
at which the cpufreq governor will send DVFS updates to the hardware.
Add a new percpu dvfs_update_delay that can be cheaply accessed whenever
apply_dvfs_headroom() is called. We expect cpufreq governors that rely
on util to drive its DVFS logic/algorithm to populate these percpu
variables. schedutil is the only such governor at the moment.
Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
---
kernel/sched/core.c | 3 ++-
kernel/sched/cpufreq_schedutil.c | 10 +++++++++-
kernel/sched/sched.h | 25 ++++++++++++++-----------
3 files changed, 25 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 602e369753a3..f56eb44745a8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -116,6 +116,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU_SHARED_ALIGNED(u64, dvfs_update_delay);
#ifdef CONFIG_SCHED_DEBUG
/*
@@ -7439,7 +7440,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
* frequency will be gracefully reduced with the utilization decay.
*/
if (type == FREQUENCY_UTIL) {
- util = apply_dvfs_headroom(util_cfs) + cpu_util_rt(rq);
+ util = apply_dvfs_headroom(util_cfs, cpu) + cpu_util_rt(rq);
util = uclamp_rq_util_with(rq, util, p);
} else {
util = util_cfs + cpu_util_rt(rq);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 0c7565ac31fb..04aa06846f31 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -519,15 +519,21 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
struct sugov_policy *sg_policy;
unsigned int rate_limit_us;
+ int cpu;
if (kstrtouint(buf, 10, &rate_limit_us))
return -EINVAL;
tunables->rate_limit_us = rate_limit_us;
- list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+
sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ for_each_cpu(cpu, sg_policy->policy->cpus)
+ per_cpu(dvfs_update_delay, cpu) = rate_limit_us;
+ }
+
return count;
}
@@ -772,6 +778,8 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
+
+ per_cpu(dvfs_update_delay, cpu) = sg_policy->tunables->rate_limit_us;
}
if (policy_is_shared(policy))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2b889ad399de..e06e512af192 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3001,6 +3001,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long approximate_util_avg(unsigned long util, u64 delta);
u64 approximate_runtime(unsigned long util);
+/*
+ * Any governor that relies on util signal to drive DVFS, must populate these
+ * percpu dvfs_update_delay variables.
+ *
+ * It should describe the rate/delay at which the governor sends DVFS freq
+ * update to the hardware in us.
+ */
+DECLARE_PER_CPU_SHARED_ALIGNED(u64, dvfs_update_delay);
+
/*
* DVFS decision are made at discrete points. If CPU stays busy, the util will
* continue to grow, which means it could need to run at a higher frequency
@@ -3010,20 +3019,14 @@ u64 approximate_runtime(unsigned long util);
* to run at adequate performance point.
*
* This function provides enough headroom to provide adequate performance
- * assuming the CPU continues to be busy.
- *
- * At the moment it is a constant multiplication with 1.25.
+ * assuming the CPU continues to be busy. This headroom is based on the
+ * dvfs_update_delay of the cpufreq governor.
*
- * TODO: The headroom should be a function of the delay. 25% is too high
- * especially on powerful systems. For example, if the delay is 500us, it makes
- * more sense to give a small headroom as the next decision point is not far
- * away and will follow the util if it continues to rise. On the other hand if
- * the delay is 10ms, then we need a bigger headroom so the CPU won't struggle
- * at a lower frequency if it never goes to idle until then.
+ * XXX: Should we provide headroom when the util is decaying?
*/
-static inline unsigned long apply_dvfs_headroom(unsigned long util)
+static inline unsigned long apply_dvfs_headroom(unsigned long util, int cpu)
{
- return util + (util >> 2);
+ return approximate_util_avg(util, per_cpu(dvfs_update_delay, cpu));
}
/*
--
2.34.1
On Mon, Aug 28, 2023 at 12:32:00AM +0100, Qais Yousef wrote: > Instead of the magical 1.25 headroom, use the new approximate_util_avg() > to provide headroom based on the dvfs_update_delay; which is the period > at which the cpufreq governor will send DVFS updates to the hardware. > > Add a new percpu dvfs_update_delay that can be cheaply accessed whenever > apply_dvfs_headroom() is called. We expect cpufreq governors that rely > on util to drive its DVFS logic/algorithm to populate these percpu > variables. schedutil is the only such governor at the moment. > > Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io> > --- > kernel/sched/core.c | 3 ++- > kernel/sched/cpufreq_schedutil.c | 10 +++++++++- > kernel/sched/sched.h | 25 ++++++++++++++----------- > 3 files changed, 25 insertions(+), 13 deletions(-) > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 602e369753a3..f56eb44745a8 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -116,6 +116,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); > EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); > > DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); > +DEFINE_PER_CPU_SHARED_ALIGNED(u64, dvfs_update_delay); This makes no sense, why are you using SHARED_ALIGNED and thus wasting an entire cacheline for the one variable?
On 09/07/23 13:34, Peter Zijlstra wrote: > On Mon, Aug 28, 2023 at 12:32:00AM +0100, Qais Yousef wrote: > > Instead of the magical 1.25 headroom, use the new approximate_util_avg() > > to provide headroom based on the dvfs_update_delay; which is the period > > at which the cpufreq governor will send DVFS updates to the hardware. > > > > Add a new percpu dvfs_update_delay that can be cheaply accessed whenever > > apply_dvfs_headroom() is called. We expect cpufreq governors that rely > > on util to drive its DVFS logic/algorithm to populate these percpu > > variables. schedutil is the only such governor at the moment. > > > > Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io> > > --- > > kernel/sched/core.c | 3 ++- > > kernel/sched/cpufreq_schedutil.c | 10 +++++++++- > > kernel/sched/sched.h | 25 ++++++++++++++----------- > > 3 files changed, 25 insertions(+), 13 deletions(-) > > > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > > index 602e369753a3..f56eb44745a8 100644 > > --- a/kernel/sched/core.c > > +++ b/kernel/sched/core.c > > @@ -116,6 +116,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); > > EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); > > > > DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); > > +DEFINE_PER_CPU_SHARED_ALIGNED(u64, dvfs_update_delay); > > This makes no sense, why are you using SHARED_ALIGNED and thus wasting > an entire cacheline for the one variable? Err, brain fart. Sorry. Thanks -- Qais Yousef
© 2016 - 2025 Red Hat, Inc.