[RFC PATCH 1/6] sched/fair: Add util_guest for tasks

Posted by David Dai 2 years, 10 months ago
For virtualization use cases, the util_est and util_avg signals currently tracked
on the host aren't sufficient to accurately represent the workload on
vCPU threads, which results in poor frequency selection and performance.
For example, when a large workload migrates from a busy vCPU thread to
an idle vCPU thread, it incurs additional DVFS ramp-up latencies
as util accumulates.

Introduce a new "util_guest" member as an additional PELT signal that's
independently updated by the guest. When used, it's max aggregated to
provide a boost to both task_util and task_util_est.

Updating task_util and task_util_est will ensure:
- Better task placement decisions for vCPU threads on the host
- Correct updates of util_est.ewma during dequeue
- Additive util with other threads on the same runqueue for more
  accurate frequency responses

Co-developed-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: David Dai <davidai@google.com>
---
 include/linux/sched.h | 11 +++++++++++
 kernel/sched/core.c   | 18 +++++++++++++++++-
 kernel/sched/fair.c   | 15 +++++++++++++--
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..d8c346fcdf52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -445,6 +445,16 @@ struct util_est {
 #define UTIL_AVG_UNCHANGED		0x80000000
 } __attribute__((__aligned__(sizeof(u64))));
 
+/*
+ * For sched_setattr_nocheck() (kernel) only
+ *
+ * Allow vCPU threads to use UTIL_GUEST as a way to hint the scheduler with more
+ * accurate utilization info. This is useful when guest kernels have some way of
+ * tracking their own runqueue utilization.
+ *
+ */
+#define SCHED_FLAG_UTIL_GUEST   0x20000000
+
 /*
  * The load/runnable/util_avg accumulates an infinite geometric series
  * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -499,6 +509,7 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
 	unsigned long			util_avg;
+	unsigned long			util_guest;
 	struct util_est			util_est;
 } ____cacheline_aligned;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..7700ef5610c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2024,6 +2024,16 @@ static inline void uclamp_post_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
 
+static void __setscheduler_task_util(struct task_struct *p,
+				  const struct sched_attr *attr)
+{
+
+	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_GUEST)))
+		return;
+
+	p->se.avg.util_guest = attr->sched_util_min;
+}
+
 bool sched_task_on_rq(struct task_struct *p)
 {
 	return task_on_rq_queued(p);
@@ -7561,7 +7571,7 @@ static int __sched_setscheduler(struct task_struct *p,
 			return -EINVAL;
 	}
 
-	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV | SCHED_FLAG_UTIL_GUEST))
 		return -EINVAL;
 
 	/*
@@ -7583,6 +7593,9 @@ static int __sched_setscheduler(struct task_struct *p,
 		if (attr->sched_flags & SCHED_FLAG_SUGOV)
 			return -EINVAL;
 
+		if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+			return -EINVAL;
+
 		retval = security_task_setscheduler(p);
 		if (retval)
 			return retval;
@@ -7629,6 +7642,8 @@ static int __sched_setscheduler(struct task_struct *p,
 			goto change;
 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
 			goto change;
+		if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
 		retval = 0;
@@ -7718,6 +7733,7 @@ static int __sched_setscheduler(struct task_struct *p,
 		__setscheduler_prio(p, newprio);
 	}
 	__setscheduler_uclamp(p, attr);
+	__setscheduler_task_util(p, attr);
 
 	if (queued) {
 		/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..998649554344 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
 
 static inline unsigned long task_util(struct task_struct *p)
 {
-	return READ_ONCE(p->se.avg.util_avg);
+	return max(READ_ONCE(p->se.avg.util_avg),
+			READ_ONCE(p->se.avg.util_guest));
 }
 
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 
-	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+	return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest),
+			max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)));
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	util_est_enqueue(&rq->cfs, p);
 
+	/*
+	 * The normal code path for host thread enqueue doesn't take into
+	 * account guest task migrations when updating cpufreq util.
+	 * So, always update the cpufreq when a vCPU thread has a
+	 * non-zero util_guest value.
+	 */
+	if (READ_ONCE(p->se.avg.util_guest))
+		cpufreq_update_util(rq, 0);
+
 	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
-- 
2.40.0.348.gf938b09366-goog
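
Not part of the patch itself: a minimal sketch of how a host-side caller (e.g. the
vCPU driver later in this series) might push a guest-reported utilization value onto
a vCPU thread. The helper name, the policy handling, and the surrounding driver are
assumptions; only SCHED_FLAG_UTIL_GUEST, the reuse of sched_util_min as the carrier,
and sched_setattr_nocheck() come from the patch above. A production caller would
likely also take care to preserve the thread's existing policy and parameters.

```c
/*
 * Hypothetical helper, not from the patch: update util_guest for the
 * task backing a vCPU. __setscheduler_task_util() (added above) copies
 * attr->sched_util_min into p->se.avg.util_guest when the flag is set.
 */
#include <linux/sched.h>
#include <uapi/linux/sched/types.h>

static int vcpu_update_util_guest(struct task_struct *vcpu_task, u32 util)
{
	struct sched_attr attr = {
		.sched_policy	= SCHED_NORMAL,	/* vCPU threads are plain CFS tasks */
		.sched_flags	= SCHED_FLAG_UTIL_GUEST,
		.sched_util_min	= util,		/* carrier for util_guest */
	};

	/* kernel-internal path; user callers of this flag are rejected */
	return sched_setattr_nocheck(vcpu_task, &attr);
}
```

The added "goto change" for SCHED_FLAG_UTIL_GUEST in __sched_setscheduler() is what
lets repeated calls like this (same policy, same priority) still reach
__setscheduler_task_util() and trigger the enqueue-side cpufreq update.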
Re: [RFC PATCH 1/6] sched/fair: Add util_guest for tasks
Posted by Peter Zijlstra 2 years, 10 months ago
On Thu, Mar 30, 2023 at 03:43:36PM -0700, David Dai wrote:
> @@ -499,6 +509,7 @@ struct sched_avg {
>  	unsigned long			load_avg;
>  	unsigned long			runnable_avg;
>  	unsigned long			util_avg;
> +	unsigned long			util_guest;
>  	struct util_est			util_est;
>  } ____cacheline_aligned;
>  

Yeah, no... you'll have to make room first.

struct sched_avg {
	/* typedef u64 -> __u64 */ long long unsigned int     last_update_time;          /*     0     8 */
	/* typedef u64 -> __u64 */ long long unsigned int     load_sum;                  /*     8     8 */
	/* typedef u64 -> __u64 */ long long unsigned int     runnable_sum;              /*    16     8 */
	/* typedef u32 -> __u32 */ unsigned int               util_sum;                  /*    24     4 */
	/* typedef u32 -> __u32 */ unsigned int               period_contrib;            /*    28     4 */
	long unsigned int          load_avg;                                             /*    32     8 */
	long unsigned int          runnable_avg;                                         /*    40     8 */
	long unsigned int          util_avg;                                             /*    48     8 */
	struct util_est {
		unsigned int       enqueued;                                             /*    56     4 */
		unsigned int       ewma;                                                 /*    60     4 */
	} __attribute__((__aligned__(8)))util_est __attribute__((__aligned__(8))); /*    56     8 */

	/* size: 64, cachelines: 1, members: 9 */
	/* forced alignments: 1 */
} __attribute__((__aligned__(64)));
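
The layout above shows sched_avg already filling the 64-byte cache line exactly, so an
extra unsigned long would spill it onto a second line. As a hedged aside (not something
proposed in this thread), the constraint could be made explicit at build time along
these lines:

```c
/*
 * Hypothetical build-time check: with ____cacheline_aligned, any member
 * that pushes struct sched_avg past SMP_CACHE_BYTES doubles its footprint
 * to two cache lines.
 */
#include <linux/build_bug.h>
#include <linux/cache.h>
#include <linux/sched.h>

static_assert(sizeof(struct sched_avg) <= SMP_CACHE_BYTES,
	      "struct sched_avg no longer fits in a single cache line");
```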
Re: [RFC PATCH 1/6] sched/fair: Add util_guest for tasks
Posted by David Dai 2 years, 10 months ago
On Wed, Apr 5, 2023 at 1:14 AM Peter Zijlstra <peterz@infradead.org> wrote:
>

Hi Peter,

Appreciate your time,

> On Thu, Mar 30, 2023 at 03:43:36PM -0700, David Dai wrote:
> > @@ -499,6 +509,7 @@ struct sched_avg {
> >       unsigned long                   load_avg;
> >       unsigned long                   runnable_avg;
> >       unsigned long                   util_avg;
> > +     unsigned long                   util_guest;
> >       struct util_est                 util_est;
> >  } ____cacheline_aligned;
> >
>
> Yeah, no... you'll have to make room first.
>

I’m not sure what you mean. Do you mean making room by reducing
another member in the same struct? If so, which member would be a good
fit to shorten? Or do you mean something else entirely?

Thanks,
David

> struct sched_avg {
>         /* typedef u64 -> __u64 */ long long unsigned int     last_update_time;          /*     0     8 */
>         /* typedef u64 -> __u64 */ long long unsigned int     load_sum;                  /*     8     8 */
>         /* typedef u64 -> __u64 */ long long unsigned int     runnable_sum;              /*    16     8 */
>         /* typedef u32 -> __u32 */ unsigned int               util_sum;                  /*    24     4 */
>         /* typedef u32 -> __u32 */ unsigned int               period_contrib;            /*    28     4 */
>         long unsigned int          load_avg;                                             /*    32     8 */
>         long unsigned int          runnable_avg;                                         /*    40     8 */
>         long unsigned int          util_avg;                                             /*    48     8 */
>         struct util_est {
>                 unsigned int       enqueued;                                             /*    56     4 */
>                 unsigned int       ewma;                                                 /*    60     4 */
>         } __attribute__((__aligned__(8)))util_est __attribute__((__aligned__(8))); /*    56     8 */
>
>         /* size: 64, cachelines: 1, members: 9 */
>         /* forced alignments: 1 */
> } __attribute__((__aligned__(64)));
>
>
Re: [RFC PATCH 1/6] sched/fair: Add util_guest for tasks
Posted by Peter Zijlstra 2 years, 10 months ago
On Wed, Apr 05, 2023 at 03:54:08PM -0700, David Dai wrote:
> On Wed, Apr 5, 2023 at 1:14 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> 
> Hi Peter,
> 
> Appreciate your time,
> 
> > On Thu, Mar 30, 2023 at 03:43:36PM -0700, David Dai wrote:
> > > @@ -499,6 +509,7 @@ struct sched_avg {
> > >       unsigned long                   load_avg;
> > >       unsigned long                   runnable_avg;
> > >       unsigned long                   util_avg;
> > > +     unsigned long                   util_guest;
> > >       struct util_est                 util_est;
> > >  } ____cacheline_aligned;
> > >
> >
> > Yeah, no... you'll have to make room first.
> >
> 
> I’m not sure what you mean. Do you mean making room by reducing
> another member in the same struct? If so, which member would be a good
> fit to shorten? Or do you mean something else entirely?

Yeah, as you can see below, this structure is completely filling up the
cacheline already so there's no room for another member. I've not looked
at this in detail in a little while so I'm not at all sure what would be
the easiest way to free up space.

Going by the rest of the discussion it seems this is the least of your
problems though.

> > struct sched_avg {
> >         /* typedef u64 -> __u64 */ long long unsigned int     last_update_time;          /*     0     8 */
> >         /* typedef u64 -> __u64 */ long long unsigned int     load_sum;                  /*     8     8 */
> >         /* typedef u64 -> __u64 */ long long unsigned int     runnable_sum;              /*    16     8 */
> >         /* typedef u32 -> __u32 */ unsigned int               util_sum;                  /*    24     4 */
> >         /* typedef u32 -> __u32 */ unsigned int               period_contrib;            /*    28     4 */
> >         long unsigned int          load_avg;                                             /*    32     8 */
> >         long unsigned int          runnable_avg;                                         /*    40     8 */
> >         long unsigned int          util_avg;                                             /*    48     8 */
> >         struct util_est {
> >                 unsigned int       enqueued;                                             /*    56     4 */
> >                 unsigned int       ewma;                                                 /*    60     4 */
> >         } __attribute__((__aligned__(8)))util_est __attribute__((__aligned__(8))); /*    56     8 */
> >
> >         /* size: 64, cachelines: 1, members: 9 */
> >         /* forced alignments: 1 */
> > } __attribute__((__aligned__(64)));
> >
> >
Re: [RFC PATCH 1/6] sched/fair: Add util_guest for tasks
Posted by Dietmar Eggemann 2 years, 10 months ago
Hi David,

On 31/03/2023 00:43, David Dai wrote:
> For virtualization usecases, util_est and util_avg currently tracked
> on the host aren't sufficient to accurately represent the workload on
> vCPU threads, which results in poor frequency selection and performance.
> For example, when a large workload migrates from a busy vCPU thread to
> an idle vCPU thread, it incurs additional DVFS ramp-up latencies
> as util accumulates.
> 
> Introduce a new "util_guest" member as an additional PELT signal that's
> independently updated by the guest. When used, it's max aggregated to
> provide a boost to both task_util and task_util_est.
> 
> Updating task_util and task_util_est will ensure:
> -Better task placement decisions for vCPU threads on the host
> -Correctly updating util_est.ewma during dequeue
> -Additive util with other threads on the same runqueue for more
> accurate frequency responses
> 
> Co-developed-by: Saravana Kannan <saravanak@google.com>
> Signed-off-by: Saravana Kannan <saravanak@google.com>
> Signed-off-by: David Dai <davidai@google.com>
> ---

[...]

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6986ea31c984..998649554344 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
>  
>  static inline unsigned long task_util(struct task_struct *p)
>  {
> -	return READ_ONCE(p->se.avg.util_avg);
> +	return max(READ_ONCE(p->se.avg.util_avg),
> +			READ_ONCE(p->se.avg.util_guest));
>  }
>  
>  static inline unsigned long _task_util_est(struct task_struct *p)
>  {
>  	struct util_est ue = READ_ONCE(p->se.avg.util_est);
>  
> -	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
> +	return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest),
> +			max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)));
>  }

I can't see why the existing p->uclamp_req[UCLAMP_MIN].value can't be
used here instead of p->se.avg.util_guest.

I do understand the issue of inheriting uclamp values at fork but don't
get the not being `additive` thing. We are at task level here.

The fact that you have to max util_avg and util_est directly in
task_util() and _task_util_est() tells me that there are places where
this helps and uclamp_task_util() is not called there.

When you say in the cover letter that you tried uclamp_min, how exactly
did you use it? Did you run the existing mainline or did you use
uclamp_min as a replacement for util_guest in this patch here?

>  static inline unsigned long task_util_est(struct task_struct *p)
> @@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	 */
>  	util_est_enqueue(&rq->cfs, p);
>  
> +	/*
> +	 * The normal code path for host thread enqueue doesn't take into
> +	 * account guest task migrations when updating cpufreq util.
> +	 * So, always update the cpufreq when a vCPU thread has a
> +	 * non-zero util_guest value.
> +	 */
> +	if (READ_ONCE(p->se.avg.util_guest))
> +		cpufreq_update_util(rq, 0);


This is because enqueue_entity() -> update_load_avg() ->
attach_entity_load_avg() -> cfs_rq_util_change() requires root run-queue
(&rq->cfs == cfs_rq) to call cpufreq_update_util()?

[...]
Re: [RFC PATCH 1/6] sched/fair: Add util_guest for tasks
Posted by David Dai 2 years, 10 months ago
On Mon, Apr 3, 2023 at 4:40 AM Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
>
> Hi David,
Hi Dietmar, thanks for your comments.
>
> On 31/03/2023 00:43, David Dai wrote:
> > For virtualization usecases, util_est and util_avg currently tracked
> > on the host aren't sufficient to accurately represent the workload on
> > vCPU threads, which results in poor frequency selection and performance.
> > For example, when a large workload migrates from a busy vCPU thread to
> > an idle vCPU thread, it incurs additional DVFS ramp-up latencies
> > as util accumulates.
> >
> > Introduce a new "util_guest" member as an additional PELT signal that's
> > independently updated by the guest. When used, it's max aggregated to
> > provide a boost to both task_util and task_util_est.
> >
> > Updating task_util and task_util_est will ensure:
> > -Better task placement decisions for vCPU threads on the host
> > -Correctly updating util_est.ewma during dequeue
> > -Additive util with other threads on the same runqueue for more
> > accurate frequency responses
> >
> > Co-developed-by: Saravana Kannan <saravanak@google.com>
> > Signed-off-by: Saravana Kannan <saravanak@google.com>
> > Signed-off-by: David Dai <davidai@google.com>
> > ---
>
> [...]
>
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 6986ea31c984..998649554344 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
> >
> >  static inline unsigned long task_util(struct task_struct *p)
> >  {
> > -     return READ_ONCE(p->se.avg.util_avg);
> > +     return max(READ_ONCE(p->se.avg.util_avg),
> > +                     READ_ONCE(p->se.avg.util_guest));
> >  }
> >
> >  static inline unsigned long _task_util_est(struct task_struct *p)
> >  {
> >       struct util_est ue = READ_ONCE(p->se.avg.util_est);
> >
> > -     return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
> > +     return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest),
> > +                     max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)));
> >  }
>
> I can't see why the existing p->uclamp_req[UCLAMP_MIN].value can't be
> used here instead p->se.avg.util_guest.
Using p->uclamp_req[UCLAMP_MIN].value would result in folding
uclamp values into task_util and task_util_est for all tasks that have
uclamp values set. The intent of these patches isn’t to modify
existing uclamp behaviour. Users would also override util values from
the guest when they set uclamp values.
>
> I do understand the issue of inheriting uclamp values at fork but don't
> get the not being `additive` thing. We are at task level here.
Uclamp values are max aggregated with other tasks at the runqueue
level when deciding CPU frequency. For example, a vCPU runqueue may
have a util of 512 that results in setting uclamp_min to 512 on the
vCPU task. This is insufficient to drive a frequency response if it
shares the runqueue with another host task running with a util of 512,
as it would result in a clamped util value of 512 at the runqueue
(e.g. if a guest thread had just migrated onto this vCPU).
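As a toy, userspace-only sketch of that arithmetic (made-up numbers and a
simplified clamp; none of this is kernel code):

```c
#include <stdio.h>

static unsigned long clamp_util(unsigned long util, unsigned long lo,
				unsigned long hi)
{
	return util < lo ? lo : (util > hi ? hi : util);
}

int main(void)
{
	unsigned long host_task_util = 512;	/* unrelated host thread */
	unsigned long vcpu_util_avg  = 0;	/* guest load just migrated in */
	unsigned long guest_hint     = 512;	/* uclamp_min or util_guest */

	/* uclamp_min only clamps the summed runqueue util: still 512 */
	printf("uclamp_min : %lu\n",
	       clamp_util(host_task_util + vcpu_util_avg, guest_hint, 1024));

	/* util_guest boosts the vCPU task's own contribution: 1024 */
	unsigned long vcpu_util = vcpu_util_avg > guest_hint ?
				  vcpu_util_avg : guest_hint;
	printf("util_guest : %lu\n", host_task_util + vcpu_util);

	return 0;
}
```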
>
> The fact that you have to max util_avg and util_est directly in
> task_util() and _task_util_est() tells me that there are places where
> this helps and uclamp_task_util() is not called there.
Can you clarify this point a bit more?
>
> When you say in the cover letter that you tried uclamp_min, how exactly
> did you use it? Did you run the existing mainline or did you use
> uclamp_min as a replacement for util_guest in this patch here?
I called sched_setattr_nocheck() with .sched_flags =
SCHED_FLAG_UTIL_CLAMP when updating uclamp_min, and uclamp_max was left
at 1024. Uclamp_min was not aggregated with task_util and
task_util_est during my testing. The only caveat there is that I added
a change to reset only uclamp on fork while testing (I realize there is
specifically a SCHED_FLAG_RESET_ON_FORK, but I didn’t want to reset
other sched attributes).
>
> >  static inline unsigned long task_util_est(struct task_struct *p)
> > @@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >        */
> >       util_est_enqueue(&rq->cfs, p);
> >
> > +     /*
> > +      * The normal code path for host thread enqueue doesn't take into
> > +      * account guest task migrations when updating cpufreq util.
> > +      * So, always update the cpufreq when a vCPU thread has a
> > +      * non-zero util_guest value.
> > +      */
> > +     if (READ_ONCE(p->se.avg.util_guest))
> > +             cpufreq_update_util(rq, 0);
>
>
> This is because enqueue_entity() -> update_load_avg() ->
> attach_entity_load_avg() -> cfs_rq_util_change() requires root run-queue
> (&rq->cfs == cfs_rq) to call cpufreq_update_util()?
enqueue_entity() would not call into update_load_avg() due to the
check for !se->avg.last_update_time. se->avg.last_update_time is
non-zero because the vCPU task did not migrate before this enqueue.
This enqueue path is reached when util_guest is updated for the vCPU
task through the sched_setattr_nocheck call where we want to ensure a
frequency update occurs.
>
> [...]
Re: [RFC PATCH 1/6] sched/fair: Add util_guest for tasks
Posted by Dietmar Eggemann 2 years, 10 months ago
On 04/04/2023 03:11, David Dai wrote:
> On Mon, Apr 3, 2023 at 4:40 AM Dietmar Eggemann
> <dietmar.eggemann@arm.com> wrote:
>>
>> Hi David,
> Hi Dietmar, thanks for your comments.
>>
>> On 31/03/2023 00:43, David Dai wrote:

[...]

>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 6986ea31c984..998649554344 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
>>>
>>>  static inline unsigned long task_util(struct task_struct *p)
>>>  {
>>> -     return READ_ONCE(p->se.avg.util_avg);
>>> +     return max(READ_ONCE(p->se.avg.util_avg),
>>> +                     READ_ONCE(p->se.avg.util_guest));
>>>  }
>>>
>>>  static inline unsigned long _task_util_est(struct task_struct *p)
>>>  {
>>>       struct util_est ue = READ_ONCE(p->se.avg.util_est);
>>>
>>> -     return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
>>> +     return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest),
>>> +                     max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)));
>>>  }
>>
>> I can't see why the existing p->uclamp_req[UCLAMP_MIN].value can't be
>> used here instead p->se.avg.util_guest.
> Using p->uclamp_req[UCLAMP_MIN].value would result in folding in
> uclamp values into task_util and task_util_est for all tasks that have
> uclamp values set. The intent of these patches isn’t to modify
> existing uclamp behaviour. Users would also override util values from
> the guest when they set uclamp values.
>>
>> I do understand the issue of inheriting uclamp values at fork but don't
>> get the not being `additive` thing. We are at task level here.

> Uclamp values are max aggregated with other tasks at the runqueue
> level when deciding CPU frequency. For example, a vCPU runqueue may
> have an util of 512 that results in setting 512 to uclamp_min on the
> vCPU task. This is insufficient to drive a frequency response if it
> shares the runqueue with another host task running with util of 512 as
> it would result in a clamped util value of 512 at the runqueue(Ex. If
> a guest thread had just migrated onto this vCPU).

OK, see your point now. You want an accurate per-task boost for this
vCPU task on the host run-queue.
And a scenario in which a vCPU can ask for 100% in these moments is not
sufficient I guess? In this case uclamp_min could work.

>> The fact that you have to max util_avg and util_est directly in
>> task_util() and _task_util_est() tells me that there are places where
>> this helps and uclamp_task_util() is not called there.
> Can you clarify on this point a bit more?

Sorry, I meant s/util_est/util_guest/.

The effect of the change in _task_util_est() you see via:

enqueue_task_fair()
  util_est_enqueue()
    cfs_rq->avg.util_est.enqueued += _task_util_est(p)

so that `sugov_get_util() -> cpu_util_cfs() ->
cfs_rq->avg.util_est.enqueued` can see the effect of util_guest?

Not sure about the change in task_util() yet.

>> When you say in the cover letter that you tried uclamp_min, how exactly
>> did you use it? Did you run the existing mainline or did you use
>> uclamp_min as a replacement for util_guest in this patch here?

> I called sched_setattr_nocheck() with .sched_flags =
> SCHED_FLAG_UTIL_CLAMP when updating uclamp_min and clamp_max is left
> at 1024. Uclamp_min was not aggregated with task_util and
> task_util_est during my testing. The only caveat there is that I added
> a change to only reset uclamp on fork when testing(I realize there is
> specifically a SCHED_FLAG_RESET_ON_FORK, but I didn’t want to reset
> other sched attributes).

OK, understood. It's essentially a util_est v2 for vCPU tasks on host.

>>>  static inline unsigned long task_util_est(struct task_struct *p)
>>> @@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>        */
>>>       util_est_enqueue(&rq->cfs, p);
>>>
>>> +     /*
>>> +      * The normal code path for host thread enqueue doesn't take into
>>> +      * account guest task migrations when updating cpufreq util.
>>> +      * So, always update the cpufreq when a vCPU thread has a
>>> +      * non-zero util_guest value.
>>> +      */
>>> +     if (READ_ONCE(p->se.avg.util_guest))
>>> +             cpufreq_update_util(rq, 0);
>>
>>
>> This is because enqueue_entity() -> update_load_avg() ->
>> attach_entity_load_avg() -> cfs_rq_util_change() requires root run-queue
>> (&rq->cfs == cfs_rq) to call cpufreq_update_util()?
> The enqueue_entity() would not call into update_load_avg() due to the
> check for !se->avg.last_update_time. se->avg.last_update_time is
> non-zero because the vCPU task did not migrate before this enqueue.
> This enqueue path is reached when util_guest is updated for the vCPU
> task through the sched_setattr_nocheck call where we want to ensure a
> frequency update occurs.

OK, vCPU tasks are pinned so always !WF_MIGRATED wakeup I guess?
Re: [RFC PATCH 1/6] sched/fair: Add util_guest for tasks
Posted by Saravana Kannan 2 years, 10 months ago
On Wed, Apr 5, 2023 at 3:50 AM Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
>
> On 04/04/2023 03:11, David Dai wrote:
> > On Mon, Apr 3, 2023 at 4:40 AM Dietmar Eggemann
> > <dietmar.eggemann@arm.com> wrote:
> >>
> >> Hi David,
> > Hi Dietmar, thanks for your comments.
> >>
> >> On 31/03/2023 00:43, David Dai wrote:
>
> [...]
>
> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>> index 6986ea31c984..998649554344 100644
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> @@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
> >>>
> >>>  static inline unsigned long task_util(struct task_struct *p)
> >>>  {
> >>> -     return READ_ONCE(p->se.avg.util_avg);
> >>> +     return max(READ_ONCE(p->se.avg.util_avg),
> >>> +                     READ_ONCE(p->se.avg.util_guest));
> >>>  }
> >>>
> >>>  static inline unsigned long _task_util_est(struct task_struct *p)
> >>>  {
> >>>       struct util_est ue = READ_ONCE(p->se.avg.util_est);
> >>>
> >>> -     return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
> >>> +     return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest),
> >>> +                     max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)));
> >>>  }
> >>
> >> I can't see why the existing p->uclamp_req[UCLAMP_MIN].value can't be
> >> used here instead p->se.avg.util_guest.
> > Using p->uclamp_req[UCLAMP_MIN].value would result in folding in
> > uclamp values into task_util and task_util_est for all tasks that have
> > uclamp values set. The intent of these patches isn’t to modify
> > existing uclamp behaviour. Users would also override util values from
> > the guest when they set uclamp values.
> >>
> >> I do understand the issue of inheriting uclamp values at fork but don't
> >> get the not being `additive` thing. We are at task level here.
>
> > Uclamp values are max aggregated with other tasks at the runqueue
> > level when deciding CPU frequency. For example, a vCPU runqueue may
> > have an util of 512 that results in setting 512 to uclamp_min on the
> > vCPU task. This is insufficient to drive a frequency response if it
> > shares the runqueue with another host task running with util of 512 as
> > it would result in a clamped util value of 512 at the runqueue(Ex. If
> > a guest thread had just migrated onto this vCPU).
>
> OK, see your point now. You want an accurate per-task boost for this
> vCPU task on the host run-queue.
> And a scenario in which a vCPU can ask for 100% in these moments is not
> sufficient I guess? In this case uclamp_min could work.

Right. A vCPU can have whatever utilization, and there can be random host
threads completely unrelated to the VM. We need to aggregate the util of
both when deciding CPU freq.

>
> >> The fact that you have to max util_avg and util_est directly in
> >> task_util() and _task_util_est() tells me that there are places where
> >> this helps and uclamp_task_util() is not called there.
> > Can you clarify on this point a bit more?
>
> Sorry, I meant s/util_est/util_guest/.
>
> The effect of the change in _task_util_est() you see via:
>
> enqueue_task_fair()
>   util_est_enqueue()
>     cfs_rq->avg.util_est.enqueued += _task_util_est(p)
>
> so that `sugov_get_util() -> cpu_util_cfs() ->
> cfs_rq->avg.util_est.enqueued` can see the effect of util_guest?
>
> Not sure about the change in task_util() yet.
>
> >> When you say in the cover letter that you tried uclamp_min, how exactly
> >> did you use it? Did you run the existing mainline or did you use
> >> uclamp_min as a replacement for util_guest in this patch here?
>
> > I called sched_setattr_nocheck() with .sched_flags =
> > SCHED_FLAG_UTIL_CLAMP when updating uclamp_min and clamp_max is left
> > at 1024. Uclamp_min was not aggregated with task_util and
> > task_util_est during my testing. The only caveat there is that I added
> > a change to only reset uclamp on fork when testing(I realize there is
> > specifically a SCHED_FLAG_RESET_ON_FORK, but I didn’t want to reset
> > other sched attributes).
>
> OK, understood. It's essentially a util_est v2 for vCPU tasks on host.

Yup. We initially looked into just overwriting util_est, but didn't
think that would land well with the community :) as it was a bit messier
because we needed to make sure the current util_est update paths don't
run for vCPU tasks on the host (because those values would be wrong).

> >>>  static inline unsigned long task_util_est(struct task_struct *p)
> >>> @@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >>>        */
> >>>       util_est_enqueue(&rq->cfs, p);
> >>>
> >>> +     /*
> >>> +      * The normal code path for host thread enqueue doesn't take into
> >>> +      * account guest task migrations when updating cpufreq util.
> >>> +      * So, always update the cpufreq when a vCPU thread has a
> >>> +      * non-zero util_guest value.
> >>> +      */
> >>> +     if (READ_ONCE(p->se.avg.util_guest))
> >>> +             cpufreq_update_util(rq, 0);
> >>
> >>
> >> This is because enqueue_entity() -> update_load_avg() ->
> >> attach_entity_load_avg() -> cfs_rq_util_change() requires root run-queue
> >> (&rq->cfs == cfs_rq) to call cpufreq_update_util()?
> > The enqueue_entity() would not call into update_load_avg() due to the
> > check for !se->avg.last_update_time. se->avg.last_update_time is
> > non-zero because the vCPU task did not migrate before this enqueue.
> > This enqueue path is reached when util_guest is updated for the vCPU
> > task through the sched_setattr_nocheck call where we want to ensure a
> > frequency update occurs.
>
> OK, vCPU tasks are pinned so always !WF_MIGRATED wakeup I guess?

Even if, say, little-vCPU threads are allowed to migrate within little
CPUs, this will still be an issue. While a vCPU thread is continuously
running on a single CPU, a guest thread can migrate into that vCPU and
cause a huge increase in util_guest. But that won't trigger a cpufreq
update on the host side because the host doesn't see a task migration.
That's what David is trying to address.

-Saravana
Re: [RFC PATCH 1/6] sched/fair: Add util_guest for tasks
Posted by David Dai 2 years, 10 months ago
On Wed, Apr 5, 2023 at 2:43 PM Saravana Kannan <saravanak@google.com> wrote:
>
> On Wed, Apr 5, 2023 at 3:50 AM Dietmar Eggemann
> <dietmar.eggemann@arm.com> wrote:
> >
> > On 04/04/2023 03:11, David Dai wrote:
> > > On Mon, Apr 3, 2023 at 4:40 AM Dietmar Eggemann
> > > <dietmar.eggemann@arm.com> wrote:
> > >>
> > >> Hi David,
> > > Hi Dietmar, thanks for your comments.
> > >>
> > >> On 31/03/2023 00:43, David Dai wrote:
> >
> > [...]
> >
> > >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > >>> index 6986ea31c984..998649554344 100644
> > >>> --- a/kernel/sched/fair.c
> > >>> +++ b/kernel/sched/fair.c
> > >>> @@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
> > >>>
> > >>>  static inline unsigned long task_util(struct task_struct *p)
> > >>>  {
> > >>> -     return READ_ONCE(p->se.avg.util_avg);
> > >>> +     return max(READ_ONCE(p->se.avg.util_avg),
> > >>> +                     READ_ONCE(p->se.avg.util_guest));
> > >>>  }
> > >>>
> > >>>  static inline unsigned long _task_util_est(struct task_struct *p)
> > >>>  {
> > >>>       struct util_est ue = READ_ONCE(p->se.avg.util_est);
> > >>>
> > >>> -     return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
> > >>> +     return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest),
> > >>> +                     max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)));
> > >>>  }
> > >>
> > >> I can't see why the existing p->uclamp_req[UCLAMP_MIN].value can't be
> > >> used here instead p->se.avg.util_guest.
> > > Using p->uclamp_req[UCLAMP_MIN].value would result in folding in
> > > uclamp values into task_util and task_util_est for all tasks that have
> > > uclamp values set. The intent of these patches isn’t to modify
> > > existing uclamp behaviour. Users would also override util values from
> > > the guest when they set uclamp values.
> > >>
> > >> I do understand the issue of inheriting uclamp values at fork but don't
> > >> get the not being `additive` thing. We are at task level here.
> >
> > > Uclamp values are max aggregated with other tasks at the runqueue
> > > level when deciding CPU frequency. For example, a vCPU runqueue may
> > > have an util of 512 that results in setting 512 to uclamp_min on the
> > > vCPU task. This is insufficient to drive a frequency response if it
> > > shares the runqueue with another host task running with util of 512 as
> > > it would result in a clamped util value of 512 at the runqueue(Ex. If
> > > a guest thread had just migrated onto this vCPU).
> >
> > OK, see your point now. You want an accurate per-task boost for this
> > vCPU task on the host run-queue.
> > And a scenario in which a vCPU can ask for 100% in these moments is not
> > sufficient I guess? In this case uclamp_min could work.
>
> Right. vCPU can have whatever utilization and there can be random host
> threads completely unrelated to the VM. And we need to aggregate both
> of their util when deciding CPU freq.
>
> >
> > >> The fact that you have to max util_avg and util_est directly in
> > >> task_util() and _task_util_est() tells me that there are places where
> > >> this helps and uclamp_task_util() is not called there.
> > > Can you clarify on this point a bit more?
> >
> > Sorry, I meant s/util_est/util_guest/.
> >
> > The effect of the change in _task_util_est() you see via:
> >
> > enqueue_task_fair()
> >   util_est_enqueue()
> >     cfs_rq->avg.util_est.enqueued += _task_util_est(p)
> >
> > so that `sugov_get_util() -> cpu_util_cfs() ->
> > cfs_rq->avg.util_est.enqueued` can see the effect of util_guest?

That sequence looks correct to me.

> >
> > Not sure about the change in task_util() yet.

task_util() provides some signaling in addition to task_util_est() via:

find_energy_efficient_cpu()
  cpu_util_next()
    lsub_positive(&util, task_util(p));
    ...
    util += task_util(p);
    // Can provide a better signal than util_est.

dequeue_task_fair()
  util_est_update()
    ue.enqueued = task_util(p);
    // Updates ue.ewma

Thanks,
David

> >
> > >> When you say in the cover letter that you tried uclamp_min, how exactly
> > >> did you use it? Did you run the existing mainline or did you use
> > >> uclamp_min as a replacement for util_guest in this patch here?
> >
> > > I called sched_setattr_nocheck() with .sched_flags =
> > > SCHED_FLAG_UTIL_CLAMP when updating uclamp_min and clamp_max is left
> > > at 1024. Uclamp_min was not aggregated with task_util and
> > > task_util_est during my testing. The only caveat there is that I added
> > > a change to only reset uclamp on fork when testing(I realize there is
> > > specifically a SCHED_FLAG_RESET_ON_FORK, but I didn’t want to reset
> > > other sched attributes).
> >
> > OK, understood. It's essentially a util_est v2 for vCPU tasks on host.
>
> Yup. We initially looked into just overwriting util_est, but didn't
> think that'll land well with the community :) as it was a bit messier
> because we needed to make sure the current util_est update paths don't
> run for vCPU tasks on host (because those values would be wrong).
>
> > >>>  static inline unsigned long task_util_est(struct task_struct *p)
> > >>> @@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > >>>        */
> > >>>       util_est_enqueue(&rq->cfs, p);
> > >>>
> > >>> +     /*
> > >>> +      * The normal code path for host thread enqueue doesn't take into
> > >>> +      * account guest task migrations when updating cpufreq util.
> > >>> +      * So, always update the cpufreq when a vCPU thread has a
> > >>> +      * non-zero util_guest value.
> > >>> +      */
> > >>> +     if (READ_ONCE(p->se.avg.util_guest))
> > >>> +             cpufreq_update_util(rq, 0);
> > >>
> > >>
> > >> This is because enqueue_entity() -> update_load_avg() ->
> > >> attach_entity_load_avg() -> cfs_rq_util_change() requires root run-queue
> > >> (&rq->cfs == cfs_rq) to call cpufreq_update_util()?
> > > The enqueue_entity() would not call into update_load_avg() due to the
> > > check for !se->avg.last_update_time. se->avg.last_update_time is
> > > non-zero because the vCPU task did not migrate before this enqueue.
> > > This enqueue path is reached when util_guest is updated for the vCPU
> > > task through the sched_setattr_nocheck call where we want to ensure a
> > > frequency update occurs.
> >
> > OK, vCPU tasks are pinned so always !WF_MIGRATED wakeup I guess?
>
> Even if say little-vCPU threads are allowed to migrate within little
> CPUs, this will still be an issue. While a vCPU thread is continuously
> running on a single CPU, a guest thread can migrate into that vCPU and
> cause a huge increase in util_guest. But that won't trigger an cpufreq
> update on the host side because the host doesn't see a task migration.
> That's what David is trying to address.
>
> -Saravana
Re: [RFC PATCH 1/6] sched/fair: Add util_guest for tasks
Posted by Quentin Perret 2 years, 10 months ago
On Monday 03 Apr 2023 at 18:11:37 (-0700), David Dai wrote:
> On Mon, Apr 3, 2023 at 4:40 AM Dietmar Eggemann
> <dietmar.eggemann@arm.com> wrote:
> > I can't see why the existing p->uclamp_req[UCLAMP_MIN].value can't be
> > used here instead p->se.avg.util_guest.
> Using p->uclamp_req[UCLAMP_MIN].value would result in folding in
> uclamp values into task_util and task_util_est for all tasks that have
> uclamp values set. The intent of these patches isn’t to modify
> existing uclamp behaviour. Users would also override util values from
> the guest when they set uclamp values.

That shouldn't be a problem if host userspace is responsible for driving
the uclamp values in response to guest frequency requests in the first
place ...

> > I do understand the issue of inheriting uclamp values at fork but don't
> > get the not being `additive` thing. We are at task level here.
> Uclamp values are max aggregated with other tasks at the runqueue
> level when deciding CPU frequency. For example, a vCPU runqueue may
> have an util of 512 that results in setting 512 to uclamp_min on the
> vCPU task. This is insufficient to drive a frequency response if it
> shares the runqueue with another host task running with util of 512 as
> it would result in a clamped util value of 512 at the runqueue(Ex. If
> a guest thread had just migrated onto this vCPU).

Maybe it's a feature rather than a bug?

It's not obvious that giving vCPU threads extra powers that other host
threads don't have is a good idea. The fact that vCPU threads are
limited to what the VMM would be allowed to request for its other
threads is more than desirable. I'd even say it's a requirement.

Thanks,
Quentin