[PATCH] sched/uclamp: Let each sched_class handle uclamp

Hongyan Xia posted 1 patch 11 months, 2 weeks ago
kernel/sched/core.c  | 28 ++--------------------------
kernel/sched/ext.c   |  8 ++++----
kernel/sched/fair.c  | 12 ++++++------
kernel/sched/rt.c    |  8 ++++----
kernel/sched/sched.h |  9 +++++----
5 files changed, 21 insertions(+), 44 deletions(-)
[PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Hongyan Xia 11 months, 2 weeks ago
While delayed dequeue issues were being resolved, uclamp was made out of
sync with cpufreq, especially in enqueue_task().

For example, when a task with uclamp_min goes through enqueue_task() and
updates cpufreq, its uclamp_min won't even be considered in the cpufreq
update. It is only after enqueue that the uclamp_min will be added to rq
buckets, and cpufreq will only pick it up at the next update. This is
very different from the old behavior, where a uclamp value immediately
has an effect at enqueue. Worse, sub classes like fair.c issue cpufreq
updates on utilization changes. If no utilization changes for a while,
the new uclamp will be delayed further.

So, let each sched_class handle uclamp in its own class, in case delayed
dequeue needs further tweaks or there are potential future similar
changes, and make sure uclamp is picked up immediately on enqueue. In
fair.c, we re-use the guard logic for util_est.

Signed-off-by: Hongyan Xia <hongyan.xia2@arm.com>
---
 kernel/sched/core.c  | 28 ++--------------------------
 kernel/sched/ext.c   |  8 ++++----
 kernel/sched/fair.c  | 12 ++++++------
 kernel/sched/rt.c    |  8 ++++----
 kernel/sched/sched.h |  9 +++++----
 5 files changed, 21 insertions(+), 44 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b00f884701a6..2d51608a4c46 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,7 +1745,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 	}
 }
 
-static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
+void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 {
 	enum uclamp_id clamp_id;
 
@@ -1758,12 +1758,6 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 	if (!static_branch_unlikely(&sched_uclamp_used))
 		return;
 
-	if (unlikely(!p->sched_class->uclamp_enabled))
-		return;
-
-	if (p->se.sched_delayed)
-		return;
-
 	for_each_clamp_id(clamp_id)
 		uclamp_rq_inc_id(rq, p, clamp_id);
 
@@ -1772,7 +1766,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
 }
 
-static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
+void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 {
 	enum uclamp_id clamp_id;
 
@@ -1785,12 +1779,6 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 	if (!static_branch_unlikely(&sched_uclamp_used))
 		return;
 
-	if (unlikely(!p->sched_class->uclamp_enabled))
-		return;
-
-	if (p->se.sched_delayed)
-		return;
-
 	for_each_clamp_id(clamp_id)
 		uclamp_rq_dec_id(rq, p, clamp_id);
 }
@@ -2029,8 +2017,6 @@ static void __init init_uclamp(void)
 }
 
 #else /* !CONFIG_UCLAMP_TASK */
-static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
-static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
 static inline void uclamp_fork(struct task_struct *p) { }
 static inline void uclamp_post_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
@@ -2066,11 +2052,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 		update_rq_clock(rq);
 
 	p->sched_class->enqueue_task(rq, p, flags);
-	/*
-	 * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
-	 * ->sched_delayed.
-	 */
-	uclamp_rq_inc(rq, p);
 
 	psi_enqueue(p, flags);
 
@@ -2097,11 +2078,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 	psi_dequeue(p, flags);
 
-	/*
-	 * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
-	 * and mark the task ->sched_delayed.
-	 */
-	uclamp_rq_dec(rq, p);
 	return p->sched_class->dequeue_task(rq, p, flags);
 }
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8857c0709bdd..4521c27f9ab8 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2094,6 +2094,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 {
 	int sticky_cpu = p->scx.sticky_cpu;
 
+	uclamp_rq_inc(rq, p);
+
 	if (enq_flags & ENQUEUE_WAKEUP)
 		rq->scx.flags |= SCX_RQ_IN_WAKEUP;
 
@@ -2181,6 +2183,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
 
 static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
 {
+	uclamp_rq_dec(rq, p);
+
 	if (!(p->scx.flags & SCX_TASK_QUEUED)) {
 		WARN_ON_ONCE(task_runnable(p));
 		return true;
@@ -4456,10 +4460,6 @@ DEFINE_SCHED_CLASS(ext) = {
 	.prio_changed		= prio_changed_scx,
 
 	.update_curr		= update_curr_scx,
-
-#ifdef CONFIG_UCLAMP_TASK
-	.uclamp_enabled		= 1,
-#endif
 };
 
 static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 857808da23d8..7e5a653811ad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 * Let's add the task's estimated utilization to the cfs_rq's
 	 * estimated utilization, before we update schedutil.
 	 */
-	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
+		uclamp_rq_inc(rq, p);
 		util_est_enqueue(&rq->cfs, p);
+	}
 
 	if (flags & ENQUEUE_DELAYED) {
 		requeue_delayed_entity(se);
@@ -7183,8 +7185,10 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
  */
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) {
+		uclamp_rq_dec(rq, p);
 		util_est_dequeue(&rq->cfs, p);
+	}
 
 	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
 	if (dequeue_entities(rq, &p->se, flags) < 0)
@@ -13660,10 +13664,6 @@ DEFINE_SCHED_CLASS(fair) = {
 #ifdef CONFIG_SCHED_CORE
 	.task_is_throttled	= task_is_throttled_fair,
 #endif
-
-#ifdef CONFIG_UCLAMP_TASK
-	.uclamp_enabled		= 1,
-#endif
 };
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4b8e33c615b1..7c0642ea85f2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1471,6 +1471,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
 
+	uclamp_rq_inc(rq, p);
+
 	if (flags & ENQUEUE_WAKEUP)
 		rt_se->timeout = 0;
 
@@ -1487,6 +1489,8 @@ static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
 
+	uclamp_rq_dec(rq, p);
+
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se, flags);
 
@@ -2649,10 +2653,6 @@ DEFINE_SCHED_CLASS(rt) = {
 #ifdef CONFIG_SCHED_CORE
 	.task_is_throttled	= task_is_throttled_rt,
 #endif
-
-#ifdef CONFIG_UCLAMP_TASK
-	.uclamp_enabled		= 1,
-#endif
 };
 
 #ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ab16d3d0e51c..990d87e8d8ed 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2410,10 +2410,6 @@ extern s64 update_curr_common(struct rq *rq);
 
 struct sched_class {
 
-#ifdef CONFIG_UCLAMP_TASK
-	int uclamp_enabled;
-#endif
-
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
 	bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*yield_task)   (struct rq *rq);
@@ -3393,6 +3389,8 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; }
 #ifdef CONFIG_UCLAMP_TASK
 
 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+void uclamp_rq_inc(struct rq *rq, struct task_struct *p);
+void uclamp_rq_dec(struct rq *rq, struct task_struct *p);
 
 static inline unsigned long uclamp_rq_get(struct rq *rq,
 					  enum uclamp_id clamp_id)
@@ -3470,6 +3468,9 @@ uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined)
 
 #else /* !CONFIG_UCLAMP_TASK: */
 
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { };
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { };
+
 static inline unsigned long
 uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
 {
-- 
2.34.1
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Xuewen Yan 11 months, 1 week ago
Hi Hongyan,

On Thu, Feb 27, 2025 at 9:55 PM Hongyan Xia <hongyan.xia2@arm.com> wrote:
>
> While delayed dequeue issues were being resolved, uclamp was made out of
> sync with cpufreq, especially in enqueue_task().
>
> For example, when a task with uclamp_min goes through enqueue_task() and
> updates cpufreq, its uclamp_min won't even be considered in the cpufreq
> update. It is only after enqueue will the uclamp_min be added to rq
> buckets, and cpufreq will only pick it up at the next update. This is
> very different from the old behavior, where a uclamp value immediately
> has an effect at enqueue. Worse, sub classes like fair.c issue cpufreq
> updates on utilization changes. If no utilization changes for a while,
> the new uclamp will be delayed further.
>
> So, let each sched_class handle uclamp in its own class, in case delayed
> dequeue needs further tweaks or there are potential future similar
> changes, and make sure uclamp is picked up immediately on enqueue. In
> fair.c, we re-use the guard logic for util_est.
>
> Signed-off-by: Hongyan Xia <hongyan.xia2@arm.com>
> ---
>  kernel/sched/core.c  | 28 ++--------------------------
>  kernel/sched/ext.c   |  8 ++++----
>  kernel/sched/fair.c  | 12 ++++++------
>  kernel/sched/rt.c    |  8 ++++----
>  kernel/sched/sched.h |  9 +++++----
>  5 files changed, 21 insertions(+), 44 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b00f884701a6..2d51608a4c46 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1745,7 +1745,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
>         }
>  }
>
> -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
> +void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>  {
>         enum uclamp_id clamp_id;
>
> @@ -1758,12 +1758,6 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>         if (!static_branch_unlikely(&sched_uclamp_used))
>                 return;
>
> -       if (unlikely(!p->sched_class->uclamp_enabled))
> -               return;
> -
> -       if (p->se.sched_delayed)
> -               return;
> -
>         for_each_clamp_id(clamp_id)
>                 uclamp_rq_inc_id(rq, p, clamp_id);
>
> @@ -1772,7 +1766,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>                 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
>  }
>
> -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
> +void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>  {
>         enum uclamp_id clamp_id;
>
> @@ -1785,12 +1779,6 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>         if (!static_branch_unlikely(&sched_uclamp_used))
>                 return;
>
> -       if (unlikely(!p->sched_class->uclamp_enabled))
> -               return;
> -
> -       if (p->se.sched_delayed)
> -               return;
> -
>         for_each_clamp_id(clamp_id)
>                 uclamp_rq_dec_id(rq, p, clamp_id);
>  }
> @@ -2029,8 +2017,6 @@ static void __init init_uclamp(void)
>  }
>
>  #else /* !CONFIG_UCLAMP_TASK */
> -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
> -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
>  static inline void uclamp_fork(struct task_struct *p) { }
>  static inline void uclamp_post_fork(struct task_struct *p) { }
>  static inline void init_uclamp(void) { }
> @@ -2066,11 +2052,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
>                 update_rq_clock(rq);
>
>         p->sched_class->enqueue_task(rq, p, flags);
> -       /*
> -        * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
> -        * ->sched_delayed.
> -        */
> -       uclamp_rq_inc(rq, p);
>
>         psi_enqueue(p, flags);
>
> @@ -2097,11 +2078,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
>
>         psi_dequeue(p, flags);
>
> -       /*
> -        * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
> -        * and mark the task ->sched_delayed.
> -        */
> -       uclamp_rq_dec(rq, p);
>         return p->sched_class->dequeue_task(rq, p, flags);
>  }
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 8857c0709bdd..4521c27f9ab8 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -2094,6 +2094,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
>  {
>         int sticky_cpu = p->scx.sticky_cpu;
>
> +       uclamp_rq_inc(rq, p);
> +
>         if (enq_flags & ENQUEUE_WAKEUP)
>                 rq->scx.flags |= SCX_RQ_IN_WAKEUP;
>
> @@ -2181,6 +2183,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
>
>  static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
>  {
> +       uclamp_rq_dec(rq, p);
> +
>         if (!(p->scx.flags & SCX_TASK_QUEUED)) {
>                 WARN_ON_ONCE(task_runnable(p));
>                 return true;
> @@ -4456,10 +4460,6 @@ DEFINE_SCHED_CLASS(ext) = {
>         .prio_changed           = prio_changed_scx,
>
>         .update_curr            = update_curr_scx,
> -
> -#ifdef CONFIG_UCLAMP_TASK
> -       .uclamp_enabled         = 1,
> -#endif
>  };
>
>  static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 857808da23d8..7e5a653811ad 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>          * Let's add the task's estimated utilization to the cfs_rq's
>          * estimated utilization, before we update schedutil.
>          */
> -       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> +       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
> +               uclamp_rq_inc(rq, p);
>                 util_est_enqueue(&rq->cfs, p);
> +       }
>
>         if (flags & ENQUEUE_DELAYED) {
>                 requeue_delayed_entity(se);
> @@ -7183,8 +7185,10 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>   */
>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  {
> -       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
> +       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) {
> +               uclamp_rq_dec(rq, p);
>                 util_est_dequeue(&rq->cfs, p);
> +       }
>
>         util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
>         if (dequeue_entities(rq, &p->se, flags) < 0)
> @@ -13660,10 +13664,6 @@ DEFINE_SCHED_CLASS(fair) = {
>  #ifdef CONFIG_SCHED_CORE
>         .task_is_throttled      = task_is_throttled_fair,
>  #endif
> -
> -#ifdef CONFIG_UCLAMP_TASK
> -       .uclamp_enabled         = 1,
> -#endif
>  };
>
>  #ifdef CONFIG_SCHED_DEBUG
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 4b8e33c615b1..7c0642ea85f2 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1471,6 +1471,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>  {
>         struct sched_rt_entity *rt_se = &p->rt;
>
> +       uclamp_rq_inc(rq, p);
> +
>         if (flags & ENQUEUE_WAKEUP)
>                 rt_se->timeout = 0;
>
> @@ -1487,6 +1489,8 @@ static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>  {
>         struct sched_rt_entity *rt_se = &p->rt;
>
> +       uclamp_rq_dec(rq, p);
> +
>         update_curr_rt(rq);
>         dequeue_rt_entity(rt_se, flags);
>
> @@ -2649,10 +2653,6 @@ DEFINE_SCHED_CLASS(rt) = {
>  #ifdef CONFIG_SCHED_CORE
>         .task_is_throttled      = task_is_throttled_rt,
>  #endif
> -
> -#ifdef CONFIG_UCLAMP_TASK
> -       .uclamp_enabled         = 1,
> -#endif
>  };
>
>  #ifdef CONFIG_RT_GROUP_SCHED
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index ab16d3d0e51c..990d87e8d8ed 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2410,10 +2410,6 @@ extern s64 update_curr_common(struct rq *rq);
>
>  struct sched_class {
>
> -#ifdef CONFIG_UCLAMP_TASK
> -       int uclamp_enabled;
> -#endif

Why delete the uclamp_enabled field?

> -
>         void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
>         bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
>         void (*yield_task)   (struct rq *rq);
> @@ -3393,6 +3389,8 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; }
>  #ifdef CONFIG_UCLAMP_TASK
>
>  unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
> +void uclamp_rq_inc(struct rq *rq, struct task_struct *p);
> +void uclamp_rq_dec(struct rq *rq, struct task_struct *p);
>
>  static inline unsigned long uclamp_rq_get(struct rq *rq,
>                                           enum uclamp_id clamp_id)
> @@ -3470,6 +3468,9 @@ uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined)
>
>  #else /* !CONFIG_UCLAMP_TASK: */
>
> +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { };
> +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { };
> +
>  static inline unsigned long
>  uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
>  {
> --
> 2.34.1
>
>
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Hongyan Xia 11 months, 1 week ago
On 06/03/2025 12:03, Xuewen Yan wrote:
> Hi Hongyan,
> 
> On Thu, Feb 27, 2025 at 9:55 PM Hongyan Xia <hongyan.xia2@arm.com> wrote:
>>
>> While delayed dequeue issues were being resolved, uclamp was made out of
>> sync with cpufreq, especially in enqueue_task().
>>
>> For example, when a task with uclamp_min goes through enqueue_task() and
>> updates cpufreq, its uclamp_min won't even be considered in the cpufreq
>> update. It is only after enqueue will the uclamp_min be added to rq
>> buckets, and cpufreq will only pick it up at the next update. This is
>> very different from the old behavior, where a uclamp value immediately
>> has an effect at enqueue. Worse, sub classes like fair.c issue cpufreq
>> updates on utilization changes. If no utilization changes for a while,
>> the new uclamp will be delayed further.
>>
>> So, let each sched_class handle uclamp in its own class, in case delayed
>> dequeue needs further tweaks or there are potential future similar
>> changes, and make sure uclamp is picked up immediately on enqueue. In
>> fair.c, we re-use the guard logic for util_est.
>>
>> Signed-off-by: Hongyan Xia <hongyan.xia2@arm.com>
>> ---
>>   kernel/sched/core.c  | 28 ++--------------------------
>>   kernel/sched/ext.c   |  8 ++++----
>>   kernel/sched/fair.c  | 12 ++++++------
>>   kernel/sched/rt.c    |  8 ++++----
>>   kernel/sched/sched.h |  9 +++++----
>>   5 files changed, 21 insertions(+), 44 deletions(-)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index b00f884701a6..2d51608a4c46 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -1745,7 +1745,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
>>          }
>>   }
>>
>> -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>> +void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>>   {
>>          enum uclamp_id clamp_id;
>>
>> @@ -1758,12 +1758,6 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>>          if (!static_branch_unlikely(&sched_uclamp_used))
>>                  return;
>>
>> -       if (unlikely(!p->sched_class->uclamp_enabled))
>> -               return;
>> -
>> -       if (p->se.sched_delayed)
>> -               return;
>> -
>>          for_each_clamp_id(clamp_id)
>>                  uclamp_rq_inc_id(rq, p, clamp_id);
>>
>> @@ -1772,7 +1766,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>>                  rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
>>   }
>>
>> -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>> +void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>>   {
>>          enum uclamp_id clamp_id;
>>
>> @@ -1785,12 +1779,6 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>>          if (!static_branch_unlikely(&sched_uclamp_used))
>>                  return;
>>
>> -       if (unlikely(!p->sched_class->uclamp_enabled))
>> -               return;
>> -
>> -       if (p->se.sched_delayed)
>> -               return;
>> -
>>          for_each_clamp_id(clamp_id)
>>                  uclamp_rq_dec_id(rq, p, clamp_id);
>>   }
>> @@ -2029,8 +2017,6 @@ static void __init init_uclamp(void)
>>   }
>>
>>   #else /* !CONFIG_UCLAMP_TASK */
>> -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
>> -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
>>   static inline void uclamp_fork(struct task_struct *p) { }
>>   static inline void uclamp_post_fork(struct task_struct *p) { }
>>   static inline void init_uclamp(void) { }
>> @@ -2066,11 +2052,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
>>                  update_rq_clock(rq);
>>
>>          p->sched_class->enqueue_task(rq, p, flags);
>> -       /*
>> -        * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
>> -        * ->sched_delayed.
>> -        */
>> -       uclamp_rq_inc(rq, p);
>>
>>          psi_enqueue(p, flags);
>>
>> @@ -2097,11 +2078,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
>>
>>          psi_dequeue(p, flags);
>>
>> -       /*
>> -        * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
>> -        * and mark the task ->sched_delayed.
>> -        */
>> -       uclamp_rq_dec(rq, p);
>>          return p->sched_class->dequeue_task(rq, p, flags);
>>   }
>>
>> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
>> index 8857c0709bdd..4521c27f9ab8 100644
>> --- a/kernel/sched/ext.c
>> +++ b/kernel/sched/ext.c
>> @@ -2094,6 +2094,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
>>   {
>>          int sticky_cpu = p->scx.sticky_cpu;
>>
>> +       uclamp_rq_inc(rq, p);
>> +
>>          if (enq_flags & ENQUEUE_WAKEUP)
>>                  rq->scx.flags |= SCX_RQ_IN_WAKEUP;
>>
>> @@ -2181,6 +2183,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
>>
>>   static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
>>   {
>> +       uclamp_rq_dec(rq, p);
>> +
>>          if (!(p->scx.flags & SCX_TASK_QUEUED)) {
>>                  WARN_ON_ONCE(task_runnable(p));
>>                  return true;
>> @@ -4456,10 +4460,6 @@ DEFINE_SCHED_CLASS(ext) = {
>>          .prio_changed           = prio_changed_scx,
>>
>>          .update_curr            = update_curr_scx,
>> -
>> -#ifdef CONFIG_UCLAMP_TASK
>> -       .uclamp_enabled         = 1,
>> -#endif
>>   };
>>
>>   static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 857808da23d8..7e5a653811ad 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>           * Let's add the task's estimated utilization to the cfs_rq's
>>           * estimated utilization, before we update schedutil.
>>           */
>> -       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
>> +       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
>> +               uclamp_rq_inc(rq, p);
>>                  util_est_enqueue(&rq->cfs, p);
>> +       }
>>
>>          if (flags & ENQUEUE_DELAYED) {
>>                  requeue_delayed_entity(se);
>> @@ -7183,8 +7185,10 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>>    */
>>   static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>   {
>> -       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
>> +       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) {
>> +               uclamp_rq_dec(rq, p);
>>                  util_est_dequeue(&rq->cfs, p);
>> +       }
>>
>>          util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
>>          if (dequeue_entities(rq, &p->se, flags) < 0)
>> @@ -13660,10 +13664,6 @@ DEFINE_SCHED_CLASS(fair) = {
>>   #ifdef CONFIG_SCHED_CORE
>>          .task_is_throttled      = task_is_throttled_fair,
>>   #endif
>> -
>> -#ifdef CONFIG_UCLAMP_TASK
>> -       .uclamp_enabled         = 1,
>> -#endif
>>   };
>>
>>   #ifdef CONFIG_SCHED_DEBUG
>> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
>> index 4b8e33c615b1..7c0642ea85f2 100644
>> --- a/kernel/sched/rt.c
>> +++ b/kernel/sched/rt.c
>> @@ -1471,6 +1471,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>>   {
>>          struct sched_rt_entity *rt_se = &p->rt;
>>
>> +       uclamp_rq_inc(rq, p);
>> +
>>          if (flags & ENQUEUE_WAKEUP)
>>                  rt_se->timeout = 0;
>>
>> @@ -1487,6 +1489,8 @@ static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>>   {
>>          struct sched_rt_entity *rt_se = &p->rt;
>>
>> +       uclamp_rq_dec(rq, p);
>> +
>>          update_curr_rt(rq);
>>          dequeue_rt_entity(rt_se, flags);
>>
>> @@ -2649,10 +2653,6 @@ DEFINE_SCHED_CLASS(rt) = {
>>   #ifdef CONFIG_SCHED_CORE
>>          .task_is_throttled      = task_is_throttled_rt,
>>   #endif
>> -
>> -#ifdef CONFIG_UCLAMP_TASK
>> -       .uclamp_enabled         = 1,
>> -#endif
>>   };
>>
>>   #ifdef CONFIG_RT_GROUP_SCHED
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index ab16d3d0e51c..990d87e8d8ed 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -2410,10 +2410,6 @@ extern s64 update_curr_common(struct rq *rq);
>>
>>   struct sched_class {
>>
>> -#ifdef CONFIG_UCLAMP_TASK
>> -       int uclamp_enabled;
>> -#endif
> 
> Why delete the uclamp_enable?
> 

After moving uclamp enqueue dequeue into each sched_class, we no longer 
check sched_class->uclamp_enabled in core.c and it won't be used anywhere.

>> -
>>          void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
>>          bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
>>          void (*yield_task)   (struct rq *rq);
>> @@ -3393,6 +3389,8 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; }
>>   #ifdef CONFIG_UCLAMP_TASK
>>
>>   unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
>> +void uclamp_rq_inc(struct rq *rq, struct task_struct *p);
>> +void uclamp_rq_dec(struct rq *rq, struct task_struct *p);
>>
>>   static inline unsigned long uclamp_rq_get(struct rq *rq,
>>                                            enum uclamp_id clamp_id)
>> @@ -3470,6 +3468,9 @@ uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined)
>>
>>   #else /* !CONFIG_UCLAMP_TASK: */
>>
>> +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { };
>> +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { };
>> +
>>   static inline unsigned long
>>   uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
>>   {
>> --
>> 2.34.1
>>
>>

Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Dietmar Eggemann 11 months, 1 week ago
On 27/02/2025 14:54, Hongyan Xia wrote:

[...]

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 857808da23d8..7e5a653811ad 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	 * Let's add the task's estimated utilization to the cfs_rq's
>  	 * estimated utilization, before we update schedutil.
>  	 */
> -	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> +	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
> +		uclamp_rq_inc(rq, p);
>  		util_est_enqueue(&rq->cfs, p);
> +	}

So you want to have p uclamp-enqueued so that its uclamp_min value
counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
enqueue_task_fair?

  if (p->in_iowait)
    cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);

  enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
  cpufreq_update_util()

But if you do this before requeue_delayed_entity() (1) you will not
uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?

[...]
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Xuewen Yan 11 months, 1 week ago
On Thu, Mar 6, 2025 at 2:24 AM Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
>
> On 27/02/2025 14:54, Hongyan Xia wrote:
>
> [...]
>
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 857808da23d8..7e5a653811ad 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >        * Let's add the task's estimated utilization to the cfs_rq's
> >        * estimated utilization, before we update schedutil.
> >        */
> > -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> > +     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
> > +             uclamp_rq_inc(rq, p);
> >               util_est_enqueue(&rq->cfs, p);
> > +     }
>
> So you want to have p uclamp-enqueued so that its uclamp_min value
> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
> enqueue_task_fair?
>
>   if (p->in_iowait)
>     cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
>
>   enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
>   cpufreq_update_util()
>
> But if you do this before requeue_delayed_entity() (1) you will not
> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
>

Could we change to the following:

when enqueue:

-     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
& ENQUEUE_RESTORE))))
+     if (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)))
+             uclamp_rq_inc(rq, p);
               util_est_enqueue(&rq->cfs, p);
 +     }


when dequeue:

-       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) ||
(flags & DEQUEUE_SAVE))))
+       if (!p->se.sched_delayed) {
+               uclamp_rq_dec(rq, p);
                util_est_dequeue(&rq->cfs, p);
+       }


> [...]
>
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Dietmar Eggemann 11 months, 1 week ago
On 06/03/2025 13:01, Xuewen Yan wrote:
> On Thu, Mar 6, 2025 at 2:24 AM Dietmar Eggemann
> <dietmar.eggemann@arm.com> wrote:
>>
>> On 27/02/2025 14:54, Hongyan Xia wrote:
>>
>> [...]
>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 857808da23d8..7e5a653811ad 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>        * Let's add the task's estimated utilization to the cfs_rq's
>>>        * estimated utilization, before we update schedutil.
>>>        */
>>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
>>> +     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
>>> +             uclamp_rq_inc(rq, p);
>>>               util_est_enqueue(&rq->cfs, p);
>>> +     }
>>
>> So you want to have p uclamp-enqueued so that its uclamp_min value
>> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
>> enqueue_task_fair?
>>
>>   if (p->in_iowait)
>>     cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
>>
>>   enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
>>   cpufreq_update_util()
>>
>> But if you do this before requeue_delayed_entity() (1) you will not
>> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
>>
> 
> Could we change to the following:
> 
> when enqueue:
> 
> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
> & ENQUEUE_RESTORE))))
> +     if (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)))

Why do you want to check ENQUEUE_DELAYED as well here? Doesn't
!p->se.sched_delayed imply !ENQUEUE_DELAYED?

> +             uclamp_rq_inc(rq, p);
>                util_est_enqueue(&rq->cfs, p);
>  +     }
> 
> 
> when dequeue:
> 
> -       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) ||
> (flags & DEQUEUE_SAVE))))
> +       if (!p->se.sched_delayed) {
> +               uclamp_rq_dec(rq, p);
>                 util_est_dequeue(&rq->cfs, p);
> +       }

So you want to exclude all delayed tasks from util_est and uclamp?
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Xuewen Yan 11 months ago
On Sat, Mar 8, 2025 at 2:32 AM Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
>
> On 06/03/2025 13:01, Xuewen Yan wrote:
> > On Thu, Mar 6, 2025 at 2:24 AM Dietmar Eggemann
> > <dietmar.eggemann@arm.com> wrote:
> >>
> >> On 27/02/2025 14:54, Hongyan Xia wrote:
> >>
> >> [...]
> >>
> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>> index 857808da23d8..7e5a653811ad 100644
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >>>        * Let's add the task's estimated utilization to the cfs_rq's
> >>>        * estimated utilization, before we update schedutil.
> >>>        */
> >>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> >>> +     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
> >>> +             uclamp_rq_inc(rq, p);
> >>>               util_est_enqueue(&rq->cfs, p);
> >>> +     }
> >>
> >> So you want to have p uclamp-enqueued so that its uclamp_min value
> >> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
> >> enqueue_task_fair?
> >>
> >>   if (p->in_iowait)
> >>     cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> >>
> >>   enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
> >>   cpufreq_update_util()
> >>
> >> But if you do this before requeue_delayed_entity() (1) you will not
> >> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
> >>
> >
> > Could we change to the following:
> >
> > when enqueue:
> >
> > -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
> > & ENQUEUE_RESTORE))))
> > +     if (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)))
>
> Why you want to check ENQUEUE_DELAYED as well here? Isn't
> !p->se.sched_delayed implying !ENQUEUE_DELAYED).

Indeed, the (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))) is equal to
the  (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags &
ENQUEUE_RESTORE)))).
I just think it might be easier to read using the ENQUEUE_DELAYED flag,
because we only allow enqueuing the uclamp and util_est when waking up the delayed task.


>
> > +             uclamp_rq_inc(rq, p);
> >                util_est_enqueue(&rq->cfs, p);
> >  +     }
> >
> >
> > when dequeue:
> >
> > -       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) ||
> > (flags & DEQUEUE_SAVE))))
> > +       if (!p->se.sched_delayed) {
> > +               uclamp_rq_dec(rq, p);
> >                 util_est_dequeue(&rq->cfs, p);
> > +       }
>
> So you want to exclude all delayed tasks from util_est and uclamp?

Yes, because we had already dequeued the task's uclamp and util_est when it
first went to sleep.
This is achieved by putting the update of uclamp and util_est before
dequeue_entity.
For any other dequeue, we should always ignore it, so there is no need to
check the flags and migration status.

BR
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Dietmar Eggemann 11 months ago
On 10/03/2025 03:41, Xuewen Yan wrote:
> On Sat, Mar 8, 2025 at 2:32 AM Dietmar Eggemann
> <dietmar.eggemann@arm.com> wrote:
>>
>> On 06/03/2025 13:01, Xuewen Yan wrote:
>>> On Thu, Mar 6, 2025 at 2:24 AM Dietmar Eggemann
>>> <dietmar.eggemann@arm.com> wrote:
>>>>
>>>> On 27/02/2025 14:54, Hongyan Xia wrote:
>>>>
>>>> [...]
>>>>
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index 857808da23d8..7e5a653811ad 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>        * Let's add the task's estimated utilization to the cfs_rq's
>>>>>        * estimated utilization, before we update schedutil.
>>>>>        */
>>>>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
>>>>> +     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
>>>>> +             uclamp_rq_inc(rq, p);
>>>>>               util_est_enqueue(&rq->cfs, p);
>>>>> +     }
>>>>
>>>> So you want to have p uclamp-enqueued so that its uclamp_min value
>>>> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
>>>> enqueue_task_fair?
>>>>
>>>>   if (p->in_iowait)
>>>>     cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
>>>>
>>>>   enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
>>>>   cpufreq_update_util()
>>>>
>>>> But if you do this before requeue_delayed_entity() (1) you will not
>>>> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
>>>>
>>>
>>> Could we change to the following:
>>>
>>> when enqueue:
>>>
>>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
>>> & ENQUEUE_RESTORE))))
>>> +     if (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)))
>>
>> Why you want to check ENQUEUE_DELAYED as well here? Isn't
>> !p->se.sched_delayed implying !ENQUEUE_DELAYED).
> 
> Indeed, the (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))) is equal to
> the  (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags &
> ENQUEUE_RESTORE)))).
> I just think it might be easier to read using the ENQUEUE_DELAYED flag.
> Because we only allow enq the uclamp and util_est when wake up the delayed-task.

OK, I see.

So that means we would not have to move the uclamp handling into the sched
classes necessarily, we could use flags in enqueue_task() as well:

-->8--

Subject: [PATCH] Align uclamp and util_est and call before freq update

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---
 kernel/sched/core.c | 14 ++++++++------
 kernel/sched/fair.c |  4 ++--
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b60916d77482..f833108a3b2d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1747,7 +1747,8 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 	}
 }
 
-static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p,
+				 int flags)
 {
 	enum uclamp_id clamp_id;
 
@@ -1763,7 +1764,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;
 
-	if (p->se.sched_delayed)
+	if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))
 		return;
 
 	for_each_clamp_id(clamp_id)
@@ -2067,12 +2068,13 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
-	p->sched_class->enqueue_task(rq, p, flags);
 	/*
-	 * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
-	 * ->sched_delayed.
+	 * Can be before ->enqueue_task() because uclamp considers the
+	 * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared
+	 * in ->enqueue_task().
 	 */
-	uclamp_rq_inc(rq, p);
+	uclamp_rq_inc(rq, p, flags);
+	p->sched_class->enqueue_task(rq, p, flags);
 
 	psi_enqueue(p, flags);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 061a29e88ee2..e26d1dfea601 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6951,7 +6951,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 * Let's add the task's estimated utilization to the cfs_rq's
 	 * estimated utilization, before we update schedutil.
 	 */
-	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+	if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
 		util_est_enqueue(&rq->cfs, p);
 
 	if (flags & ENQUEUE_DELAYED) {
@@ -7193,7 +7193,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
  */
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+	if (!p->se.sched_delayed)
 		util_est_dequeue(&rq->cfs, p);
 
 	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
-- 
2.34.1



































Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Xuewen Yan 11 months ago
Hi Dietmar  and Hongyan,

On Mon, Mar 10, 2025 at 6:53 PM Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
>
> On 10/03/2025 03:41, Xuewen Yan wrote:
> > On Sat, Mar 8, 2025 at 2:32 AM Dietmar Eggemann
> > <dietmar.eggemann@arm.com> wrote:
> >>
> >> On 06/03/2025 13:01, Xuewen Yan wrote:
> >>> On Thu, Mar 6, 2025 at 2:24 AM Dietmar Eggemann
> >>> <dietmar.eggemann@arm.com> wrote:
> >>>>
> >>>> On 27/02/2025 14:54, Hongyan Xia wrote:
> >>>>
> >>>> [...]
> >>>>
> >>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>>>> index 857808da23d8..7e5a653811ad 100644
> >>>>> --- a/kernel/sched/fair.c
> >>>>> +++ b/kernel/sched/fair.c
> >>>>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >>>>>        * Let's add the task's estimated utilization to the cfs_rq's
> >>>>>        * estimated utilization, before we update schedutil.
> >>>>>        */
> >>>>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> >>>>> +     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
> >>>>> +             uclamp_rq_inc(rq, p);
> >>>>>               util_est_enqueue(&rq->cfs, p);
> >>>>> +     }
> >>>>
> >>>> So you want to have p uclamp-enqueued so that its uclamp_min value
> >>>> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
> >>>> enqueue_task_fair?
> >>>>
> >>>>   if (p->in_iowait)
> >>>>     cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> >>>>
> >>>>   enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
> >>>>   cpufreq_update_util()
> >>>>
> >>>> But if you do this before requeue_delayed_entity() (1) you will not
> >>>> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
> >>>>
> >>>
> >>> Could we change to the following:
> >>>
> >>> when enqueue:
> >>>
> >>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
> >>> & ENQUEUE_RESTORE))))
> >>> +     if (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)))
> >>
> >> Why you want to check ENQUEUE_DELAYED as well here? Isn't
> >> !p->se.sched_delayed implying !ENQUEUE_DELAYED).
> >
> > Indeed, the (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))) is equal to
> > the  (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags &
> > ENQUEUE_RESTORE)))).
> > I just think it might be easier to read using the ENQUEUE_DELAYED flag.
> > Because we only allow enq the uclamp and util_est when wake up the delayed-task.
>
> OK, I see.
>
> So that means we would not have to move the uclamp handling into the sched
> classes necessarily, we could use flags in enqueue_task() as well:
>
> -->8--
>
> Subject: [PATCH] Align uclamp and util_est and call before freq update
>
> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
> ---
>  kernel/sched/core.c | 14 ++++++++------
>  kernel/sched/fair.c |  4 ++--
>  2 files changed, 10 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b60916d77482..f833108a3b2d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1747,7 +1747,8 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
>         }
>  }
>
> -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
> +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p,
> +                                int flags)
>  {
>         enum uclamp_id clamp_id;
>
> @@ -1763,7 +1764,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>         if (unlikely(!p->sched_class->uclamp_enabled))
>                 return;
>
> -       if (p->se.sched_delayed)
> +       if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))
>                 return;
>
>         for_each_clamp_id(clamp_id)
> @@ -2067,12 +2068,13 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
>         if (!(flags & ENQUEUE_NOCLOCK))
>                 update_rq_clock(rq);
>
> -       p->sched_class->enqueue_task(rq, p, flags);
>         /*
> -        * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
> -        * ->sched_delayed.
> +        * Can be before ->enqueue_task() because uclamp considers the
> +        * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared
> +        * in ->enqueue_task().
>          */
> -       uclamp_rq_inc(rq, p);
> +       uclamp_rq_inc(rq, p, flags);
> +       p->sched_class->enqueue_task(rq, p, flags);
>
>         psi_enqueue(p, flags);
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 061a29e88ee2..e26d1dfea601 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6951,7 +6951,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>          * Let's add the task's estimated utilization to the cfs_rq's
>          * estimated utilization, before we update schedutil.
>          */
> -       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> +       if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
>                 util_est_enqueue(&rq->cfs, p);
>
>         if (flags & ENQUEUE_DELAYED) {
> @@ -7193,7 +7193,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>   */
>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  {
> -       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
> +       if (!p->se.sched_delayed)
Whether uclamp should be placed in the class might require further
discussion, but at this point, I believe the conditions for util-est
can be simplified.
I discovered another issue with util-est in the past couple of days,
and I have also included the simplification of its condition check in
the same patch.
Could you please help review it?
https://lore.kernel.org/all/20250314090909.8404-1-xuewen.yan@unisoc.com/

Thanks!
BR

>                 util_est_dequeue(&rq->cfs, p);
>
>         util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
> --
> 2.34.1
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Xuewen Yan 11 months ago
Hi Dietmar,

On Mon, Mar 10, 2025 at 6:53 PM Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
>
> On 10/03/2025 03:41, Xuewen Yan wrote:
> > On Sat, Mar 8, 2025 at 2:32 AM Dietmar Eggemann
> > <dietmar.eggemann@arm.com> wrote:
> >>
> >> On 06/03/2025 13:01, Xuewen Yan wrote:
> >>> On Thu, Mar 6, 2025 at 2:24 AM Dietmar Eggemann
> >>> <dietmar.eggemann@arm.com> wrote:
> >>>>
> >>>> On 27/02/2025 14:54, Hongyan Xia wrote:
> >>>>
> >>>> [...]
> >>>>
> >>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>>>> index 857808da23d8..7e5a653811ad 100644
> >>>>> --- a/kernel/sched/fair.c
> >>>>> +++ b/kernel/sched/fair.c
> >>>>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >>>>>        * Let's add the task's estimated utilization to the cfs_rq's
> >>>>>        * estimated utilization, before we update schedutil.
> >>>>>        */
> >>>>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> >>>>> +     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
> >>>>> +             uclamp_rq_inc(rq, p);
> >>>>>               util_est_enqueue(&rq->cfs, p);
> >>>>> +     }
> >>>>
> >>>> So you want to have p uclamp-enqueued so that its uclamp_min value
> >>>> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
> >>>> enqueue_task_fair?
> >>>>
> >>>>   if (p->in_iowait)
> >>>>     cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> >>>>
> >>>>   enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
> >>>>   cpufreq_update_util()
> >>>>
> >>>> But if you do this before requeue_delayed_entity() (1) you will not
> >>>> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
> >>>>
> >>>
> >>> Could we change to the following:
> >>>
> >>> when enqueue:
> >>>
> >>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
> >>> & ENQUEUE_RESTORE))))
> >>> +     if (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)))
> >>
> >> Why you want to check ENQUEUE_DELAYED as well here? Isn't
> >> !p->se.sched_delayed implying !ENQUEUE_DELAYED).
> >
> > Indeed, the (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))) is equal to
> > the  (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags &
> > ENQUEUE_RESTORE)))).
> > I just think it might be easier to read using the ENQUEUE_DELAYED flag.
> > Because we only allow enq the uclamp and util_est when wake up the delayed-task.
>
> OK, I see.
>
> So that means we would not have to move the uclamp handling into the sched
> classes necessarily, we could use flags in enqueue_task() as well:
>
> -->8--
>
> Subject: [PATCH] Align uclamp and util_est and call before freq update
>
> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
> ---
>  kernel/sched/core.c | 14 ++++++++------
>  kernel/sched/fair.c |  4 ++--
>  2 files changed, 10 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b60916d77482..f833108a3b2d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1747,7 +1747,8 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
>         }
>  }
>
> -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
> +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p,
> +                                int flags)
>  {
>         enum uclamp_id clamp_id;
>
> @@ -1763,7 +1764,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>         if (unlikely(!p->sched_class->uclamp_enabled))
>                 return;
>
> -       if (p->se.sched_delayed)
> +       if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))
>                 return;
>
>         for_each_clamp_id(clamp_id)
> @@ -2067,12 +2068,13 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
>         if (!(flags & ENQUEUE_NOCLOCK))
>                 update_rq_clock(rq);
>
> -       p->sched_class->enqueue_task(rq, p, flags);
>         /*
> -        * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
> -        * ->sched_delayed.
> +        * Can be before ->enqueue_task() because uclamp considers the
> +        * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared
> +        * in ->enqueue_task().
>          */
> -       uclamp_rq_inc(rq, p);
> +       uclamp_rq_inc(rq, p, flags);
> +       p->sched_class->enqueue_task(rq, p, flags);
>
>         psi_enqueue(p, flags);
>

I submitted a patch similar to yours before:

https://lore.kernel.org/all/CAB8ipk_AvaOWp9QhmnFDdbFSWcKLhCH151=no6kRO2z+pSJfyQ@mail.gmail.com/

And Hongyan fears that more complexity, like delayed dequeue, will go
into each sched_class, so it's better to just let each sched_class
handle how uclamp is enqueued and dequeued within itself rather than
leaking into core.c.


> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 061a29e88ee2..e26d1dfea601 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6951,7 +6951,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>          * Let's add the task's estimated utilization to the cfs_rq's
>          * estimated utilization, before we update schedutil.
>          */
> -       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
> +       if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
>                 util_est_enqueue(&rq->cfs, p);
>
>         if (flags & ENQUEUE_DELAYED) {
> @@ -7193,7 +7193,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>   */
>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  {
> -       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
> +       if (!p->se.sched_delayed)
>                 util_est_dequeue(&rq->cfs, p);
>
>         util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
> --
> 2.34.1
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Dietmar Eggemann 11 months ago
On 10/03/2025 12:03, Xuewen Yan wrote:
> Hi Dietmar,
> 
> On Mon, Mar 10, 2025 at 6:53 PM Dietmar Eggemann
> <dietmar.eggemann@arm.com> wrote:
>>
>> On 10/03/2025 03:41, Xuewen Yan wrote:
>>> On Sat, Mar 8, 2025 at 2:32 AM Dietmar Eggemann
>>> <dietmar.eggemann@arm.com> wrote:
>>>>
>>>> On 06/03/2025 13:01, Xuewen Yan wrote:
>>>>> On Thu, Mar 6, 2025 at 2:24 AM Dietmar Eggemann
>>>>> <dietmar.eggemann@arm.com> wrote:
>>>>>>
>>>>>> On 27/02/2025 14:54, Hongyan Xia wrote:
>>>>>>
>>>>>> [...]
>>>>>>
>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>>> index 857808da23d8..7e5a653811ad 100644
>>>>>>> --- a/kernel/sched/fair.c
>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>>        * Let's add the task's estimated utilization to the cfs_rq's
>>>>>>>        * estimated utilization, before we update schedutil.
>>>>>>>        */
>>>>>>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
>>>>>>> +     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
>>>>>>> +             uclamp_rq_inc(rq, p);
>>>>>>>               util_est_enqueue(&rq->cfs, p);
>>>>>>> +     }
>>>>>>
>>>>>> So you want to have p uclamp-enqueued so that its uclamp_min value
>>>>>> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
>>>>>> enqueue_task_fair?
>>>>>>
>>>>>>   if (p->in_iowait)
>>>>>>     cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
>>>>>>
>>>>>>   enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
>>>>>>   cpufreq_update_util()
>>>>>>
>>>>>> But if you do this before requeue_delayed_entity() (1) you will not
>>>>>> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
>>>>>>
>>>>>
>>>>> Could we change to the following:
>>>>>
>>>>> when enqueue:
>>>>>
>>>>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
>>>>> & ENQUEUE_RESTORE))))
>>>>> +     if (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)))
>>>>
>>>> Why you want to check ENQUEUE_DELAYED as well here? Isn't
>>>> !p->se.sched_delayed implying !ENQUEUE_DELAYED).
>>>
>>> Indeed, the (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))) is equal to
>>> the  (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags &
>>> ENQUEUE_RESTORE)))).
>>> I just think it might be easier to read using the ENQUEUE_DELAYED flag.
>>> Because we only allow enq the uclamp and util_est when wake up the delayed-task.
>>
>> OK, I see.
>>
>> So that means we would not have to move the uclamp handling into the sched
>> classes necessarily, we could use flags in enqueue_task() as well:
>>
>> -->8--
>>
>> Subject: [PATCH] Align uclamp and util_est and call before freq update
>>
>> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
>> ---
>>  kernel/sched/core.c | 14 ++++++++------
>>  kernel/sched/fair.c |  4 ++--
>>  2 files changed, 10 insertions(+), 8 deletions(-)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index b60916d77482..f833108a3b2d 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -1747,7 +1747,8 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
>>         }
>>  }
>>
>> -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>> +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p,
>> +                                int flags)
>>  {
>>         enum uclamp_id clamp_id;
>>
>> @@ -1763,7 +1764,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>>         if (unlikely(!p->sched_class->uclamp_enabled))
>>                 return;
>>
>> -       if (p->se.sched_delayed)
>> +       if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))
>>                 return;
>>
>>         for_each_clamp_id(clamp_id)
>> @@ -2067,12 +2068,13 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
>>         if (!(flags & ENQUEUE_NOCLOCK))
>>                 update_rq_clock(rq);
>>
>> -       p->sched_class->enqueue_task(rq, p, flags);
>>         /*
>> -        * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
>> -        * ->sched_delayed.
>> +        * Can be before ->enqueue_task() because uclamp considers the
>> +        * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared
>> +        * in ->enqueue_task().
>>          */
>> -       uclamp_rq_inc(rq, p);
>> +       uclamp_rq_inc(rq, p, flags);
>> +       p->sched_class->enqueue_task(rq, p, flags);
>>
>>         psi_enqueue(p, flags);
>>
> 
> I submitted a patch similar to yours before:
> 
> https://lore.kernel.org/all/CAB8ipk_AvaOWp9QhmnFDdbFSWcKLhCH151=no6kRO2z+pSJfyQ@mail.gmail.com/
> 
> And Hongyan fears that as more complexity goes into each sched_class
> like delayed dequeue,
> so it's better to just let the sched_class handle how uclamp is
> enqueued and dequeued within itself rather than leaking into core.c.

Ah, OK. Your patch didn't have 'sched' in the subject so I didn't see it
immediately.

I would prefer that uclamp stays in core.c. ENQUEUE_DELAYED among all
the other flags is already used there (ttwu_runnable()).

task_struct contains sched_{,rt_,dl_}entity. We just have to be
careful when switching policies.
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Hongyan Xia 11 months ago
On 10/03/2025 11:22, Dietmar Eggemann wrote:
> On 10/03/2025 12:03, Xuewen Yan wrote:
>> Hi Dietmar,
>>
>> On Mon, Mar 10, 2025 at 6:53 PM Dietmar Eggemann
>> <dietmar.eggemann@arm.com> wrote:
>>>
>>> On 10/03/2025 03:41, Xuewen Yan wrote:
>>>> On Sat, Mar 8, 2025 at 2:32 AM Dietmar Eggemann
>>>> <dietmar.eggemann@arm.com> wrote:
>>>>>
>>>>> On 06/03/2025 13:01, Xuewen Yan wrote:
>>>>>> On Thu, Mar 6, 2025 at 2:24 AM Dietmar Eggemann
>>>>>> <dietmar.eggemann@arm.com> wrote:
>>>>>>>
>>>>>>> On 27/02/2025 14:54, Hongyan Xia wrote:
>>>>>>>
>>>>>>> [...]
>>>>>>>
>>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>>>> index 857808da23d8..7e5a653811ad 100644
>>>>>>>> --- a/kernel/sched/fair.c
>>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>>>         * Let's add the task's estimated utilization to the cfs_rq's
>>>>>>>>         * estimated utilization, before we update schedutil.
>>>>>>>>         */
>>>>>>>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
>>>>>>>> +     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
>>>>>>>> +             uclamp_rq_inc(rq, p);
>>>>>>>>                util_est_enqueue(&rq->cfs, p);
>>>>>>>> +     }
>>>>>>>
>>>>>>> So you want to have p uclamp-enqueued so that its uclamp_min value
>>>>>>> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
>>>>>>> enqueue_task_fair?
>>>>>>>
>>>>>>>    if (p->in_iowait)
>>>>>>>      cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
>>>>>>>
>>>>>>>    enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
>>>>>>>    cpufreq_update_util()
>>>>>>>
>>>>>>> But if you do this before requeue_delayed_entity() (1) you will not
>>>>>>> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
>>>>>>>
>>>>>>
>>>>>> Could we change to the following:
>>>>>>
>>>>>> when enqueue:
>>>>>>
>>>>>> -     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
>>>>>> & ENQUEUE_RESTORE))))
>>>>>> +     if (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)))
>>>>>
>>>>> Why you want to check ENQUEUE_DELAYED as well here? Isn't
>>>>> !p->se.sched_delayed implying !ENQUEUE_DELAYED).
>>>>
>>>> Indeed, the (!(p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))) is equal to
>>>> the  (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags &
>>>> ENQUEUE_RESTORE)))).
>>>> I just think it might be easier to read using the ENQUEUE_DELAYED flag.
>>>> Because we only allow enq the uclamp and util_est when wake up the delayed-task.
>>>
>>> OK, I see.
>>>
>>> So that means we would not have to move the uclamp handling into the sched
>>> classes necessarily, we could use flags in enqueue_task() as well:
>>>
>>> -->8--
>>>
>>> Subject: [PATCH] Align uclamp and util_est and call before freq update
>>>
>>> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
>>> ---
>>>   kernel/sched/core.c | 14 ++++++++------
>>>   kernel/sched/fair.c |  4 ++--
>>>   2 files changed, 10 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>> index b60916d77482..f833108a3b2d 100644
>>> --- a/kernel/sched/core.c
>>> +++ b/kernel/sched/core.c
>>> @@ -1747,7 +1747,8 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
>>>          }
>>>   }
>>>
>>> -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>>> +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p,
>>> +                                int flags)
>>>   {
>>>          enum uclamp_id clamp_id;
>>>
>>> @@ -1763,7 +1764,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>>>          if (unlikely(!p->sched_class->uclamp_enabled))
>>>                  return;
>>>
>>> -       if (p->se.sched_delayed)
>>> +       if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))
>>>                  return;
>>>
>>>          for_each_clamp_id(clamp_id)
>>> @@ -2067,12 +2068,13 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
>>>          if (!(flags & ENQUEUE_NOCLOCK))
>>>                  update_rq_clock(rq);
>>>
>>> -       p->sched_class->enqueue_task(rq, p, flags);
>>>          /*
>>> -        * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
>>> -        * ->sched_delayed.
>>> +        * Can be before ->enqueue_task() because uclamp considers the
>>> +        * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared
>>> +        * in ->enqueue_task().
>>>           */
>>> -       uclamp_rq_inc(rq, p);
>>> +       uclamp_rq_inc(rq, p, flags);
>>> +       p->sched_class->enqueue_task(rq, p, flags);
>>>
>>>          psi_enqueue(p, flags);
>>>
>>
>> I submitted a patch similar to yours before:
>>
>> https://lore.kernel.org/all/CAB8ipk_AvaOWp9QhmnFDdbFSWcKLhCH151=no6kRO2z+pSJfyQ@mail.gmail.com/
>>
>> And Hongyan fears that as more complexity goes into each sched_class
>> like delayed dequeue,
>> so it's better to just let the sched_class handle how uclamp is
>> enqueued and dequeued within itself rather than leaking into core.c.
> 
> Ah, OK. Your patch didn't have 'sched' in the subject so I didn't see it
> immediately.
> 
> I would prefer that uclamp stays in core.c. ENQUEUE_DELAYED among all
> the other flags is already used there (ttwu_runnable()).
> 
> task_struct contains sched_{,rt_,dl_}entity. We just have to be
> careful when switching policies.

I lean towards letting each class handle uclamp. We've seen the trouble 
with delayed dequeue. Just like the if condition we have for util_est, 
if uclamp is in each class then we can re-use the condition easily, 
otherwise we need to carefully synchronize the enqueue/dequeue between 
core.c and the sub class.

Also I think so far we are assuming delayed dequeue is the only trouble 
maker. If RT and sched_ext have their own corner cases (I think maybe 
sched_ext is likely because it may eventually want the ext scheduler to 
be able to decide on uclamp itself) then the uclamp inc/dec in core.c 
need to cater for that as well. Once a task is in a class, the variables 
in another class may be in an undefined state, so checking corner cases 
for all the sub-classes in a centralized place like core.c may not even 
be easy to get right.
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Dietmar Eggemann 11 months ago
On 10/03/2025 12:56, Hongyan Xia wrote:
> On 10/03/2025 11:22, Dietmar Eggemann wrote:
>> On 10/03/2025 12:03, Xuewen Yan wrote:
>>> Hi Dietmar,
>>>
>>> On Mon, Mar 10, 2025 at 6:53 PM Dietmar Eggemann
>>> <dietmar.eggemann@arm.com> wrote:
>>>>
>>>> On 10/03/2025 03:41, Xuewen Yan wrote:
>>>>> On Sat, Mar 8, 2025 at 2:32 AM Dietmar Eggemann
>>>>> <dietmar.eggemann@arm.com> wrote:
>>>>>>
>>>>>> On 06/03/2025 13:01, Xuewen Yan wrote:
>>>>>>> On Thu, Mar 6, 2025 at 2:24 AM Dietmar Eggemann
>>>>>>> <dietmar.eggemann@arm.com> wrote:
>>>>>>>>
>>>>>>>> On 27/02/2025 14:54, Hongyan Xia wrote:

[...]

>>> I submitted a patch similar to yours before:
>>>
>>> https://lore.kernel.org/all/CAB8ipk_AvaOWp9QhmnFDdbFSWcKLhCH151=no6kRO2z+pSJfyQ@mail.gmail.com/
>>>
>>> And Hongyan fears that as more complexity goes into each sched_class
>>> like delayed dequeue,
>>> so it's better to just let the sched_class handle how uclamp is
>>> enqueued and dequeued within itself rather than leaking into core.c.
>>
>> Ah, OK. Your patch didn't have 'sched' in the subject so I didn't see it
>> immediately.
>>
>> I would prefer that uclamp stays in core.c. ENQUEUE_DELAYED among all
>> the other flags is already used there (ttwu_runnable()).
>>
>> task_struct contains sched_{,rt_,dl_}entity. We just have to be
>> careful when switching policies.
> 
> I lean towards letting each class handle uclamp. We've seen the trouble
> with delayed dequeue. Just like the if condition we have for util_est,
> if uclamp is in each class then we can re-use the condition easily,
> otherwise we need to carefully synchronize the enqueue/dequeue between
> core.c and the sub class.
> 
> Also I think so far we are assuming delayed dequeue is the only trouble
> maker. If RT and sched_ext have their own corner cases (I think maybe
> sched_ext is likely because it may eventually want the ext scheduler to
> be able to decide on uclamp itself) then the uclamp inc/dec in core.c
> need to cater for that as well. Once a task is in a class, the variables
> in another class may be in an undefined state, so checking corner cases
> for all the sub-classes in a centralized place like core.c may not even
> be easy to get right.

I do understand your concern with sched_ext but I still prefer the less
invasive change to get uclamp & util_est aligned for fair.c.

AFAICS, related to policy changes, we have a
SCHED_WARN_ON(p->se.sched_delayed) in switched_to_fair() so far.
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Hongyan Xia 11 months ago
On 10/03/2025 12:24, Dietmar Eggemann wrote:
> On 10/03/2025 12:56, Hongyan Xia wrote:
>> On 10/03/2025 11:22, Dietmar Eggemann wrote:
>>> On 10/03/2025 12:03, Xuewen Yan wrote:
>>>> Hi Dietmar,
>>>>
>>>> On Mon, Mar 10, 2025 at 6:53 PM Dietmar Eggemann
>>>> <dietmar.eggemann@arm.com> wrote:
>>>>>
>>>>> On 10/03/2025 03:41, Xuewen Yan wrote:
>>>>>> On Sat, Mar 8, 2025 at 2:32 AM Dietmar Eggemann
>>>>>> <dietmar.eggemann@arm.com> wrote:
>>>>>>>
>>>>>>> On 06/03/2025 13:01, Xuewen Yan wrote:
>>>>>>>> On Thu, Mar 6, 2025 at 2:24 AM Dietmar Eggemann
>>>>>>>> <dietmar.eggemann@arm.com> wrote:
>>>>>>>>>
>>>>>>>>> On 27/02/2025 14:54, Hongyan Xia wrote:
> 
> [...]
> 
>>>> I submitted a patch similar to yours before:
>>>>
>>>> https://lore.kernel.org/all/CAB8ipk_AvaOWp9QhmnFDdbFSWcKLhCH151=no6kRO2z+pSJfyQ@mail.gmail.com/
>>>>
>>>> And Hongyan fears that as more complexity goes into each sched_class
>>>> like delayed dequeue,
>>>> so it's better to just let the sched_class handle how uclamp is
>>>> enqueued and dequeued within itself rather than leaking into core.c.
>>>
>>> Ah, OK. Your patch didn't have 'sched' in the subject so I didn't see it
>>> immediately.
>>>
>>> I would prefer that uclamp stays in core.c. ENQUEUE_DELAYED among all
>>> the other flags is already used there (ttwu_runnable()).
>>>
>>> task_struct contains sched_{,rt_,dl_}entity. We just have to be
>>> careful when switching policies.
>>
>> I lean towards letting each class handle uclamp. We've seen the trouble
>> with delayed dequeue. Just like the if condition we have for util_est,
>> if uclamp is in each class then we can re-use the condition easily,
>> otherwise we need to carefully synchronize the enqueue/dequeue between
>> core.c and the sub class.
>>
>> Also I think so far we are assuming delayed dequeue is the only trouble
>> maker. If RT and sched_ext have their own corner cases (I think maybe
>> sched_ext is likely because it may eventually want the ext scheduler to
>> be able to decide on uclamp itself) then the uclamp inc/dec in core.c
>> need to cater for that as well. Once a task is in a class, the variables
>> in another class may be in an undefined state, so checking corner cases
>> for all the sub-classes in a centralized place like core.c may not even
>> be easy to get right.
> 
> I do understand your concern with sched_ext but I still prefer the less
> invasive change to get uclamp & util_est aligned for fair.c.
> 
> AFAICS, related to policy changes, we have a
> SCHED_WARN_ON(p->se.sched_delayed) in switched_to_fair() so far.

I'm okay with staying in core.c, but care is needed to make sure uclamp 
inc and dec are balanced. We have been bitten by the unbalanced uclamp 
and util_est during delayed dequeue fixes and we finally made sure 
util_est is balanced (at least so far it seems). Reusing the same logic 
and moving uclamp into fair.c makes sure we don't get bitten again and 
is one reason why I prefer moving into each class.
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Hongyan Xia 11 months, 1 week ago
On 05/03/2025 18:22, Dietmar Eggemann wrote:
> On 27/02/2025 14:54, Hongyan Xia wrote:
> 
> [...]
> 
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 857808da23d8..7e5a653811ad 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>   	 * Let's add the task's estimated utilization to the cfs_rq's
>>   	 * estimated utilization, before we update schedutil.
>>   	 */
>> -	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
>> +	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
>> +		uclamp_rq_inc(rq, p);
>>   		util_est_enqueue(&rq->cfs, p);
>> +	}
> 
> So you want to have p uclamp-enqueued so that its uclamp_min value
> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
> enqueue_task_fair?
> 
>    if (p->in_iowait)
>      cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
> 
>    enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
>    cpufreq_update_util()
> 
> But if you do this before requeue_delayed_entity() (1) you will not
> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?

Sorry I'm not sure I'm following. The only condition of the 
uclamp_rq_inc() here should be

     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & 
ENQUEUE_RESTORE))))

Could you elaborate why it doesn't get enqueued?
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Dietmar Eggemann 11 months, 1 week ago
On 06/03/2025 11:53, Hongyan Xia wrote:
> On 05/03/2025 18:22, Dietmar Eggemann wrote:
>> On 27/02/2025 14:54, Hongyan Xia wrote:
>>
>> [...]
>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 857808da23d8..7e5a653811ad 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct
>>> task_struct *p, int flags)
>>>        * Let's add the task's estimated utilization to the cfs_rq's
>>>        * estimated utilization, before we update schedutil.
>>>        */
>>> -    if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
>>> & ENQUEUE_RESTORE))))
>>> +    if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
>>> & ENQUEUE_RESTORE)))) {
>>> +        uclamp_rq_inc(rq, p);
>>>           util_est_enqueue(&rq->cfs, p);
>>> +    }
>>
>> So you want to have p uclamp-enqueued so that its uclamp_min value
>> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
>> enqueue_task_fair?
>>
>>    if (p->in_iowait)
>>      cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
>>
>>    enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
>>    cpufreq_update_util()
>>
>> But if you do this before requeue_delayed_entity() (1) you will not
>> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
> 
> Sorry I'm not sure I'm following. The only condition of the
> uclamp_rq_inc() here should be
> 
>     if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags &
> ENQUEUE_RESTORE))))
> 
> Could you elaborate why it doesn't get enqueued?

Let's say 'p->se.sched_delayed = 1' and we are in

enqueue_task()

  enqueue_task_fair()

    if (!(p->se.sched_delayed && ...)

      uclamp_rq_inc(rq, p);

So p wouldn't be included here.

But then p would be requeued in

      requeue_delayed_entity(se)

since you removed the uclamp_rq_inc() from enqueue_task() (after the
p->sched_class->enqueue_task) p will not be considered for uclamp.

[...]
Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Hongyan Xia 11 months, 1 week ago
On 06/03/2025 13:48, Dietmar Eggemann wrote:
> On 06/03/2025 11:53, Hongyan Xia wrote:
>> On 05/03/2025 18:22, Dietmar Eggemann wrote:
>>> On 27/02/2025 14:54, Hongyan Xia wrote:
>>>
>>> [...]
>>>
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index 857808da23d8..7e5a653811ad 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct
>>>> task_struct *p, int flags)
>>>>         * Let's add the task's estimated utilization to the cfs_rq's
>>>>         * estimated utilization, before we update schedutil.
>>>>         */
>>>> -    if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
>>>> & ENQUEUE_RESTORE))))
>>>> +    if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
>>>> & ENQUEUE_RESTORE)))) {
>>>> +        uclamp_rq_inc(rq, p);
>>>>            util_est_enqueue(&rq->cfs, p);
>>>> +    }
>>>
>>> So you want to have p uclamp-enqueued so that its uclamp_min value
>>> counts for the cpufreq_update_util()/cfs_rq_util_change() calls later in
>>> enqueue_task_fair?
>>>
>>>     if (p->in_iowait)
>>>       cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
>>>
>>>     enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
>>>     cpufreq_update_util()
>>>
>>> But if you do this before requeue_delayed_entity() (1) you will not
>>> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
>>
>> Sorry I'm not sure I'm following. The only condition of the
>> uclamp_rq_inc() here should be
>>
>>      if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags &
>> ENQUEUE_RESTORE))))
>>
>> Could you elaborate why it doesn't get enqueued?
> 
> Let's say 'p->se.sched_delayed = 1' and we are in
> 
> enqueue_task()
> 
>    enqueue_task_fair()
> 
>      if (!(p->se.sched_delayed && ...)
> 
>        uclamp_rq_inc(rq, p);
> 
> So p wouldn't be included here.
> 
> But then p would be requeued in
> 
>        requeue_delayed_entity(se)
> 
> since you removed the uclamp_rq_inc() from enqueue_task() (after the
> p->sched_class->enqueue_task) p will not be considered for uclamp.
> 

I doubt this would be a concern as there are other conditions after 
checking p->se.sched_delayed. You would only skip the uclamp inc if you 
are both sched_delayed and meet the second part of the if.

Another reason is that, I think whatever we do should be consistent with 
what we did for util_est. If util_est also affects cpufreq then I doubt 
uclamp should be enqueue/dequeued differently, as it would be difficult 
to argue why sometimes util_est affects cpufreq while uclamp doesn't and 
why sometimes uclamp does and util_est doesn't.

Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
Posted by Hongyan Xia 11 months, 1 week ago
On 06/03/2025 14:26, Hongyan Xia wrote:
> On 06/03/2025 13:48, Dietmar Eggemann wrote:
>> On 06/03/2025 11:53, Hongyan Xia wrote:
>>> On 05/03/2025 18:22, Dietmar Eggemann wrote:
>>>> On 27/02/2025 14:54, Hongyan Xia wrote:
>>>>
>>>> [...]
>>>>
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index 857808da23d8..7e5a653811ad 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct
>>>>> task_struct *p, int flags)
>>>>>         * Let's add the task's estimated utilization to the cfs_rq's
>>>>>         * estimated utilization, before we update schedutil.
>>>>>         */
>>>>> -    if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
>>>>> & ENQUEUE_RESTORE))))
>>>>> +    if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags
>>>>> & ENQUEUE_RESTORE)))) {
>>>>> +        uclamp_rq_inc(rq, p);
>>>>>            util_est_enqueue(&rq->cfs, p);
>>>>> +    }
>>>>
>>>> So you want to have p uclamp-enqueued so that its uclamp_min value
>>>> counts for the cpufreq_update_util()/cfs_rq_util_change() calls 
>>>> later in
>>>> enqueue_task_fair?
>>>>
>>>>     if (p->in_iowait)
>>>>       cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
>>>>
>>>>     enqueue_entity() -> update_load_avg() -> cfs_rq_util_change() ->
>>>>     cpufreq_update_util()
>>>>
>>>> But if you do this before requeue_delayed_entity() (1) you will not
>>>> uclamp-enqueue p which got his ->sched_delayed just cleared in (1)?
>>>
>>> Sorry I'm not sure I'm following. The only condition of the
>>> uclamp_rq_inc() here should be
>>>
>>>      if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags &
>>> ENQUEUE_RESTORE))))
>>>
>>> Could you elaborate why it doesn't get enqueued?
>>
>> Let's say 'p->se.sched_delayed = 1' and we are in
>>
>> enqueue_task()
>>
>>    enqueue_task_fair()
>>
>>      if (!(p->se.sched_delayed && ...)
>>
>>        uclamp_rq_inc(rq, p);
>>
>> So p wouldn't be included here.
>>
>> But then p would be requeued in
>>
>>        requeue_delayed_entity(se)
>>
>> since you removed the uclamp_rq_inc() from enqueue_task() (after the
>> p->sched_class->enqueue_task) p will not be considered for uclamp.
>>
> 
> I doubt this would be a concern as there are other conditions after 
> checking p->se.sched_delayed. You would only skip the uclamp inc if you 
> are both sched_delayed and meet the second part of the if.
> 
> Another reason is that, I think whatever we do should be consistent with 
> what we did for util_est. If util_est also affects cpufreq then I doubt 
> uclamp should be enqueue/dequeued differently, as it would be difficult 
> to argue why sometimes util_est affects cpufreq while uclamp doesn't and 
> why sometimes uclamp does and util_est doesn't.
> 

Sorry what I just said might be a bit misleading. What I wanted to say 
was that util_est and uclamp should be in sync, but exactly when to 
enqueue or dequeue these two can be changed.

Do we want to first agree on one thing, which is, should we keep the 
util_est and uclamp on the rq while a task is being delayed?