[PATCH 2/3] sched/fair: Generalize misfit lb by adding a misfit reason

Posted by Qais Yousef 2 years ago
MISFIT_PERF is what is currently implemented. It indicates that the task
requires moving to a more performant CPU.

Guard misfit handling in find_busiest_queue() and update_sd_pick_busiest()
with MISFIT_PERF, since they explicitly assume this type of misfit.

This generalizes misfit load balancing so that more misfit types can be
added later, such as MISFIT_POWER to make uclamp_max more effective, and
MISFIT_LATENCY to help latency-sensitive tasks spread out in
oversubscribed conditions.
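
For illustration only (MISFIT_POWER here is hypothetical, not part of
this patch), a new reason would extend the enum and be reported from
is_misfit_task():

	typedef enum misfit_reason {
		MISFIT_PERF,	/* Requires moving to a more performant CPU */
		MISFIT_POWER,	/* Hypothetical: should move to a smaller CPU */
	} misfit_reason_t;

Sites that care about one specific type of misfit then check the reason
explicitly instead of assuming every misfit is MISFIT_PERF.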

Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
---
 kernel/sched/fair.c  | 28 +++++++++++++++++++++++-----
 kernel/sched/sched.h |  8 ++++++++
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eb9e891182cc..dd49b89a6e3e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5063,7 +5063,8 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
 	return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
 }
 
-static inline int is_misfit_task(struct task_struct *p, struct rq *rq)
+static inline int is_misfit_task(struct task_struct *p, struct rq *rq,
+				 misfit_reason_t *reason)
 {
 	if (!p || p->nr_cpus_allowed == 1)
 		return 0;
@@ -5071,16 +5072,21 @@ static inline int is_misfit_task(struct task_struct *p, struct rq *rq)
 	if (task_fits_cpu(p, cpu_of(rq)))
 		return 0;
 
+	if (reason)
+		*reason = MISFIT_PERF;
 	return 1;
 }
 
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 {
+	misfit_reason_t reason;
+
 	if (!sched_asym_cpucap_active())
 		return;
 
-	if (!is_misfit_task(p, rq)) {
+	if (!is_misfit_task(p, rq, &reason)) {
 		rq->misfit_task_load = 0;
+		rq->misfit_reason = -1;
 		return;
 	}
 
@@ -5089,6 +5095,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 	 * task_h_load() returns 0.
 	 */
 	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
+	rq->misfit_reason = reason;
 }
 
 #else /* CONFIG_SMP */
@@ -9111,7 +9118,7 @@ static int detach_tasks(struct lb_env *env)
 
 		case migrate_misfit:
 			/* This is not a misfit task */
-			if (!is_misfit_task(p, cpu_rq(env->src_cpu)))
+			if (!is_misfit_task(p, cpu_rq(env->src_cpu), NULL))
 				goto next;
 
 			env->imbalance = 0;
@@ -9426,6 +9433,7 @@ struct sg_lb_stats {
 	unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
 	unsigned int group_smt_balance;  /* Task on busy SMT be moved */
 	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
+	misfit_reason_t group_misfit_reason;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
@@ -9904,6 +9912,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			/* Check for a misfit task on the cpu */
 			if (sgs->group_misfit_task_load < rq->misfit_task_load) {
 				sgs->group_misfit_task_load = rq->misfit_task_load;
+				sgs->group_misfit_reason = rq->misfit_reason;
 				*sg_status |= SG_OVERLOAD;
 			}
 		} else if ((env->idle != CPU_NOT_IDLE) &&
@@ -9969,6 +9978,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	 */
 	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
 	    (sgs->group_type == group_misfit_task) &&
+	    (sgs->group_misfit_reason == MISFIT_PERF) &&
 	    (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
 	     sds->local_stat.group_type != group_has_spare))
 		return false;
@@ -10193,6 +10203,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
 
 	for_each_cpu(i, sched_group_span(group)) {
 		struct rq *rq = cpu_rq(i);
+		misfit_reason_t reason;
 		unsigned int local;
 
 		sgs->group_load += cpu_load_without(rq, p);
@@ -10212,9 +10223,15 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
 
 		/* Check if task fits in the CPU */
 		if (sd->flags & SD_ASYM_CPUCAPACITY &&
-		    sgs->group_misfit_task_load &&
-		    !is_misfit_task(p, rq))
+		    sgs->group_misfit_task_load) {
+		    if (!is_misfit_task(p, rq, &reason)) {
 			sgs->group_misfit_task_load = 0;
+			sgs->group_misfit_reason = -1;
+		    } else {
+			sgs->group_misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
+			sgs->group_misfit_reason = reason;
+		    }
+		}
 
 	}
 
@@ -11008,6 +11025,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		 * average load.
 		 */
 		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+		    rq->misfit_reason == MISFIT_PERF &&
 		    !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
 		    nr_running == 1)
 			continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e58a54bda77d..399b6526afab 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -962,6 +962,10 @@ struct balance_callback {
 	void (*func)(struct rq *rq);
 };
 
+typedef enum misfit_reason {
+	MISFIT_PERF,		/* Requires moving to a more performant CPU */
+} misfit_reason_t;
+
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -1168,6 +1172,10 @@ struct rq {
 	call_single_data_t	cfsb_csd;
 	struct list_head	cfsb_csd_list;
 #endif
+
+#ifdef CONFIG_SMP
+	misfit_reason_t		misfit_reason;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
2.34.1
Re: [PATCH 2/3] sched/fair: Generalize misfit lb by adding a misfit reason
Posted by Xuewen Yan 1 year, 5 months ago
Hi Qais

On Sat, Dec 9, 2023 at 9:19 AM Qais Yousef <qyousef@layalina.io> wrote:
>
[...]
> @@ -11008,6 +11025,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
>                  * average load.
>                  */
>                 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> +                   rq->misfit_reason == MISFIT_PERF &&

In Android, I found this can cause a task to bounce between CPUs in
a loop. Maybe this check should be removed, because for CPUs of the
same capacity we should still skip the CPU when nr_running == 1.
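
i.e. something like this (untested) to restore the old behavior:

 		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
-		    rq->misfit_reason == MISFIT_PERF &&
 		    !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
 		    nr_running == 1)
 			continue;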

>                     !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
>                     nr_running == 1)
>                         continue;
Re: [PATCH 2/3] sched/fair: Generalize misfit lb by adding a misfit reason
Posted by Qais Yousef 1 year, 5 months ago
Hi Xuewen

On 07/17/24 16:26, Xuewen Yan wrote:
> Hi Qais
> 
> On Sat, Dec 9, 2023 at 9:19 AM Qais Yousef <qyousef@layalina.io> wrote:

> > @@ -11008,6 +11025,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> >                  * average load.
> >                  */
> >                 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> > +                   rq->misfit_reason == MISFIT_PERF &&
> 
> In Android, I found this can cause a task to bounce between CPUs in
> a loop. Maybe this check should be removed, because for CPUs of the
> same capacity we should still skip the CPU when nr_running == 1.

Could you explain a bit more? Are you saying this changes the behavior for
some use case? The check ensures this path is only triggered for misfit
upmigration, which AFAICT is the only reason this path was added.

The problem is that when implementing another misfit reason, the
capacity_greater() check only holds for MISFIT_PERF. For MISFIT_POWER, we
want the destination CPU to be smaller.
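
A hand-wavy sketch (untested, not something I have posted) of what
find_busiest_queue() would need once there is more than one reason:

		if (env->sd->flags & SD_ASYM_CPUCAPACITY && nr_running == 1) {
			switch (rq->misfit_reason) {
			case MISFIT_PERF:
				/* only pull if dst_cpu is bigger */
				if (!capacity_greater(capacity_of(env->dst_cpu), capacity))
					continue;
				break;
			case MISFIT_POWER:
				/* hypothetical: only pull if dst_cpu is smaller */
				if (!capacity_greater(capacity, capacity_of(env->dst_cpu)))
					continue;
				break;
			}
		}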

I think Vincent is working on a better way to handle all of this now.

> 
> >                     !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
> >                     nr_running == 1)
> >                         continue;
Re: [PATCH 2/3] sched/fair: Generalize misfit lb by adding a misfit reason
Posted by Xuewen Yan 1 year, 5 months ago
Hi Qais

On Thu, Jul 25, 2024 at 5:35 AM Qais Yousef <qyousef@layalina.io> wrote:
>
> Hi Xuewen
>
> On 07/17/24 16:26, Xuewen Yan wrote:
> > Hi Qais
> >
> > On Sat, Dec 9, 2023 at 9:19 AM Qais Yousef <qyousef@layalina.io> wrote:
>
> > > @@ -11008,6 +11025,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> > >                  * average load.
> > >                  */
> > >                 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> > > +                   rq->misfit_reason == MISFIT_PERF &&
> >
> > In Android, I found this can cause a task to bounce between CPUs in
> > a loop. Maybe this check should be removed, because for CPUs of the
> > same capacity we should still skip the CPU when nr_running == 1.
>
> Could you explain a bit more? Are you saying this changes the behavior for
> some use case? The check ensures this path is only triggered for misfit
> upmigration, which AFAICT is the only reason this path was added.
>
> The problem is that when implementing another misfit reason, the
> capacity_greater() check only holds for MISFIT_PERF. For MISFIT_POWER, we
> want the destination CPU to be smaller.

Sorry, it was my mistake.
After debugging, I found the problem was in my own handling of
MISFIT_PERF.
But it is true that, because rt and irq pressure affect capacity_of(),
capacity_greater() can still behave in confusing ways. Since the
remaining capacities of the small cores end up slightly different, a
misfit task can migrate several times between the small cores. For
example, if capacity_cpu3 > capacity_cpu2 > capacity_cpu1 >
capacity_cpu0, the misfit task may migrate cpu0 -> cpu1 -> cpu2 ->
cpu3. I don't know whether these migrations are really necessary, but
they do cause me some confusion.
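
To put numbers on it: capacity_greater(cap1, cap2) is
cap1 * 1024 > cap2 * 1078, i.e. roughly a 5% margin. With hypothetical
remaining capacities of 160, 170, 181 and 192 for cpu0..cpu3, every
hop clears the margin:

	170 * 1024 = 174080 > 160 * 1078 = 172480	/* cpu0 -> cpu1 */
	181 * 1024 = 185344 > 170 * 1078 = 183260	/* cpu1 -> cpu2 */
	192 * 1024 = 196608 > 181 * 1078 = 195118	/* cpu2 -> cpu3 */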

Thanks!

>
> I think Vincent is working on a better way to handle all of this now.
>
> >
> > >                     !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
> > >                     nr_running == 1)
> > >                         continue;
Re: [PATCH 2/3] sched/fair: Generalize misfit lb by adding a misfit reason
Posted by Qais Yousef 1 year, 4 months ago
On 07/29/24 18:47, Xuewen Yan wrote:
> Hi Qais
> 
> On Thu, Jul 25, 2024 at 5:35 AM Qais Yousef <qyousef@layalina.io> wrote:
> >
> > Hi Xuewen
> >
> > On 07/17/24 16:26, Xuewen Yan wrote:
> > > Hi Qais
> > >
> > > On Sat, Dec 9, 2023 at 9:19 AM Qais Yousef <qyousef@layalina.io> wrote:
> >
> > > > @@ -11008,6 +11025,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> > > >                  * average load.
> > > >                  */
> > > >                 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> > > > +                   rq->misfit_reason == MISFIT_PERF &&
> > >
> > > In Android, I found this can cause a task to bounce between CPUs in
> > > a loop. Maybe this check should be removed, because for CPUs of the
> > > same capacity we should still skip the CPU when nr_running == 1.
> >
> > Could you explain a bit more? Are you saying this changes the behavior for
> > some use case? The check ensures this path is only triggered for misfit
> > upmigration, which AFAICT is the only reason this path was added.
> >
> > The problem is that when implementing another misfit reason, the
> > capacity_greater() check only holds for MISFIT_PERF. For MISFIT_POWER, we
> > want the destination CPU to be smaller.
> 
> Sorry, it was my mistake.

Np, it's always good to hear back in case there's a problem :)

> After debugging, I found the problem was in my own handling of
> MISFIT_PERF.
> But it is true that, because rt and irq pressure affect capacity_of(),
> capacity_greater() can still behave in confusing ways. Since the
> remaining capacities of the small cores end up slightly different, a
> misfit task can migrate several times between the small cores. For
> example, if capacity_cpu3 > capacity_cpu2 > capacity_cpu1 >
> capacity_cpu0, the misfit task may migrate cpu0 -> cpu1 -> cpu2 ->
> cpu3. I don't know whether these migrations are really necessary, but
> they do cause me some confusion.

It should be cheap in theory.

But have you verified that the load balance migration type is misfit
(migrate_misfit), and not regular load balancing trying to distribute load
across the little cores? I think it is harmless if it is caused by misfit,
but yes, it looks unnecessary to me too.

I'd love to remove this 5% magic margin, but I have no idea how yet.
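
For reference, the margin in question, as currently defined in
kernel/sched/fair.c:

	/* is 'cap1' noticeably greater than 'cap2'? (~5% margin) */
	#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)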