[PATCH] sched/fair: Refactor cpu_util_without()

Dietmar Eggemann posted 1 patch 4 years, 3 months ago
There is a newer version of this series
kernel/sched/fair.c | 143 ++++++++++++++++++--------------------------
1 file changed, 59 insertions(+), 84 deletions(-)
[PATCH] sched/fair: Refactor cpu_util_without()
Posted by Dietmar Eggemann 4 years, 3 months ago
Except for the 'task has no contribution or is new' condition at the
beginning of cpu_util_without(), a cpu_util_next(..., dst_cpu = -1)
call can replace the rest of this function.

The UTIL_EST specific check for a race between select_task_rq_fair()
and detach_task() in case of an enqueued or running WF_EXEC task has
to be moved to cpu_util_next().
This was initially introduced by commit c469933e7721
("sched/fair: Fix cpu_util_wake() for 'execl' type workloads").
UnixBench's `execl` throughput tests were run on the dual socket 40
CPUs Intel E5-2690 v2 machine to make sure the regression doesn't
occur again.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---

There is still a lot of CPU utilization related code. cpu_util_without()
and cpu_util_next() are very similar. In fact the former can be
refactored to use a call to the latter to be able to remove some
redundancy.

 kernel/sched/fair.c | 143 ++++++++++++++++++--------------------------
 1 file changed, 59 insertions(+), 84 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 16874e112fe6..c084c2e29e40 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6511,6 +6511,64 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	return target;
 }
 
+/*
+ * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
+ * to @dst_cpu.
+ */
+static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+{
+	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+
+	/*
+	 * If @p migrates from @cpu to another, remove its contribution. Or,
+	 * if @p migrates from another CPU to @cpu, add its contribution. In
+	 * the other cases, @cpu is not impacted by the migration, so the
+	 * util_avg should already be correct.
+	 */
+	if (task_cpu(p) == cpu && dst_cpu != cpu)
+		lsub_positive(&util, task_util(p));
+	else if (task_cpu(p) != cpu && dst_cpu == cpu)
+		util += task_util(p);
+
+	if (sched_feat(UTIL_EST)) {
+		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+		/*
+		 * During wake-up, the task isn't enqueued yet and doesn't
+		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
+		 * so just add it (if needed) to "simulate" what will be
+		 * cpu_util after the task has been enqueued.
+		 */
+		if (dst_cpu == cpu)
+			util_est += _task_util_est(p);
+
+		/*
+		 * Despite the following checks we still have a small window
+		 * for a possible race, when an execl's select_task_rq_fair()
+		 * races with LB's detach_task():
+		 *
+		 *   detach_task()
+		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
+		 *     ---------------------------------- A
+		 *     deactivate_task()                   \
+		 *       dequeue_task()                     + RaceTime
+		 *         util_est_dequeue()              /
+		 *     ---------------------------------- B
+		 *
+		 * The additional check on "current == p" it's required to
+		 * properly fix the execl regression and it helps in further
+		 * reducing the chances for the above race.
+		 */
+		if (unlikely(task_on_rq_queued(p) || current == p))
+			lsub_positive(&util_est, _task_util_est(p));
+
+		util = max(util, util_est);
+	}
+
+	return min(util, capacity_orig_of(cpu));
+}
+
 /*
  * cpu_util_without: compute cpu utilization without any contributions from *p
  * @cpu: the CPU which utilization is requested
@@ -6526,19 +6584,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
  */
 static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
-	struct cfs_rq *cfs_rq;
-	unsigned int util;
-
 	/* Task has no contribution or is new */
 	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
 		return cpu_util_cfs(cpu);
 
-	cfs_rq = &cpu_rq(cpu)->cfs;
-	util = READ_ONCE(cfs_rq->avg.util_avg);
-
-	/* Discount task's util from CPU's util */
-	lsub_positive(&util, task_util(p));
-
 	/*
 	 * Covered cases:
 	 *
@@ -6560,82 +6609,8 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 	 *    estimation of the spare capacity on that CPU, by just
 	 *    considering the expected utilization of tasks already
 	 *    runnable on that CPU.
-	 *
-	 * Cases a) and b) are covered by the above code, while case c) is
-	 * covered by the following code when estimated utilization is
-	 * enabled.
 	 */
-	if (sched_feat(UTIL_EST)) {
-		unsigned int estimated =
-			READ_ONCE(cfs_rq->avg.util_est.enqueued);
-
-		/*
-		 * Despite the following checks we still have a small window
-		 * for a possible race, when an execl's select_task_rq_fair()
-		 * races with LB's detach_task():
-		 *
-		 *   detach_task()
-		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
-		 *     ---------------------------------- A
-		 *     deactivate_task()                   \
-		 *       dequeue_task()                     + RaceTime
-		 *         util_est_dequeue()              /
-		 *     ---------------------------------- B
-		 *
-		 * The additional check on "current == p" it's required to
-		 * properly fix the execl regression and it helps in further
-		 * reducing the chances for the above race.
-		 */
-		if (unlikely(task_on_rq_queued(p) || current == p))
-			lsub_positive(&estimated, _task_util_est(p));
-
-		util = max(util, estimated);
-	}
-
-	/*
-	 * Utilization (estimated) can exceed the CPU capacity, thus let's
-	 * clamp to the maximum CPU capacity to ensure consistency with
-	 * cpu_util.
-	 */
-	return min_t(unsigned long, util, capacity_orig_of(cpu));
-}
-
-/*
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- * to @dst_cpu.
- */
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
-{
-	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
-	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
-
-	/*
-	 * If @p migrates from @cpu to another, remove its contribution. Or,
-	 * if @p migrates from another CPU to @cpu, add its contribution. In
-	 * the other cases, @cpu is not impacted by the migration, so the
-	 * util_avg should already be correct.
-	 */
-	if (task_cpu(p) == cpu && dst_cpu != cpu)
-		lsub_positive(&util, task_util(p));
-	else if (task_cpu(p) != cpu && dst_cpu == cpu)
-		util += task_util(p);
-
-	if (sched_feat(UTIL_EST)) {
-		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
-
-		/*
-		 * During wake-up, the task isn't enqueued yet and doesn't
-		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
-		 * so just add it (if needed) to "simulate" what will be
-		 * cpu_util after the task has been enqueued.
-		 */
-		if (dst_cpu == cpu)
-			util_est += _task_util_est(p);
-
-		util = max(util, util_est);
-	}
-
-	return min(util, capacity_orig_of(cpu));
+	return cpu_util_next(cpu, p, -1);
 }
 
 /*
-- 
2.25.1
Re: [PATCH] sched/fair: Refactor cpu_util_without()
Posted by Vincent Guittot 4 years, 3 months ago
On Tue, 1 Mar 2022 at 18:17, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>
> Except the 'task has no contribution or is new' condition at the
> beginning of cpu_util_without(), a cpu_util_next(..., dst_cpu = -1)
> call can replace the rest of this function.
>
> The UTIL_EST specific check for a race between select_task_rq_fair()
> and detach_task() in case of an enqueued or running WF_EXEC task has
> to be moved to cpu_util_next().
> This was initially introduced by commit c469933e7721
> ("sched/fair: Fix cpu_util_wake() for 'execl' type workloads").
> UnixBench's `execl` throughput tests were run on the dual socket 40
> CPUs Intel E5-2690 v2 machine to make sure the regression doesn't
> occur again.
>
> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>

I have only minor comments

> ---
>
> There is still a lot of CPU utilization related code. cpu_util_without()
> and cpu_util_next() are very similar. In fact the former can be
> refactored to use a call to the latter to be able to remove some
> redundancy.
>
>  kernel/sched/fair.c | 143 ++++++++++++++++++--------------------------
>  1 file changed, 59 insertions(+), 84 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 16874e112fe6..c084c2e29e40 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6511,6 +6511,64 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>         return target;
>  }
>
> +/*
> + * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
> + * to @dst_cpu.
> + */
> +static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
> +{
> +       struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
> +       unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
> +
> +       /*
> +        * If @p migrates from @cpu to another, remove its contribution. Or,
> +        * if @p migrates from another CPU to @cpu, add its contribution. In
> +        * the other cases, @cpu is not impacted by the migration, so the
> +        * util_avg should already be correct.
> +        */
> +       if (task_cpu(p) == cpu && dst_cpu != cpu)
> +               lsub_positive(&util, task_util(p));
> +       else if (task_cpu(p) != cpu && dst_cpu == cpu)
> +               util += task_util(p);
> +
> +       if (sched_feat(UTIL_EST)) {
> +               util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
> +
> +               /*
> +                * During wake-up, the task isn't enqueued yet and doesn't
> +                * appear in the cfs_rq->avg.util_est.enqueued of any rq,
> +                * so just add it (if needed) to "simulate" what will be
> +                * cpu_util after the task has been enqueued.
> +                */
> +               if (dst_cpu == cpu)
> +                       util_est += _task_util_est(p);
> +

Could you add a comment that explains why the addition above will not
be undone by the lsub_positive() below, so it isn't worth trying to
optimize such a case?

> +               /*
> +                * Despite the following checks we still have a small window
> +                * for a possible race, when an execl's select_task_rq_fair()
> +                * races with LB's detach_task():
> +                *
> +                *   detach_task()
> +                *     p->on_rq = TASK_ON_RQ_MIGRATING;
> +                *     ---------------------------------- A
> +                *     deactivate_task()                   \
> +                *       dequeue_task()                     + RaceTime
> +                *         util_est_dequeue()              /
> +                *     ---------------------------------- B
> +                *
> +                * The additional check on "current == p" it's required to
> +                * properly fix the execl regression and it helps in further
> +                * reducing the chances for the above race.
> +                */
> +               if (unlikely(task_on_rq_queued(p) || current == p))
> +                       lsub_positive(&util_est, _task_util_est(p));
> +
> +               util = max(util, util_est);
> +       }
> +
> +       return min(util, capacity_orig_of(cpu));
> +}
> +
>  /*
>   * cpu_util_without: compute cpu utilization without any contributions from *p
>   * @cpu: the CPU which utilization is requested
> @@ -6526,19 +6584,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>   */
>  static unsigned long cpu_util_without(int cpu, struct task_struct *p)
>  {
> -       struct cfs_rq *cfs_rq;
> -       unsigned int util;
> -
>         /* Task has no contribution or is new */
>         if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
>                 return cpu_util_cfs(cpu);
>
> -       cfs_rq = &cpu_rq(cpu)->cfs;
> -       util = READ_ONCE(cfs_rq->avg.util_avg);
> -
> -       /* Discount task's util from CPU's util */
> -       lsub_positive(&util, task_util(p));
> -
>         /*
>          * Covered cases:
>          *
> @@ -6560,82 +6609,8 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
>          *    estimation of the spare capacity on that CPU, by just
>          *    considering the expected utilization of tasks already
>          *    runnable on that CPU.

The comment about the covered cases above should be moved into
cpu_util_next(), which is where the cases are covered now

> -        *
> -        * Cases a) and b) are covered by the above code, while case c) is
> -        * covered by the following code when estimated utilization is
> -        * enabled.
>          */
> -       if (sched_feat(UTIL_EST)) {
> -               unsigned int estimated =
> -                       READ_ONCE(cfs_rq->avg.util_est.enqueued);
> -
> -               /*
> -                * Despite the following checks we still have a small window
> -                * for a possible race, when an execl's select_task_rq_fair()
> -                * races with LB's detach_task():
> -                *
> -                *   detach_task()
> -                *     p->on_rq = TASK_ON_RQ_MIGRATING;
> -                *     ---------------------------------- A
> -                *     deactivate_task()                   \
> -                *       dequeue_task()                     + RaceTime
> -                *         util_est_dequeue()              /
> -                *     ---------------------------------- B
> -                *
> -                * The additional check on "current == p" it's required to
> -                * properly fix the execl regression and it helps in further
> -                * reducing the chances for the above race.
> -                */
> -               if (unlikely(task_on_rq_queued(p) || current == p))
> -                       lsub_positive(&estimated, _task_util_est(p));
> -
> -               util = max(util, estimated);
> -       }
> -
> -       /*
> -        * Utilization (estimated) can exceed the CPU capacity, thus let's
> -        * clamp to the maximum CPU capacity to ensure consistency with
> -        * cpu_util.
> -        */
> -       return min_t(unsigned long, util, capacity_orig_of(cpu));
> -}
> -
> -/*
> - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
> - * to @dst_cpu.
> - */
> -static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
> -{
> -       struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
> -       unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
> -
> -       /*
> -        * If @p migrates from @cpu to another, remove its contribution. Or,
> -        * if @p migrates from another CPU to @cpu, add its contribution. In
> -        * the other cases, @cpu is not impacted by the migration, so the
> -        * util_avg should already be correct.
> -        */
> -       if (task_cpu(p) == cpu && dst_cpu != cpu)
> -               lsub_positive(&util, task_util(p));
> -       else if (task_cpu(p) != cpu && dst_cpu == cpu)
> -               util += task_util(p);
> -
> -       if (sched_feat(UTIL_EST)) {
> -               util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
> -
> -               /*
> -                * During wake-up, the task isn't enqueued yet and doesn't
> -                * appear in the cfs_rq->avg.util_est.enqueued of any rq,
> -                * so just add it (if needed) to "simulate" what will be
> -                * cpu_util after the task has been enqueued.
> -                */
> -               if (dst_cpu == cpu)
> -                       util_est += _task_util_est(p);
> -
> -               util = max(util, util_est);
> -       }
> -
> -       return min(util, capacity_orig_of(cpu));
> +       return cpu_util_next(cpu, p, -1);
>  }
>
>  /*
> --
> 2.25.1
>
Re: [PATCH] sched/fair: Refactor cpu_util_without()
Posted by Dietmar Eggemann 4 years, 3 months ago
- Valentin Schneider <Valentin.Schneider@arm.com>

On 02/03/2022 10:09, Vincent Guittot wrote:
> On Tue, 1 Mar 2022 at 18:17, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:

[...]

> I have only minor comment

Thanks for the review!

[...]

>> +static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
>> +{

[...]

>> +       if (sched_feat(UTIL_EST)) {
>> +               util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
>> +
>> +               /*
>> +                * During wake-up, the task isn't enqueued yet and doesn't
>> +                * appear in the cfs_rq->avg.util_est.enqueued of any rq,
>> +                * so just add it (if needed) to "simulate" what will be
>> +                * cpu_util after the task has been enqueued.
>> +                */
>> +               if (dst_cpu == cpu)
>> +                       util_est += _task_util_est(p);
>> +
> 
> Could you add a comment that explains why the addition above will not
> be removed below by the lsub_positive below so it isn't worth trying
> to optimize such a case?

Yes. I reworked the comments in cpu_util_next() so they also apply when
called by cpu_util_without(). And I use an `if{}/else if{}` here too in v2.
>> +               /*
>> +                * Despite the following checks we still have a small window
>> +                * for a possible race, when an execl's select_task_rq_fair()
>> +                * races with LB's detach_task():
>> +                *
>> +                *   detach_task()
>> +                *     p->on_rq = TASK_ON_RQ_MIGRATING;
>> +                *     ---------------------------------- A
>> +                *     deactivate_task()                   \
>> +                *       dequeue_task()                     + RaceTime
>> +                *         util_est_dequeue()              /
>> +                *     ---------------------------------- B
>> +                *
>> +                * The additional check on "current == p" it's required to
>> +                * properly fix the execl regression and it helps in further
>> +                * reducing the chances for the above race.
>> +                */
>> +               if (unlikely(task_on_rq_queued(p) || current == p))
>> +                       lsub_positive(&util_est, _task_util_est(p));

I did a lot of testing on mainline & v4.20 and there wasn't one
occurrence of `p->on_rq == TASK_ON_RQ_MIGRATING` here. Not for WF_EXEC
tasks (p->on_rq = TASK_ON_RQ_QUEUED) and in case of v4.20 not for
WF_EXEC and WF_TTWU tasks (p->on_rq = 0). So I assume it's not needed. I
left it in v2 though and mentioned it in the additional comment section
of the patch.

[...]

>>  static unsigned long cpu_util_without(int cpu, struct task_struct *p)
>>  {

[...]

>>         /*
>>          * Covered cases:
>>          *
>> @@ -6560,82 +6609,8 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
>>          *    estimation of the spare capacity on that CPU, by just
>>          *    considering the expected utilization of tasks already
>>          *    runnable on that CPU.
> 
> The comment about the covered cases above should be moved in
> cpu_util_next() which is where the cases are covered now

Yes. I incorporated it into the comments in cpu_util_next() in v2.

[...]