[Patch v2 3/6] sched/fair: Implement prefer sibling imbalance calculation between asymmetric groups

Posted by Tim Chen 2 years, 8 months ago
From: Tim C Chen <tim.c.chen@linux.intel.com>

In the current prefer sibling load balancing code, there is an implicit
assumption that the busiest sched group and local sched group are
equivalent, hence the number of tasks to be moved is simply the difference
in the number of tasks between the two groups (i.e. the imbalance)
divided by two.

However, we may have a different number of cores between the cluster groups,
say when we take CPUs offline or we have hybrid groups.  In that case,
we should balance between the two groups such that the #tasks/#cores
ratio is the same in both groups.  Hence the computed imbalance will
need to reflect this.
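
For example (numbers purely illustrative): if the local group has 4 cores
running 2 tasks and the busiest group has 2 cores running 6 tasks, balancing
the #tasks/#cores ratio means moving (4*6 - 2*2)/(4 + 2) = 3 tasks to the
local group, rather than (6 - 2)/2 = 2 as the current code would.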

Additionally, when we have asymmetric packing, we will need to bias the
balancing according to whether the busiest group or the local group is
favored.  If the destination group is favored and has idle cores, we
should move at least one task from the busiest group to avoid leaving a
favored core idle.  But if the busiest group is favored, we should limit
the number of tasks we move so we do not create idle cores in the busiest
group, leaving favored cores unused.
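
For example, if the favored busiest group has 2 cores running 3 tasks, at
most 3 - 2 = 1 task should be moved away, since moving more would leave a
favored core idle.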

Adjust the sibling imbalance computation to take the above
considerations into account.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 kernel/sched/fair.c  | 65 +++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h |  5 ++++
 2 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03573362274f..0b0904263d51 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9372,6 +9372,65 @@ static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
 	return false;
 }
 
+static inline long sibling_imbalance(struct lb_env *env,
+				    struct sd_lb_stats *sds,
+				    struct sg_lb_stats *busiest,
+				    struct sg_lb_stats *local)
+{
+	int ncores_busiest, ncores_local;
+	long imbalance;
+
+	if (env->idle == CPU_NOT_IDLE)
+		return 0;
+
+	ncores_busiest = sds->busiest->cores;
+	ncores_local = sds->local->cores;
+
+	if (ncores_busiest == ncores_local &&
+	    (!(env->sd->flags & SD_ASYM_PACKING) ||
+	      sched_asym_equal(env->dst_cpu,
+			      sds->busiest->asym_prefer_cpu))) {
+		imbalance = busiest->sum_nr_running;
+		lsub_positive(&imbalance, local->sum_nr_running);
+		return imbalance;
+	}
+
+	/* Balance such that the nr_running/ncores ratio is the same on both groups */
+	imbalance = ncores_local * busiest->sum_nr_running;
+	lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
+	/* Normalize imbalance to become tasks to be moved to restore balance */
+	imbalance /= ncores_local + ncores_busiest;
+
+	if (env->sd->flags & SD_ASYM_PACKING) {
+		int limit;
+
+		if (!busiest->sum_nr_running)
+			goto out;
+
+		if (sched_asym_prefer(env->dst_cpu, sds->busiest->asym_prefer_cpu)) {
+			/* Don't leave preferred core idle */
+			if (imbalance == 0 && local->sum_nr_running < ncores_local)
+				imbalance = 1;
+			goto out;
+		}
+
+		/* Limit tasks moved from preferred group, don't leave cores idle */
+		limit = busiest->sum_nr_running;
+		lsub_positive(&limit, ncores_busiest);
+		if (imbalance > limit)
+			imbalance = limit;
+
+		goto out;
+	}
+
+	/* Take advantage of resource in an empty sched group */
+	if (imbalance == 0 && local->sum_nr_running == 0 &&
+	    busiest->sum_nr_running > 1)
+		imbalance = 1;
+out:
+	return imbalance << 1;
+}
+
 static inline bool
 sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 {
@@ -10230,14 +10289,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		}
 
 		if (busiest->group_weight == 1 || sds->prefer_sibling) {
-			unsigned int nr_diff = busiest->sum_nr_running;
 			/*
 			 * When prefer sibling, evenly spread running tasks on
 			 * groups.
 			 */
 			env->migration_type = migrate_task;
-			lsub_positive(&nr_diff, local->sum_nr_running);
-			env->imbalance = nr_diff;
+			env->imbalance = sibling_imbalance(env, sds, busiest, local);
 		} else {
 
 			/*
@@ -10424,7 +10481,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 * group's child domain.
 	 */
 	if (sds.prefer_sibling && local->group_type == group_has_spare &&
-	    busiest->sum_nr_running > local->sum_nr_running + 1)
+	    sibling_imbalance(env, &sds, busiest, local) > 1)
 		goto force_balance;
 
 	if (busiest->group_type != group_overloaded) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5f7f36e45b87..adffe0894cdb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -804,6 +804,11 @@ static inline bool sched_asym_prefer(int a, int b)
 	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
 }
 
+static inline bool sched_asym_equal(int a, int b)
+{
+	return arch_asym_cpu_priority(a) == arch_asym_cpu_priority(b);
+}
+
 struct perf_domain {
 	struct em_perf_domain *em_pd;
 	struct perf_domain *next;
-- 
2.32.0
Re: [Patch v2 3/6] sched/fair: Implement prefer sibling imbalance calculation between asymmetric groups
Posted by Peter Zijlstra 2 years, 8 months ago
On Thu, Jun 08, 2023 at 03:32:29PM -0700, Tim Chen wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 03573362274f..0b0904263d51 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9372,6 +9372,65 @@ static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
>  	return false;
>  }
>  
> +static inline long sibling_imbalance(struct lb_env *env,
> +				    struct sd_lb_stats *sds,
> +				    struct sg_lb_stats *busiest,
> +				    struct sg_lb_stats *local)
> +{
> +	int ncores_busiest, ncores_local;
> +	long imbalance;
> +
> +	if (env->idle == CPU_NOT_IDLE)
> +		return 0;
> +
> +	ncores_busiest = sds->busiest->cores;
> +	ncores_local = sds->local->cores;
> +
> +	if (ncores_busiest == ncores_local &&
> +	    (!(env->sd->flags & SD_ASYM_PACKING) ||
> +	      sched_asym_equal(env->dst_cpu,
> +			      sds->busiest->asym_prefer_cpu))) {
> +		imbalance = busiest->sum_nr_running;
> +		lsub_positive(&imbalance, local->sum_nr_running);
> +		return imbalance;
> +	}
> +
> +	/* Balance such that the nr_running/ncores ratio is the same on both groups */
> +	imbalance = ncores_local * busiest->sum_nr_running;
> +	lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
> +	/* Normalize imbalance to become tasks to be moved to restore balance */
> +	imbalance /= ncores_local + ncores_busiest;
> +
> +	if (env->sd->flags & SD_ASYM_PACKING) {
> +		int limit;
> +
> +		if (!busiest->sum_nr_running)
> +			goto out;

This seems out-of-place, shouldn't we terminate sooner if busiest
is empty?

> +
> +		if (sched_asym_prefer(env->dst_cpu, sds->busiest->asym_prefer_cpu)) {
> +			/* Don't leave preferred core idle */
> +			if (imbalance == 0 && local->sum_nr_running < ncores_local)
> +				imbalance = 1;
> +			goto out;
> +		}
> +
> +		/* Limit tasks moved from preferred group, don't leave cores idle */
> +		limit = busiest->sum_nr_running;
> +		lsub_positive(&limit, ncores_busiest);
> +		if (imbalance > limit)
> +			imbalance = limit;

How does this affect the server parts that have larger than single core
turbo domains?

> +
> +		goto out;
> +	}
> +
> +	/* Take advantage of resource in an empty sched group */
> +	if (imbalance == 0 && local->sum_nr_running == 0 &&
> +	    busiest->sum_nr_running > 1)
> +		imbalance = 1;
> +out:
> +	return imbalance << 1;
> +}


But basically you have:

        LcBn - BcLn
  imb = -----------
          Lc + Bc

Which makes sense, except you then return:

  imb * 2

which then made me wonder about rounding.

Do we want to add (Lc + Bc - 1) or ((Lc + Bc)/2) to resp. ceil() or round()
the thing before division? Because currently it uses floor().
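
Concretely, the three options differ only in the bias added before the
division; an illustrative, untested fragment with made-up standalone names,
assuming num >= 0 as lsub_positive() ensures in the patch:

	long num = ncores_local * busiest_nr - ncores_busiest * local_nr;
	long den = ncores_local + ncores_busiest;
	long imb_floor = num / den;		/* current behaviour */
	long imb_round = (num + den / 2) / den;
	long imb_ceil  = (num + den - 1) / den;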

If you evaluate it like:


        2 * (LcBn - BcLn)
  imb = -----------------
             Lc + Bc

The result is different from what you have now.

What actual behaviour is desired in these low imbalance cases? and can
you write a comment as to why we do as we do etc..?
Re: [Patch v2 3/6] sched/fair: Implement prefer sibling imbalance calculation between asymmetric groups
Posted by Tim Chen 2 years, 8 months ago
On Mon, 2023-06-12 at 14:05 +0200, Peter Zijlstra wrote:
> On Thu, Jun 08, 2023 at 03:32:29PM -0700, Tim Chen wrote:
> 
> > 
> > +	if (env->sd->flags & SD_ASYM_PACKING) {
> > +		int limit;
> > +
> > +		if (!busiest->sum_nr_running)
> > +			goto out;
> 
> This seems out-of-place, shouldn't we terminate sooner if busiest
> is empty?

Yes.  Should move this check to the beginning.

> 
> > +
> > +		if (sched_asym_prefer(env->dst_cpu, sds->busiest->asym_prefer_cpu)) {
> > +			/* Don't leave preferred core idle */
> > +			if (imbalance == 0 && local->sum_nr_running < ncores_local)
> > +				imbalance = 1;
> > +			goto out;
> > +		}
> > +
> > +		/* Limit tasks moved from preferred group, don't leave cores idle */
> > +		limit = busiest->sum_nr_running;
> > +		lsub_positive(&limit, ncores_busiest);
> > +		if (imbalance > limit)
> > +			imbalance = limit;
> 
> How does this affect the server parts that have larger than single core
> turbo domains?

Are you thinking about the case where the local group is completely empty
so there's turbo headroom and we should move at least one task, even though
the CPUs in the busiest group have higher priority?

In other words, are you suggesting we should add

		if (imbalance == 0 && busiest->sum_nr_running > 0 &&
			local->sum_nr_running == 0)
			imbalance = 1;
		

> 
> > +
> > +		goto out;
> > +	}
> > +
> > +	/* Take advantage of resource in an empty sched group */
> > +	if (imbalance == 0 && local->sum_nr_running == 0 &&
> > +	    busiest->sum_nr_running > 1)
> > +		imbalance = 1;
> > +out:
> > +	return imbalance << 1;
> > +}
> 
> 
> But basically you have:
> 
>         LcBn - BcLn
>   imb = -----------
>           Lc + Bc
> 
> Which makes sense, except you then return:
> 
>   imb * 2
> 
> which then made me wonder about rounding.
> 
> Do we want to add (Lc + Bc - 1) or ((Lc + Bc)/2) to resp. ceil() or round()
> the thing before division? Because currently it uses floor().
> 
> If you evaluate it like:
> 
> 
>         2 * (LcBn - BcLn)
>   imb = -----------------
>              Lc + Bc
> 
> The result is different from what you have now.

If I do the rounding after multiplying imb by two (call it imb_new),
the difference from the imb I am returning now (call it imb_old)
will be at most 1.  Note that the imb_old returned is always a multiple of 2.

I will be using imb in calculate_imbalance() and dividing it
by 2 there to get the number of tasks to move from the busiest group.
So when there is a difference of 1 between imb_old and imb_new,
the difference will be trimmed off after the division by 2.

We will get the same number of tasks to move with either
imb_old or imb_new in calculate_imbalance(), so the two
computations will eventually arrive at the same result.
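
For instance, with LcBn - BcLn = 5 and Lc + Bc = 3: imb_old = 2 * floor(5/3) = 2
while imb_new = floor(2*5/3) = 3, yet after the division by 2 both give 1 task
to move.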

> 
> What actual behaviour is desired in these low imbalance cases? and can
> you write a comment as to why we do as we do etc..?

I do not keep imb as

           2 * (LcBn - BcLn)
   imb = -----------------
              Lc + Bc

as it is easier to leave out the factor of 2
in the middle of the sibling_imbalance() computation,
so I can directly interpret imb as the number
of tasks to move, and add the factor of two
when I actually need to return the imbalance.

Would you like me to add this reasoning in the comments?

Thanks.

Tim  
Re: [Patch v2 3/6] sched/fair: Implement prefer sibling imbalance calculation between asymmetric groups
Posted by Peter Zijlstra 2 years, 8 months ago
On Tue, Jun 13, 2023 at 10:46:36AM -0700, Tim Chen wrote:
> On Mon, 2023-06-12 at 14:05 +0200, Peter Zijlstra wrote:

> > > +		/* Limit tasks moved from preferred group, don't leave cores idle */
> > > +		limit = busiest->sum_nr_running;
> > > +		lsub_positive(&limit, ncores_busiest);
> > > +		if (imbalance > limit)
> > > +			imbalance = limit;
> > 
> > How does this affect the server parts that have larger than single core
> > turbo domains?
> 
> Are you thinking about the case where the local group is completely empty
> so there's turbo headroom and we should move at least one task, even though
> the CPUs in the busiest group have higher priority?

Something along those lines, I didn't think it through, just wondered
about the wisdom of piling everything in the highest priority 'domain',
which would depress the turbo range.

Rjw said that on modern client the frequency domains are per-core, but
that on server parts they are still wider -- or something along those
lines. So it might make sense to consider some of that.

> In other words, are you suggesting we should add
> 
> 		if (imbalance == 0 && busiest->sum_nr_running > 0 &&
> 			local->sum_nr_running == 0)
> 			imbalance = 1;

I didn't get that far; and I don't think we have the right topology
information on the servers to even begin considering the effects of the
turbo-range, so perhaps it all doesn't matter.

Just wanted to raise the point for consideration.

Because as-is, the policy of piling extra on the preferred group doesn't
immediately make sense. IIRC the whole ITMT preferred core is mostly
about an extra turbo bin, but if you pile on, the headroom evaporates --
you'll never turbo that high anymore and special casing it doesn't make
sense.

So perhaps I'm not saying more code, but less code is better here.

Dunno, is any of this measurable either way around?

> > > +
> > > +		goto out;
> > > +	}
> > > +
> > > +	/* Take advantage of resource in an empty sched group */
> > > +	if (imbalance == 0 && local->sum_nr_running == 0 &&
> > > +	    busiest->sum_nr_running > 1)
> > > +		imbalance = 1;
> > > +out:
> > > +	return imbalance << 1;
> > > +}
> > 
> > 
> > But basically you have:
> > 
> >         LcBn - BcLn
> >   imb = -----------
> >           Lc + Bc
> > 
> > Which makes sense, except you then return:
> > 
> >   imb * 2
> > 
> > which then made me wonder about rounding.
> > 
> > Do we want to add (Lc + Bc - 1) or ((Lc + Bc)/2) to resp. ceil() or round()
> > the thing before division? Because currently it uses floor().
> > 
> > If you evaluate it like:
> > 
> > 
> >         2 * (LcBn - BcLn)
> >   imb = -----------------
> >              Lc + Bc
> > 
> > The result is different from what you have now.
> 
> If I do the rounding after multiplying imb by two (call it imb_new),
> the difference from the imb I am returning now (call it imb_old)
> will be at most 1.  Note that the imb_old returned is always a multiple of 2.
> 
> I will be using imb in calculate_imbalance() and dividing it
> by 2 there to get the number of tasks to move from the busiest group.
> So when there is a difference of 1 between imb_old and imb_new,
> the difference will be trimmed off after the division by 2.
> 
> We will get the same number of tasks to move with either
> imb_old or imb_new in calculate_imbalance(), so the two
> computations will eventually arrive at the same result.
> 
> > 
> > What actual behaviour is desired in these low imbalance cases? and can
> > you write a comment as to why we do as we do etc..?
> 
> I do not keep imb as
> 
>            2 * (LcBn - BcLn)
>    imb = -----------------
>               Lc + Bc
> 
> as it is easier to leave out the factor of 2
> in the middle of the sibling_imbalance() computation,
> so I can directly interpret imb as the number
> of tasks to move, and add the factor of two
> when I actually need to return the imbalance.
> 
> Would you like me to add this reasoning in the comments?

So if we want a multiple of 2, leaving that multiplication off makes
sense, but I'm not sure I got the argument for or against the rounding.

floor() gets us 1 task to move when there is at least a whole task's
worth of imbalance, but less than 2.

round() would get us 1 task to move when there's at least half a task's
worth of imbalance but less than 1.5.

ceil() will migrate on any imbalance, however small -- which will result
in ping-pong I suppose, so let's disregard that.

The difference, with the multiplication later, is 0 or 2.

Does the round() still result in ping-pong?
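
For example, with Lc + Bc = 4: a numerator of 3 (0.75 of a task's worth)
moves 0 tasks with floor() but 1 with round(), while a numerator of 5
(1.25 tasks) moves 1 task either way.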
Re: [Patch v2 3/6] sched/fair: Implement prefer sibling imbalance calculation between asymmetric groups
Posted by Tim Chen 2 years, 8 months ago
On Thu, 2023-06-15 at 13:07 +0200, Peter Zijlstra wrote:
> On Tue, Jun 13, 2023 at 10:46:36AM -0700, Tim Chen wrote:
> > On Mon, 2023-06-12 at 14:05 +0200, Peter Zijlstra wrote:
> 
> > > > +		/* Limit tasks moved from preferred group, don't leave cores idle */
> > > > +		limit = busiest->sum_nr_running;
> > > > +		lsub_positive(&limit, ncores_busiest);
> > > > +		if (imbalance > limit)
> > > > +			imbalance = limit;
> > > 
> > > How does this affect the server parts that have larger than single core
> > > turbo domains?
> > 
> > Are you thinking about the case where the local group is completely empty
> > so there's turbo headroom and we should move at least one task, even though
> > the CPUs in the busiest group have higher priority?
> 
> Something along those lines, I didn't think it through, just wondered
> about the wisdom of piling everything in the highest priority 'domain',
> which would depress the turbo range.
> 
> Rjw said that on modern client the frequency domains are per-core, but
> that on server parts they are still wider -- or something along those
> lines. So it might make sense to consider some of that.
> 

I see what you are saying. In that case, even for ASYM_PACKING, it probably
makes more sense to simply distribute tasks in proportion to the number of
cores in each sched group, while paying attention that we do not let the
preferred sched group go idle.
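
Roughly, something like this (an untested sketch, reusing the names from
the patch):

	/* Sketch only: purely proportional spread across the groups */
	imbalance = ncores_local * busiest->sum_nr_running;
	lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
	imbalance /= ncores_local + ncores_busiest;

	/* ... but never leave a favored, partly idle local group empty-handed */
	if (!imbalance &&
	    sched_asym_prefer(env->dst_cpu, sds->busiest->asym_prefer_cpu) &&
	    local->sum_nr_running < ncores_local)
		imbalance = 1;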

> 
> > In other words, are you suggesting we should add
> > 
> > 		if (imbalance == 0 && busiest->sum_nr_running > 0 &&
> > 			local->sum_nr_running == 0)
> > 			imbalance = 1;
> 
> I didn't get that far; and I don't think we have the right topology
> information on the servers to even begin considering the effects of the
> turbo-range, so perhaps it all doesn't matter.
> 
> Just wanted to raise the point for consideration.
> 
> Because as-is, the policy of piling extra on the preferred group doesn't
> immediately make sense. IIRC the whole ITMT preferred core is mostly
> about an extra turbo bin, but if you pile on, the headroom evaporates --
> you'll never turbo that high anymore and special casing it doesn't make
> sense.
> 
> So perhaps I'm not saying more code, but less code is better here.
> 
> Dunno, is any of this measurable either way around?

As far as I know, only client CPUs have ITMT and asymmetric turbo.
So this behavior could only be approximated on a client part's Atom clusters
with the big cores turned off.

> 
> > > > +
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	/* Take advantage of resource in an empty sched group */
> > > > +	if (imbalance == 0 && local->sum_nr_running == 0 &&
> > > > +	    busiest->sum_nr_running > 1)
> > > > +		imbalance = 1;
> > > > +out:
> > > > +	return imbalance << 1;
> > > > +}
> > > 
> > > 
> > > But basically you have:
> > > 
> > >         LcBn - BcLn
> > >   imb = -----------
> > >           Lc + Bc
> > > 
> > > Which makes sense, except you then return:
> > > 
> > >   imb * 2
> > > 
> > > which then made me wonder about rounding.
> > > 
> > > Do we want to add (Lc + Bc - 1) or ((Lc + Bc)/2) to resp. ceil() or round()
> > > the thing before division? Because currently it uses floor().
> > > 
> > > If you evaluate it like:
> > > 
> > > 
> > >         2 * (LcBn - BcLn)
> > >   imb = -----------------
> > >              Lc + Bc
> > > 
> > > The result is different from what you have now.
> > 
> > If I do the rounding after multiplying imb by two (call it imb_new),
> > the difference from the imb I am returning now (call it imb_old)
> > will be at most 1.  Note that the imb_old returned is always a multiple of 2.
> > 
> > I will be using imb in calculate_imbalance() and dividing it
> > by 2 there to get the number of tasks to move from the busiest group.
> > So when there is a difference of 1 between imb_old and imb_new,
> > the difference will be trimmed off after the division by 2.
> > 
> > We will get the same number of tasks to move with either
> > imb_old or imb_new in calculate_imbalance(), so the two
> > computations will eventually arrive at the same result.
> > 
> > > 
> > > What actual behaviour is desired in these low imbalance cases? and can
> > > you write a comment as to why we do as we do etc..?
> > 
> > I do not keep imb as
> > 
> >            2 * (LcBn - BcLn)
> >    imb = -----------------
> >               Lc + Bc
> > 
> > as it is easier to leave out the factor of 2
> > in the middle of the sibling_imbalance() computation,
> > so I can directly interpret imb as the number
> > of tasks to move, and add the factor of two
> > when I actually need to return the imbalance.
> > 
> > Would you like me to add this reasoning in the comments?
> 
> So if we want a multiple of 2, leaving that multiplication off makes
> sense, but I'm not sure I got the argument for or against the rounding.
> 
> floor() gets us 1 task to move when there is at least a whole task's
> worth of imbalance, but less than 2.
> 
> round() would get us 1 task to move when there's at least half a task's
> worth of imbalance but less than 1.5.
> 
> ceil() will migrate on any imbalance, however small -- which will result
> in ping-pong I suppose, so let's disregard that.
> 
> The difference, with the multiplication later, is 0 or 2.
> 
> Does the round() still result in ping-pong?

Will have to experiment to see whether round() is better.
I chose floor() on purpose in my initial implementation
to minimize migrations.

Tim