On AMD processors supporting dynamic preferred core ranking, the
asym_prefer_cpu cached in sched_group can change dynamically. Since
asym_prefer_cpu is cached when the sched domain hierarchy is built,
updating the cached value across the system would require rebuilding
the sched domain which is prohibitively expensive.
All the asym_prefer_cpu comparisons in the load balancing path are only
carried out after the sched group stats have been updated by iterating
over all the CPUs in the group. Uncache asym_prefer_cpu and compute it
while the sched group statistics are being updated as part of sg_lb_stats.
Fixes: f3a052391822 ("cpufreq: amd-pstate: Enable amd-pstate preferred core support")
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
kernel/sched/fair.c | 21 +++++++++++++++++++--
kernel/sched/sched.h | 1 -
kernel/sched/topology.c | 15 +--------------
3 files changed, 20 insertions(+), 17 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3f36805ecdca..166b8e831064 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9911,6 +9911,8 @@ struct sg_lb_stats {
unsigned int sum_nr_running; /* Nr of all tasks running in the group */
unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
unsigned int idle_cpus; /* Nr of idle CPUs in the group */
+ unsigned int asym_prefer_cpu; /* CPU with highest asym priority */
+ int highest_asym_prio; /* Asym priority of asym_prefer_cpu */
unsigned int group_weight;
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
@@ -10243,7 +10245,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group
(sgs->group_weight - sgs->idle_cpus != 1))
return false;
- return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
+ return sched_asym(env->sd, env->dst_cpu, sgs->asym_prefer_cpu);
}
/* One group has more than one SMT CPU while the other group does not */
@@ -10324,6 +10326,17 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
return check_cpu_capacity(rq, sd);
}
+static inline void
+update_sg_pick_asym_prefer(struct sg_lb_stats *sgs, int cpu)
+{
+ int asym_prio = arch_asym_cpu_priority(cpu);
+
+ if (asym_prio > sgs->highest_asym_prio) {
+ sgs->asym_prefer_cpu = cpu;
+ sgs->highest_asym_prio = asym_prio;
+ }
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -10345,6 +10358,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
memset(sgs, 0, sizeof(*sgs));
local_group = group == sds->local;
+ sgs->highest_asym_prio = INT_MIN;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
@@ -10358,6 +10372,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
+ if (sd_flags & SD_ASYM_PACKING)
+ update_sg_pick_asym_prefer(sgs, i);
+
if (cpu_overutilized(i))
*sg_overutilized = 1;
@@ -10479,7 +10496,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_asym_packing:
/* Prefer to move from lowest priority CPU's work */
- return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
+ return sched_asym_prefer(busiest->asym_prefer_cpu, sgs->asym_prefer_cpu);
case group_misfit_task:
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index aef716c41edb..a3f0d326bd11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2047,7 +2047,6 @@ struct sched_group {
unsigned int group_weight;
unsigned int cores;
struct sched_group_capacity *sgc;
- int asym_prefer_cpu; /* CPU of highest priority in group */
int flags;
/*
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9c405f0e7b26..20aa087710f0 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1302,7 +1302,7 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
WARN_ON(!sg);
do {
- int cpu, cores = 0, max_cpu = -1;
+ int cpu, cores = 0;
sg->group_weight = cpumask_weight(sched_group_span(sg));
@@ -1314,19 +1314,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
#endif
}
sg->cores = cores;
-
- if (!(sd->flags & SD_ASYM_PACKING))
- goto next;
-
- for_each_cpu(cpu, sched_group_span(sg)) {
- if (max_cpu < 0)
- max_cpu = cpu;
- else if (sched_asym_prefer(cpu, max_cpu))
- max_cpu = cpu;
- }
- sg->asym_prefer_cpu = max_cpu;
-
-next:
sg = sg->next;
} while (sg != sd->groups);
--
2.34.1
On 12/12/24 00:25, K Prateek Nayak wrote:
> On AMD processors supporting dynamic preferred core ranking, the
> asym_prefer_cpu cached in sched_group can change dynamically. Since
> asym_prefer_cpu is cached when the sched domain hierarchy is built,
> updating the cached value across the system would require rebuilding
> the sched domain which is prohibitively expensive.
>
> All the asym_prefer_cpu comparisons in the load balancing path are only
> carried out after the sched group stats have been updated by iterating
> over all the CPUs in the group. Uncache asym_prefer_cpu and compute it
> while the sched group statistics are being updated as part of sg_lb_stats.
>
> Fixes: f3a052391822 ("cpufreq: amd-pstate: Enable amd-pstate preferred core support")
> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
> ---
> kernel/sched/fair.c | 21 +++++++++++++++++++--
> kernel/sched/sched.h | 1 -
> kernel/sched/topology.c | 15 +--------------
> 3 files changed, 20 insertions(+), 17 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3f36805ecdca..166b8e831064 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9911,6 +9911,8 @@ struct sg_lb_stats {
> unsigned int sum_nr_running; /* Nr of all tasks running in the group */
> unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
> unsigned int idle_cpus; /* Nr of idle CPUs in the group */
> + unsigned int asym_prefer_cpu; /* CPU with highest asym priority */
> + int highest_asym_prio; /* Asym priority of asym_prefer_cpu */
It's better to move this after the group_asym_packing field, so all
related fields are together.
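i.e., something along these lines (untested, only to illustrate the
ordering I had in mind; the surrounding fields are as in the hunk above):

	struct sg_lb_stats {
		...
		unsigned int idle_cpus;			/* Nr of idle CPUs in the group */
		unsigned int group_weight;
		enum group_type group_type;
		unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
		unsigned int asym_prefer_cpu;		/* CPU with highest asym priority */
		int highest_asym_prio;			/* Asym priority of asym_prefer_cpu */
		...
	};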
> unsigned int group_weight;
> enum group_type group_type;
> unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
> @@ -10243,7 +10245,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group
> (sgs->group_weight - sgs->idle_cpus != 1))
> return false;
>
> - return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
> + return sched_asym(env->sd, env->dst_cpu, sgs->asym_prefer_cpu);
> }
>
> /* One group has more than one SMT CPU while the other group does not */
> @@ -10324,6 +10326,17 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
> return check_cpu_capacity(rq, sd);
> }
>
> +static inline void
> +update_sg_pick_asym_prefer(struct sg_lb_stats *sgs, int cpu)
> +{
> + int asym_prio = arch_asym_cpu_priority(cpu);
> +
> + if (asym_prio > sgs->highest_asym_prio) {
> + sgs->asym_prefer_cpu = cpu;
> + sgs->highest_asym_prio = asym_prio;
> + }
> +}
> +
> /**
> * update_sg_lb_stats - Update sched_group's statistics for load balancing.
> * @env: The load balancing environment.
> @@ -10345,6 +10358,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> memset(sgs, 0, sizeof(*sgs));
>
> local_group = group == sds->local;
> + sgs->highest_asym_prio = INT_MIN;
>
> for_each_cpu_and(i, sched_group_span(group), env->cpus) {
> struct rq *rq = cpu_rq(i);
> @@ -10358,6 +10372,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> nr_running = rq->nr_running;
> sgs->sum_nr_running += nr_running;
>
> + if (sd_flags & SD_ASYM_PACKING)
> + update_sg_pick_asym_prefer(sgs, i);
> +
> if (cpu_overutilized(i))
> *sg_overutilized = 1;
>
> @@ -10479,7 +10496,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>
> case group_asym_packing:
> /* Prefer to move from lowest priority CPU's work */
> - return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
> + return sched_asym_prefer(busiest->asym_prefer_cpu, sgs->asym_prefer_cpu);
>
> case group_misfit_task:
> /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index aef716c41edb..a3f0d326bd11 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2047,7 +2047,6 @@ struct sched_group {
> unsigned int group_weight;
> unsigned int cores;
> struct sched_group_capacity *sgc;
> - int asym_prefer_cpu; /* CPU of highest priority in group */
> int flags;
>
> /*
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9c405f0e7b26..20aa087710f0 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1302,7 +1302,7 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
> WARN_ON(!sg);
>
> do {
> - int cpu, cores = 0, max_cpu = -1;
> + int cpu, cores = 0;
>
> sg->group_weight = cpumask_weight(sched_group_span(sg));
>
> @@ -1314,19 +1314,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
> #endif
> }
> sg->cores = cores;
> -
> - if (!(sd->flags & SD_ASYM_PACKING))
> - goto next;
> -
> - for_each_cpu(cpu, sched_group_span(sg)) {
> - if (max_cpu < 0)
> - max_cpu = cpu;
> - else if (sched_asym_prefer(cpu, max_cpu))
> - max_cpu = cpu;
> - }
> - sg->asym_prefer_cpu = max_cpu;
> -
> -next:
> sg = sg->next;
> } while (sg != sd->groups);
>
Tried minimal testing of ASYM_PACKING behavior on a Power10 Shared VM.
It is working as expected with the patch as well (functionality-wise;
performance isn't tested).
Hello Shrikanth,
On 12/13/2024 8:32 PM, Shrikanth Hegde wrote:
>
>
> On 12/12/24 00:25, K Prateek Nayak wrote:
>> On AMD processors supporting dynamic preferred core ranking, the
>> asym_prefer_cpu cached in sched_group can change dynamically. Since
>> asym_prefer_cpu is cached when the sched domain hierarchy is built,
>> updating the cached value across the system would require rebuilding
>> the sched domain which is prohibitively expensive.
>>
>> All the asym_prefer_cpu comparisons in the load balancing path are only
>> carried out after the sched group stats have been updated by iterating
>> over all the CPUs in the group. Uncache asym_prefer_cpu and compute it
>> while the sched group statistics are being updated as part of sg_lb_stats.
>>
>> Fixes: f3a052391822 ("cpufreq: amd-pstate: Enable amd-pstate preferred core support")
>> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
>> ---
>> kernel/sched/fair.c | 21 +++++++++++++++++++--
>> kernel/sched/sched.h | 1 -
>> kernel/sched/topology.c | 15 +--------------
>> 3 files changed, 20 insertions(+), 17 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 3f36805ecdca..166b8e831064 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -9911,6 +9911,8 @@ struct sg_lb_stats {
>> unsigned int sum_nr_running; /* Nr of all tasks running in the group */
>> unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
>> unsigned int idle_cpus; /* Nr of idle CPUs in the group */
>> + unsigned int asym_prefer_cpu; /* CPU with highest asym priority */
>> + int highest_asym_prio; /* Asym priority of asym_prefer_cpu */
>
> It's better to move this after the group_asym_packing field, so all related fields are together.
Sure, I'll move them around in the next iteration if folks are okay
with this approach.
>
>> unsigned int group_weight;
>> enum group_type group_type;
>> unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
>> [..snip..]
>
> Tried minimal testing of ASYM_PACKING behavior on a Power10 Shared VM. It is working as expected with the patch as well (functionality-wise; performance isn't tested).
Thank you for testing! Let me know if there are any visible regressions,
in which case let's see if the alternate approach suggested in the cover
letter fares any better.
Thanks a ton for reviewing and testing the series.
--
Thanks and Regards,
Prateek