[RFC][PATCH v0.1 6/6] cpufreq: intel_pstate: Add basic EAS support on hybrid platforms

Posted by Rafael J. Wysocki 1 year, 3 months ago

From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Modify intel_pstate to register stub EM perf domains for CPUs on
hybrid platforms via em_dev_register_perf_domain() and to use
em_dev_expand_perf_domain() introduced previously for adding new
CPUs to existing EM perf domains when those CPUs become online for
the first time after driver initialization.

This change is targeting platforms (for example, Lunar Lake) where
"small" CPUs (E-cores) are always more energy-efficient than the "big"
or "performance" CPUs (P-cores) when run at the same HWP performance
level, so it is sufficient to tell the EAS that E-cores are always
preferred (so long as there is enough spare capacity on one of them
to run the given task).

Accordingly, the perf domains are registered per CPU type (that is,
all P-cores belong to one perf domain and all E-cores belong to another
perf domain) and they are registered only if asymmetric CPU capacity is
enabled.  Each perf domain has a one-element states table and that
element only contains the relative cost value (the other fields in
it are not initialized, so they are all equal to zero), and the cost
value for the E-core perf domain is lower.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c |  110 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 104 insertions(+), 6 deletions(-)

Index: linux-pm/drivers/cpufreq/intel_pstate.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/intel_pstate.c
+++ linux-pm/drivers/cpufreq/intel_pstate.c
@@ -8,6 +8,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/energy_model.h>
 #include <linux/kernel.h>
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
@@ -938,6 +939,12 @@ static struct freq_attr *hwp_cpufreq_att
 	NULL,
 };
 
+enum hybrid_cpu_type {
+	HYBRID_PCORE = 0,
+	HYBRID_ECORE,
+	HYBRID_NR_TYPES
+};
+
 static struct cpudata *hybrid_max_perf_cpu __read_mostly;
 /*
  * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata,
@@ -945,6 +952,86 @@ static struct cpudata *hybrid_max_perf_c
  */
 static DEFINE_MUTEX(hybrid_capacity_lock);
 
+#ifdef CONFIG_ENERGY_MODEL
+struct hybrid_em_perf_domain {
+	cpumask_t cpumask;
+	struct device *dev;
+	struct em_data_callback cb;
+};
+
+static int hybrid_pcore_cost(struct device *dev, unsigned long freq,
+			     unsigned long *cost)
+{
+	/*
+	 * The number used here needs to be higher than the analogous
+	 * one in hybrid_ecore_cost() below.  The units and the actual
+	 * values don't matter.
+	 */
+	*cost = 2;
+	return 0;
+}
+
+static int hybrid_ecore_cost(struct device *dev, unsigned long freq,
+			     unsigned long *cost)
+{
+	*cost = 1;
+	return 0;
+}
+
+static struct hybrid_em_perf_domain perf_domains[HYBRID_NR_TYPES] = {
+	[HYBRID_PCORE] = { .cb.get_cost = hybrid_pcore_cost, },
+	[HYBRID_ECORE] = { .cb.get_cost = hybrid_ecore_cost, }
+};
+
+static bool hybrid_register_perf_domain(struct hybrid_em_perf_domain *pd)
+{
+	/*
+	 * Registering EM perf domains without asymmetric CPU capacity
+	 * support enabled is wasteful, so don't do that.
+	 */
+	if (!hybrid_max_perf_cpu)
+		return false;
+
+	pd->dev = get_cpu_device(cpumask_first(&pd->cpumask));
+	if (!pd->dev)
+		return false;
+
+	if (em_dev_register_perf_domain(pd->dev, 1, &pd->cb, &pd->cpumask, false)) {
+		pd->dev = NULL;
+		return false;
+	}
+
+	return true;
+}
+
+static void hybrid_register_all_perf_domains(void)
+{
+	enum hybrid_cpu_type type;
+
+	for (type = HYBRID_PCORE; type < HYBRID_NR_TYPES; type++)
+		hybrid_register_perf_domain(&perf_domains[type]);
+}
+
+static void hybrid_add_to_perf_domain(int cpu, enum hybrid_cpu_type type)
+{
+	struct hybrid_em_perf_domain *pd = &perf_domains[type];
+
+	guard(mutex)(&hybrid_capacity_lock);
+
+	if (cpumask_test_cpu(cpu, &pd->cpumask))
+		return;
+
+	cpumask_set_cpu(cpu, &pd->cpumask);
+	if (pd->dev)
+		em_dev_expand_perf_domain(pd->dev, cpu);
+	else if (hybrid_register_perf_domain(pd))
+		em_rebuild_perf_domains();
+}
+#else /* CONFIG_ENERGY_MODEL */
+static inline void hybrid_register_all_perf_domains(void) {}
+static inline void hybrid_add_to_perf_domain(int cpu, enum hybrid_cpu_type type) {}
+#endif /* !CONFIG_ENERGY_MODEL */
+
 static void hybrid_set_cpu_capacity(struct cpudata *cpu)
 {
 	arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf,
@@ -1034,11 +1121,14 @@ static void __hybrid_refresh_cpu_capacit
 	hybrid_update_cpu_capacity_scaling();
 }
 
-static void hybrid_refresh_cpu_capacity_scaling(void)
+static void hybrid_refresh_cpu_capacity_scaling(bool register_perf_domains)
 {
 	guard(mutex)(&hybrid_capacity_lock);
 
 	__hybrid_refresh_cpu_capacity_scaling();
+
+	if (register_perf_domains)
+		hybrid_register_all_perf_domains();
 }
 
 static void hybrid_init_cpu_capacity_scaling(bool refresh)
@@ -1049,7 +1139,7 @@ static void hybrid_init_cpu_capacity_sca
 	 * operation mode.
 	 */
 	if (refresh) {
-		hybrid_refresh_cpu_capacity_scaling();
+		hybrid_refresh_cpu_capacity_scaling(false);
 		return;
 	}
 
@@ -1059,10 +1149,14 @@ static void hybrid_init_cpu_capacity_sca
 	 * do not do that when SMT is in use.
 	 */
 	if (hwp_is_hybrid && !sched_smt_active() && arch_enable_hybrid_capacity_scale()) {
-		hybrid_refresh_cpu_capacity_scaling();
+		/*
+		 * Perf domains are not registered before setting hybrid_max_perf_cpu,
+		 * so register them all after setting up CPU capacity scaling.
+		 */
+		hybrid_refresh_cpu_capacity_scaling(true);
 		/*
 		 * Disabling ITMT causes sched domains to be rebuilt to disable asym
-		 * packing and enable asym capacity.
+		 * packing and enable asym capacity and EAS.
 		 */
 		sched_clear_itmt_support();
 	}
@@ -2215,12 +2309,16 @@ static int hwp_get_cpu_scaling(int cpu)
 
 	smp_call_function_single(cpu, hybrid_get_type, &cpu_type, 1);
 	/* P-cores have a smaller perf level-to-freqency scaling factor. */
-	if (cpu_type == 0x40)
+	if (cpu_type == 0x40) {
+		hybrid_add_to_perf_domain(cpu, HYBRID_PCORE);
 		return hybrid_scaling_factor;
+	}
 
 	/* Use default core scaling for E-cores */
-	if (cpu_type == 0x20)
+	if (cpu_type == 0x20) {
+		hybrid_add_to_perf_domain(cpu, HYBRID_ECORE);
 		return core_get_scaling();
+	}
 
 	/*
 	 * If reached here, this system is either non-hybrid (like Tiger

Re: [RFC][PATCH v0.1 6/6] cpufreq: intel_pstate: Add basic EAS support on hybrid platforms

Posted by Pierre Gondois 1 year, 2 months ago


On 11/8/24 17:46, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> Modify intel_pstate to register stub EM perf domains for CPUs on
> hybrid platforms via em_dev_register_perf_domain() and to use
> em_dev_expand_perf_domain() introduced previously for adding new
> CPUs to existing EM perf domains when those CPUs become online for
> the first time after driver initialization.
> 
> This change is targeting platforms (for example, Lunar Lake) where
> "small" CPUs (E-cores) are always more energy-efficient than the "big"
> or "performance" CPUs (P-cores) when run at the same HWP performance
> level, so it is sufficient to tell the EAS that E-cores are always
> preferred (so long as there is enough spare capacity on one of them
> to run the given task).
> 
> Accordingly, the perf domains are registered per CPU type (that is,
> all P-cores belong to one perf domain and all E-cores belong to another
> perf domain) and they are registered only if asymmetric CPU capacity is
> enabled.  Each perf domain has a one-element states table and that
> element only contains the relative cost value (the other fields in
> it are not initialized, so they are all equal to zero), and the cost
> value for the E-core perf domain is lower.
> 
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
>   drivers/cpufreq/intel_pstate.c |  110 ++++++++++++++++++++++++++++++++++++++---
>   1 file changed, 104 insertions(+), 6 deletions(-)
> 
> Index: linux-pm/drivers/cpufreq/intel_pstate.c
> ===================================================================
> --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> +++ linux-pm/drivers/cpufreq/intel_pstate.c
> @@ -8,6 +8,7 @@
>   
>   #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>   
> +#include <linux/energy_model.h>
>   #include <linux/kernel.h>
>   #include <linux/kernel_stat.h>
>   #include <linux/module.h>
> @@ -938,6 +939,12 @@ static struct freq_attr *hwp_cpufreq_att
>   	NULL,
>   };
>   
> +enum hybrid_cpu_type {
> +	HYBRID_PCORE = 0,
> +	HYBRID_ECORE,
> +	HYBRID_NR_TYPES
> +};
> +
>   static struct cpudata *hybrid_max_perf_cpu __read_mostly;
>   /*
>    * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata,
> @@ -945,6 +952,86 @@ static struct cpudata *hybrid_max_perf_c
>    */
>   static DEFINE_MUTEX(hybrid_capacity_lock);
>   
> +#ifdef CONFIG_ENERGY_MODEL
> +struct hybrid_em_perf_domain {
> +	cpumask_t cpumask;
> +	struct device *dev;
> +	struct em_data_callback cb;
> +};
> +
> +static int hybrid_pcore_cost(struct device *dev, unsigned long freq,
> +			     unsigned long *cost)
> +{
> +	/*
> +	 * The number used here needs to be higher than the analogous
> +	 * one in hybrid_ecore_cost() below.  The units and the actual
> +	 * values don't matter.
> +	 */
> +	*cost = 2;
> +	return 0;
> +}
> +
> +static int hybrid_ecore_cost(struct device *dev, unsigned long freq,
> +			     unsigned long *cost)
> +{
> +	*cost = 1;
> +	return 0;
> +}

The artificial EM was introduced for CPPC based platforms since these platforms
only provide an 'efficiency class' entry to describe the relative efficiency
of CPUs. The case seems similar to yours.
'Fake' OPPs were created to have an incentive for EAS to balance the load on
the CPUs in one perf. domain. Indeed, in feec(), during the energy
computation of a pd, if the cost is independent from the max_util value,
then one CPU in the pd could end up having a high util, and another CPU a
NULL util.
For CPPC platforms, this was problematic as a lower OPP could have been
selected for the same load, so energy was lost for no reason.

In your case it seems that the OPP selection is done independently on each
CPU. However I assume it is still more energy efficient to have 2 CPUs
loaded at 50% than one CPU loaded at 100% and an idle CPU.

Also as Dietmar suggested, maybe it would make sense to have some
way to prefer an CPU with a "energy saving" HFI configuration than
a similar CPU with a "performance" HFI configuration.

Also, out of curiosity, do you have energy numbers to share ?

Regards,
Pierre

Re: [RFC][PATCH v0.1 6/6] cpufreq: intel_pstate: Add basic EAS support on hybrid platforms

Posted by Rafael J. Wysocki 1 year, 2 months ago

On Mon, Nov 18, 2024 at 5:34 PM Pierre Gondois <pierre.gondois@arm.com> wrote:
>
>
>
> On 11/8/24 17:46, Rafael J. Wysocki wrote:
> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >
> > Modify intel_pstate to register stub EM perf domains for CPUs on
> > hybrid platforms via em_dev_register_perf_domain() and to use
> > em_dev_expand_perf_domain() introduced previously for adding new
> > CPUs to existing EM perf domains when those CPUs become online for
> > the first time after driver initialization.
> >
> > This change is targeting platforms (for example, Lunar Lake) where
> > "small" CPUs (E-cores) are always more energy-efficient than the "big"
> > or "performance" CPUs (P-cores) when run at the same HWP performance
> > level, so it is sufficient to tell the EAS that E-cores are always
> > preferred (so long as there is enough spare capacity on one of them
> > to run the given task).
> >
> > Accordingly, the perf domains are registered per CPU type (that is,
> > all P-cores belong to one perf domain and all E-cores belong to another
> > perf domain) and they are registered only if asymmetric CPU capacity is
> > enabled.  Each perf domain has a one-element states table and that
> > element only contains the relative cost value (the other fields in
> > it are not initialized, so they are all equal to zero), and the cost
> > value for the E-core perf domain is lower.
> >
> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > ---
> >   drivers/cpufreq/intel_pstate.c |  110 ++++++++++++++++++++++++++++++++++++++---
> >   1 file changed, 104 insertions(+), 6 deletions(-)
> >
> > Index: linux-pm/drivers/cpufreq/intel_pstate.c
> > ===================================================================
> > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> > +++ linux-pm/drivers/cpufreq/intel_pstate.c
> > @@ -8,6 +8,7 @@
> >
> >   #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> >
> > +#include <linux/energy_model.h>
> >   #include <linux/kernel.h>
> >   #include <linux/kernel_stat.h>
> >   #include <linux/module.h>
> > @@ -938,6 +939,12 @@ static struct freq_attr *hwp_cpufreq_att
> >       NULL,
> >   };
> >
> > +enum hybrid_cpu_type {
> > +     HYBRID_PCORE = 0,
> > +     HYBRID_ECORE,
> > +     HYBRID_NR_TYPES
> > +};
> > +
> >   static struct cpudata *hybrid_max_perf_cpu __read_mostly;
> >   /*
> >    * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata,
> > @@ -945,6 +952,86 @@ static struct cpudata *hybrid_max_perf_c
> >    */
> >   static DEFINE_MUTEX(hybrid_capacity_lock);
> >
> > +#ifdef CONFIG_ENERGY_MODEL
> > +struct hybrid_em_perf_domain {
> > +     cpumask_t cpumask;
> > +     struct device *dev;
> > +     struct em_data_callback cb;
> > +};
> > +
> > +static int hybrid_pcore_cost(struct device *dev, unsigned long freq,
> > +                          unsigned long *cost)
> > +{
> > +     /*
> > +      * The number used here needs to be higher than the analogous
> > +      * one in hybrid_ecore_cost() below.  The units and the actual
> > +      * values don't matter.
> > +      */
> > +     *cost = 2;
> > +     return 0;
> > +}
> > +
> > +static int hybrid_ecore_cost(struct device *dev, unsigned long freq,
> > +                          unsigned long *cost)
> > +{
> > +     *cost = 1;
> > +     return 0;
> > +}
>
> The artificial EM was introduced for CPPC based platforms since these platforms
> only provide an 'efficiency class' entry to describe the relative efficiency
> of CPUs. The case seems similar to yours.

It is, but I don't particularly like the CPPC driver's approach to this.

> 'Fake' OPPs were created to have an incentive for EAS to balance the load on
> the CPUs in one perf. domain. Indeed, in feec(), during the energy
> computation of a pd, if the cost is independent from the max_util value,
> then one CPU in the pd could end up having a high util, and another CPU a
> NULL util.

Isn't this a consequence of disabling load balancing by EAS?

> For CPPC platforms, this was problematic as a lower OPP could have been
> selected for the same load, so energy was lost for no reason.
>
> In your case it seems that the OPP selection is done independently on each
> CPU. However I assume it is still more energy efficient to have 2 CPUs
> loaded at 50% than one CPU loaded at 100% and an idle CPU.

Maybe.

It really depends on the cost of the idle state etc.

> Also as Dietmar suggested, maybe it would make sense to have some
> way to prefer an CPU with a "energy saving" HFI configuration than
> a similar CPU with a "performance" HFI configuration.

As it happens, E-cores have higher energy-efficiency scores in HFI AFAICS.

> Also, out of curiosity, do you have energy numbers to share ?

Not yet, but there will be some going forward.

Thanks!

Re: [RFC][PATCH v0.1 6/6] cpufreq: intel_pstate: Add basic EAS support on hybrid platforms

Posted by Pierre Gondois 1 year, 1 month ago


On 11/19/24 18:20, Rafael J. Wysocki wrote:
> On Mon, Nov 18, 2024 at 5:34 PM Pierre Gondois <pierre.gondois@arm.com> wrote:
>>
>>
>>
>> On 11/8/24 17:46, Rafael J. Wysocki wrote:
>>> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>>>
>>> Modify intel_pstate to register stub EM perf domains for CPUs on
>>> hybrid platforms via em_dev_register_perf_domain() and to use
>>> em_dev_expand_perf_domain() introduced previously for adding new
>>> CPUs to existing EM perf domains when those CPUs become online for
>>> the first time after driver initialization.
>>>
>>> This change is targeting platforms (for example, Lunar Lake) where
>>> "small" CPUs (E-cores) are always more energy-efficient than the "big"
>>> or "performance" CPUs (P-cores) when run at the same HWP performance
>>> level, so it is sufficient to tell the EAS that E-cores are always
>>> preferred (so long as there is enough spare capacity on one of them
>>> to run the given task).
>>>
>>> Accordingly, the perf domains are registered per CPU type (that is,
>>> all P-cores belong to one perf domain and all E-cores belong to another
>>> perf domain) and they are registered only if asymmetric CPU capacity is
>>> enabled.  Each perf domain has a one-element states table and that
>>> element only contains the relative cost value (the other fields in
>>> it are not initialized, so they are all equal to zero), and the cost
>>> value for the E-core perf domain is lower.
>>>
>>> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>>> ---
>>>    drivers/cpufreq/intel_pstate.c |  110 ++++++++++++++++++++++++++++++++++++++---
>>>    1 file changed, 104 insertions(+), 6 deletions(-)
>>>
>>> Index: linux-pm/drivers/cpufreq/intel_pstate.c
>>> ===================================================================
>>> --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
>>> +++ linux-pm/drivers/cpufreq/intel_pstate.c
>>> @@ -8,6 +8,7 @@
>>>
>>>    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>>>
>>> +#include <linux/energy_model.h>
>>>    #include <linux/kernel.h>
>>>    #include <linux/kernel_stat.h>
>>>    #include <linux/module.h>
>>> @@ -938,6 +939,12 @@ static struct freq_attr *hwp_cpufreq_att
>>>        NULL,
>>>    };
>>>
>>> +enum hybrid_cpu_type {
>>> +     HYBRID_PCORE = 0,
>>> +     HYBRID_ECORE,
>>> +     HYBRID_NR_TYPES
>>> +};
>>> +
>>>    static struct cpudata *hybrid_max_perf_cpu __read_mostly;
>>>    /*
>>>     * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata,
>>> @@ -945,6 +952,86 @@ static struct cpudata *hybrid_max_perf_c
>>>     */
>>>    static DEFINE_MUTEX(hybrid_capacity_lock);
>>>
>>> +#ifdef CONFIG_ENERGY_MODEL
>>> +struct hybrid_em_perf_domain {
>>> +     cpumask_t cpumask;
>>> +     struct device *dev;
>>> +     struct em_data_callback cb;
>>> +};
>>> +
>>> +static int hybrid_pcore_cost(struct device *dev, unsigned long freq,
>>> +                          unsigned long *cost)
>>> +{
>>> +     /*
>>> +      * The number used here needs to be higher than the analogous
>>> +      * one in hybrid_ecore_cost() below.  The units and the actual
>>> +      * values don't matter.
>>> +      */
>>> +     *cost = 2;
>>> +     return 0;
>>> +}
>>> +
>>> +static int hybrid_ecore_cost(struct device *dev, unsigned long freq,
>>> +                          unsigned long *cost)
>>> +{
>>> +     *cost = 1;
>>> +     return 0;
>>> +}
>>
>> The artificial EM was introduced for CPPC based platforms since these platforms
>> only provide an 'efficiency class' entry to describe the relative efficiency
>> of CPUs. The case seems similar to yours.
> 
> It is, but I don't particularly like the CPPC driver's approach to this.
> 
>> 'Fake' OPPs were created to have an incentive for EAS to balance the load on
>> the CPUs in one perf. domain. Indeed, in feec(), during the energy
>> computation of a pd, if the cost is independent from the max_util value,
>> then one CPU in the pd could end up having a high util, and another CPU a
>> NULL util.
> 
> Isn't this a consequence of disabling load balancing by EAS?

Yes. Going in that direction, this patch from Vincent should help balancing
the load in your case I think. The patch evaluates other factors when the energy
cost of multiple CPU-candidates is the same.

Meaning, if all CPUs of the same type have only one OPP, the number of tasks
and the the load of the CPUs is then compared. This is not the case currently.
Doing so will help to avoid having one CPU close to being overutilized while
others are idle.

However I think it would still be better to have multiple OPPs in the energy model.
Indeed, it would be closer to reality as I assume that for Intel aswell, there
might be frequency domains and the frequency of the domain is lead by the most
utilized CPU.
This would also avoid hitting corner cases. As if there is one big task and many
small tasks, balancing on the number of tasks per CPU is not the best idea.

https://lore.kernel.org/all/20240830130309.2141697-4-vincent.guittot@linaro.org/

> 
>> For CPPC platforms, this was problematic as a lower OPP could have been
>> selected for the same load, so energy was lost for no reason.
>>
>> In your case it seems that the OPP selection is done independently on each
>> CPU. However I assume it is still more energy efficient to have 2 CPUs
>> loaded at 50% than one CPU loaded at 100% and an idle CPU.
> 
> Maybe.
> 
> It really depends on the cost of the idle state etc.
> 
>> Also as Dietmar suggested, maybe it would make sense to have some
>> way to prefer an CPU with a "energy saving" HFI configuration than
>> a similar CPU with a "performance" HFI configuration.
> 
> As it happens, E-cores have higher energy-efficiency scores in HFI AFAICS.
> 
>> Also, out of curiosity, do you have energy numbers to share ?
> 
> Not yet, but there will be some going forward.
> 
> Thanks!

Re: [RFC][PATCH v0.1 6/6] cpufreq: intel_pstate: Add basic EAS support on hybrid platforms

Posted by Rafael J. Wysocki 1 year, 1 month ago

On Mon, Dec 16, 2024 at 4:33 PM Pierre Gondois <pierre.gondois@arm.com> wrote:
>
>
>
> On 11/19/24 18:20, Rafael J. Wysocki wrote:
> > On Mon, Nov 18, 2024 at 5:34 PM Pierre Gondois <pierre.gondois@arm.com> wrote:
> >>
> >>
> >>
> >> On 11/8/24 17:46, Rafael J. Wysocki wrote:
> >>> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >>>
> >>> Modify intel_pstate to register stub EM perf domains for CPUs on
> >>> hybrid platforms via em_dev_register_perf_domain() and to use
> >>> em_dev_expand_perf_domain() introduced previously for adding new
> >>> CPUs to existing EM perf domains when those CPUs become online for
> >>> the first time after driver initialization.
> >>>
> >>> This change is targeting platforms (for example, Lunar Lake) where
> >>> "small" CPUs (E-cores) are always more energy-efficient than the "big"
> >>> or "performance" CPUs (P-cores) when run at the same HWP performance
> >>> level, so it is sufficient to tell the EAS that E-cores are always
> >>> preferred (so long as there is enough spare capacity on one of them
> >>> to run the given task).
> >>>
> >>> Accordingly, the perf domains are registered per CPU type (that is,
> >>> all P-cores belong to one perf domain and all E-cores belong to another
> >>> perf domain) and they are registered only if asymmetric CPU capacity is
> >>> enabled.  Each perf domain has a one-element states table and that
> >>> element only contains the relative cost value (the other fields in
> >>> it are not initialized, so they are all equal to zero), and the cost
> >>> value for the E-core perf domain is lower.
> >>>
> >>> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >>> ---
> >>>    drivers/cpufreq/intel_pstate.c |  110 ++++++++++++++++++++++++++++++++++++++---
> >>>    1 file changed, 104 insertions(+), 6 deletions(-)
> >>>
> >>> Index: linux-pm/drivers/cpufreq/intel_pstate.c
> >>> ===================================================================
> >>> --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> >>> +++ linux-pm/drivers/cpufreq/intel_pstate.c
> >>> @@ -8,6 +8,7 @@
> >>>
> >>>    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> >>>
> >>> +#include <linux/energy_model.h>
> >>>    #include <linux/kernel.h>
> >>>    #include <linux/kernel_stat.h>
> >>>    #include <linux/module.h>
> >>> @@ -938,6 +939,12 @@ static struct freq_attr *hwp_cpufreq_att
> >>>        NULL,
> >>>    };
> >>>
> >>> +enum hybrid_cpu_type {
> >>> +     HYBRID_PCORE = 0,
> >>> +     HYBRID_ECORE,
> >>> +     HYBRID_NR_TYPES
> >>> +};
> >>> +
> >>>    static struct cpudata *hybrid_max_perf_cpu __read_mostly;
> >>>    /*
> >>>     * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata,
> >>> @@ -945,6 +952,86 @@ static struct cpudata *hybrid_max_perf_c
> >>>     */
> >>>    static DEFINE_MUTEX(hybrid_capacity_lock);
> >>>
> >>> +#ifdef CONFIG_ENERGY_MODEL
> >>> +struct hybrid_em_perf_domain {
> >>> +     cpumask_t cpumask;
> >>> +     struct device *dev;
> >>> +     struct em_data_callback cb;
> >>> +};
> >>> +
> >>> +static int hybrid_pcore_cost(struct device *dev, unsigned long freq,
> >>> +                          unsigned long *cost)
> >>> +{
> >>> +     /*
> >>> +      * The number used here needs to be higher than the analogous
> >>> +      * one in hybrid_ecore_cost() below.  The units and the actual
> >>> +      * values don't matter.
> >>> +      */
> >>> +     *cost = 2;
> >>> +     return 0;
> >>> +}
> >>> +
> >>> +static int hybrid_ecore_cost(struct device *dev, unsigned long freq,
> >>> +                          unsigned long *cost)
> >>> +{
> >>> +     *cost = 1;
> >>> +     return 0;
> >>> +}
> >>
> >> The artificial EM was introduced for CPPC based platforms since these platforms
> >> only provide an 'efficiency class' entry to describe the relative efficiency
> >> of CPUs. The case seems similar to yours.
> >
> > It is, but I don't particularly like the CPPC driver's approach to this.
> >
> >> 'Fake' OPPs were created to have an incentive for EAS to balance the load on
> >> the CPUs in one perf. domain. Indeed, in feec(), during the energy
> >> computation of a pd, if the cost is independent from the max_util value,
> >> then one CPU in the pd could end up having a high util, and another CPU a
> >> NULL util.
> >
> > Isn't this a consequence of disabling load balancing by EAS?
>
> Yes. Going in that direction, this patch from Vincent should help balancing
> the load in your case I think. The patch evaluates other factors when the energy
> cost of multiple CPU-candidates is the same.
>
> Meaning, if all CPUs of the same type have only one OPP, the number of tasks
> and the the load of the CPUs is then compared. This is not the case currently.
> Doing so will help to avoid having one CPU close to being overutilized while
> others are idle.
>
> However I think it would still be better to have multiple OPPs in the energy model.
> Indeed, it would be closer to reality as I assume that for Intel aswell, there
> might be frequency domains and the frequency of the domain is lead by the most
> utilized CPU.

There are a couple of problems with this on my target platforms.

First, it is not actually known what the real OPPs are and how the
coordination works.

For some cores (P-cores mostly) the voltage can be adjusted per-core
and for some others there are coordination domains, but the
coordination there involves idle states (for instance, one core may be
allowed to run at the max turbo frequency when the other ones in the
same domain are in idle states, but not otherwise) and dynamic
balancing (that is, the effective capacity depends on how much energy
is used by the domain over time).

Thus whatever is put into the perf states table will be way off most
of the time and there isn't even a good way to choose the numbers to
put in there.  Using the entire range of HWP P-states for that would
be completely impractical IMV because it would only increase overhead
for no real benefit.  Either it would need to be done per-CPU, which
doesn't really make sense because CPUs of the same type really share
the same cost-performance curve, or the assumption that the entire
domain is led by the most utilized CPU would need to be lifted to some
extent.  That would require some more intrusive changes in EAS which
I'd rather avoid unless the simplest approach doesn't work.

The second problem is that the current platforms are much smaller than
what we're expecting to see in the future and whatever is done today
needs to scale.

Also, I really wouldn't want to have to register special perf domains
for favored cores that share the cost-performance curve with the other
cores of the same type except that they can turbo-up higher if
everyone else is idle or similar.

> This would also avoid hitting corner cases. As if there is one big task and many
> small tasks, balancing on the number of tasks per CPU is not the best idea.
>
> https://lore.kernel.org/all/20240830130309.2141697-4-vincent.guittot@linaro.org/

Understood.

Re: [RFC][PATCH v0.1 6/6] cpufreq: intel_pstate: Add basic EAS support on hybrid platforms

Posted by Dietmar Eggemann 1 year, 2 months ago

On 08/11/2024 17:46, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> Modify intel_pstate to register stub EM perf domains for CPUs on
> hybrid platforms via em_dev_register_perf_domain() and to use
> em_dev_expand_perf_domain() introduced previously for adding new
> CPUs to existing EM perf domains when those CPUs become online for
> the first time after driver initialization.
> 
> This change is targeting platforms (for example, Lunar Lake) where
> "small" CPUs (E-cores) are always more energy-efficient than the "big"
> or "performance" CPUs (P-cores) when run at the same HWP performance
> level, so it is sufficient to tell the EAS that E-cores are always
> preferred (so long as there is enough spare capacity on one of them
> to run the given task).

By treating all big CPUs (ignoring the different itmt prio values
between them) we would have a system in which PD's are not in sync with
the asym_cap_list* or the CPU capacities of individual CPUs and sched
groups within the sched domain. Not sure if we want to go this way?

* used by misfit handling - 22d5607400c6 ("sched/fair: Check if a task
has a fitting CPU when updating misfit")

> Accordingly, the perf domains are registered per CPU type (that is,
> all P-cores belong to one perf domain and all E-cores belong to another
> perf domain) and they are registered only if asymmetric CPU capacity is
> enabled.  Each perf domain has a one-element states table and that
> element only contains the relative cost value (the other fields in
> it are not initialized, so they are all equal to zero), and the cost
> value for the E-core perf domain is lower.

[...]

> +static int hybrid_pcore_cost(struct device *dev, unsigned long freq,
> +			     unsigned long *cost)
> +{
> +	/*
> +	 * The number used here needs to be higher than the analogous
> +	 * one in hybrid_ecore_cost() below.  The units and the actual
> +	 * values don't matter.
> +	 */
> +	*cost = 2;
> +	return 0;

So you're not tying this to HFI energy scores?

> +}
> +
> +static int hybrid_ecore_cost(struct device *dev, unsigned long freq,
> +			     unsigned long *cost)
> +{
> +	*cost = 1;
> +	return 0;
> +}
> +
> +static struct hybrid_em_perf_domain perf_domains[HYBRID_NR_TYPES] = {
> +	[HYBRID_PCORE] = { .cb.get_cost = hybrid_pcore_cost, },
> +	[HYBRID_ECORE] = { .cb.get_cost = hybrid_ecore_cost, }
> +};
> +
> +static bool hybrid_register_perf_domain(struct hybrid_em_perf_domain *pd)
> +{
> +	/*
> +	 * Registering EM perf domains without asymmetric CPU capacity
> +	 * support enabled is wasteful, so don't do that.
> +	 */
> +	if (!hybrid_max_perf_cpu)
> +		return false;
> +
> +	pd->dev = get_cpu_device(cpumask_first(&pd->cpumask));
> +	if (!pd->dev)
> +		return false;
> +
> +	if (em_dev_register_perf_domain(pd->dev, 1, &pd->cb, &pd->cpumask, false)) {
> +		pd->dev = NULL;
> +		return false;
> +	}
> +
> +	return true;
> +}

What are the issues in case you would use the existing ways (non-stub)
to setup the EM?

static int intel_pstate_get_cpu_cost()

static void intel_pstate_register_em(struct cpufreq_policy *policy)

  struct em_data_callback em_cb = EM_ADV_DATA_CB(NULL,
                                              intel_pstate_get_cpu_cost)

  em_dev_register_perf_domain(get_cpu_device(policy->cpu), 1,
                              &em_cb, policy->related_cpus, 1);
                                      ^^^^^^^^^^^^^^^^^^^^*

static void intel_pstate_set_register_em_fct(void)

  default_driver->register_em = intel_pstate_register_em

static int __init intel_pstate_init(void)

  ...
  intel_pstate_set_register_em_fct()
  ...

I guess one issue is the per-CPU policy as an argument to
em_dev_register_perf_domain() (*) ?

> +static void hybrid_register_all_perf_domains(void)
> +{
> +	enum hybrid_cpu_type type;
> +
> +	for (type = HYBRID_PCORE; type < HYBRID_NR_TYPES; type++)
> +		hybrid_register_perf_domain(&perf_domains[type]);
> +}
> +
> +static void hybrid_add_to_perf_domain(int cpu, enum hybrid_cpu_type type)
> +{
> +	struct hybrid_em_perf_domain *pd = &perf_domains[type];
> +
> +	guard(mutex)(&hybrid_capacity_lock);
> +
> +	if (cpumask_test_cpu(cpu, &pd->cpumask))
> +		return;
> +
> +	cpumask_set_cpu(cpu, &pd->cpumask);
> +	if (pd->dev)
> +		em_dev_expand_perf_domain(pd->dev, cpu);
> +	else if (hybrid_register_perf_domain(pd))
> +		em_rebuild_perf_domains();

I assume that the 'if' and the 'else if' condition here are only taken
when the CPU is brought online after boot?

[...]

Re: [RFC][PATCH v0.1 6/6] cpufreq: intel_pstate: Add basic EAS support on hybrid platforms

Posted by Rafael J. Wysocki 1 year, 2 months ago

First off, thanks for all  the feedback!

On Tue, Nov 12, 2024 at 9:21 AM Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
>
> On 08/11/2024 17:46, Rafael J. Wysocki wrote:
> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> >
> > Modify intel_pstate to register stub EM perf domains for CPUs on
> > hybrid platforms via em_dev_register_perf_domain() and to use
> > em_dev_expand_perf_domain() introduced previously for adding new
> > CPUs to existing EM perf domains when those CPUs become online for
> > the first time after driver initialization.
> >
> > This change is targeting platforms (for example, Lunar Lake) where
> > "small" CPUs (E-cores) are always more energy-efficient than the "big"
> > or "performance" CPUs (P-cores) when run at the same HWP performance
> > level, so it is sufficient to tell the EAS that E-cores are always
> > preferred (so long as there is enough spare capacity on one of them
> > to run the given task).
>
> By treating all big CPUs (ignoring the different itmt prio values
> between them) we would have a system in which PD's are not in sync with
> the asym_cap_list* or the CPU capacities of individual CPUs and sched
> groups within the sched domain. Not sure if we want to go this way?

I guess you want the biggest tasks to be scheduled at the most-capable CPUs.

That's fair, and it may even improve single-threaded performance in
some cases I suppose, but then the cost for the "favored cores" PD
would be the same as for the "other P-cores" PD because there is no
difference between them other than the top-most turbo bin, so we'd
compare PDs with the same cost and that wouldn't be super-useful.

> * used by misfit handling - 22d5607400c6 ("sched/fair: Check if a task
> has a fitting CPU when updating misfit")
>
> > Accordingly, the perf domains are registered per CPU type (that is,
> > all P-cores belong to one perf domain and all E-cores belong to another
> > perf domain) and they are registered only if asymmetric CPU capacity is
> > enabled.  Each perf domain has a one-element states table and that
> > element only contains the relative cost value (the other fields in
> > it are not initialized, so they are all equal to zero), and the cost
> > value for the E-core perf domain is lower.
>
> [...]
>
> > +static int hybrid_pcore_cost(struct device *dev, unsigned long freq,
> > +                          unsigned long *cost)
> > +{
> > +     /*
> > +      * The number used here needs to be higher than the analogous
> > +      * one in hybrid_ecore_cost() below.  The units and the actual
> > +      * values don't matter.
> > +      */
> > +     *cost = 2;
> > +     return 0;
>
> So you're not tying this to HFI energy scores?

Not at this time.

> > +}
> > +
> > +static int hybrid_ecore_cost(struct device *dev, unsigned long freq,
> > +                          unsigned long *cost)
> > +{
> > +     *cost = 1;
> > +     return 0;
> > +}
> > +
> > +static struct hybrid_em_perf_domain perf_domains[HYBRID_NR_TYPES] = {
> > +     [HYBRID_PCORE] = { .cb.get_cost = hybrid_pcore_cost, },
> > +     [HYBRID_ECORE] = { .cb.get_cost = hybrid_ecore_cost, }
> > +};
> > +
> > +static bool hybrid_register_perf_domain(struct hybrid_em_perf_domain *pd)
> > +{
> > +     /*
> > +      * Registering EM perf domains without asymmetric CPU capacity
> > +      * support enabled is wasteful, so don't do that.
> > +      */
> > +     if (!hybrid_max_perf_cpu)
> > +             return false;
> > +
> > +     pd->dev = get_cpu_device(cpumask_first(&pd->cpumask));
> > +     if (!pd->dev)
> > +             return false;
> > +
> > +     if (em_dev_register_perf_domain(pd->dev, 1, &pd->cb, &pd->cpumask, false)) {
> > +             pd->dev = NULL;
> > +             return false;
> > +     }
> > +
> > +     return true;
> > +}
>
> What are the issues in case you would use the existing ways (non-stub)
> to setup the EM?
>
> static int intel_pstate_get_cpu_cost()
>
> static void intel_pstate_register_em(struct cpufreq_policy *policy)
>
>   struct em_data_callback em_cb = EM_ADV_DATA_CB(NULL,
>                                               intel_pstate_get_cpu_cost)
>
>   em_dev_register_perf_domain(get_cpu_device(policy->cpu), 1,
>                               &em_cb, policy->related_cpus, 1);
>                                       ^^^^^^^^^^^^^^^^^^^^*

I'm not sure what you are asking about here, but I'll try to answer.

No, I don't want to register a PD per policy with one CPU in it
because that would mean useless comparing PDs with the same cost and
CPU capacity.

> static void intel_pstate_set_register_em_fct(void)
>
>   default_driver->register_em = intel_pstate_register_em

No, I don't want to register PDs through cpufreq because it is too
early (and see above).

> static int __init intel_pstate_init(void)
>
>   ...
>   intel_pstate_set_register_em_fct()
>   ...
>
> I guess one issue is the per-CPU policy as an argument to
> em_dev_register_perf_domain() (*) ?

Yes.

> > +static void hybrid_register_all_perf_domains(void)
> > +{
> > +     enum hybrid_cpu_type type;
> > +
> > +     for (type = HYBRID_PCORE; type < HYBRID_NR_TYPES; type++)
> > +             hybrid_register_perf_domain(&perf_domains[type]);
> > +}
> > +
> > +static void hybrid_add_to_perf_domain(int cpu, enum hybrid_cpu_type type)
> > +{
> > +     struct hybrid_em_perf_domain *pd = &perf_domains[type];
> > +
> > +     guard(mutex)(&hybrid_capacity_lock);
> > +
> > +     if (cpumask_test_cpu(cpu, &pd->cpumask))
> > +             return;
> > +
> > +     cpumask_set_cpu(cpu, &pd->cpumask);
> > +     if (pd->dev)
> > +             em_dev_expand_perf_domain(pd->dev, cpu);
> > +     else if (hybrid_register_perf_domain(pd))
> > +             em_rebuild_perf_domains();
>
> I assume that the 'if' and the 'else if' condition here are only taken
> when the CPU is brought online after boot?

Yes.