[PATCH v9 1/8] xen/cpufreq: embed hwp into struct cpufreq_policy{}

Penny Zheng posted 8 patches 1 week, 2 days ago
[PATCH v9 1/8] xen/cpufreq: embed hwp into struct cpufreq_policy{}
Posted by Penny Zheng 1 week, 2 days ago
For cpus sharing one cpufreq domain, cpufreq_driver.init() is
only invoked on the firstcpu, so current per-CPU hwp driver data
struct hwp_drv_data{} actually fails to be allocated for cpus other than the
first one. There is no need to make it per-CPU.
We embed struct hwp_drv_data{} into struct cpufreq_policy{}, then cpus could
share the hwp driver data allocated for the firstcpu, like the way they share
struct cpufreq_policy{}. We also make it a union, with "hwp", and later
"amd-cppc" as a sub-struct.

Suggested-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Penny Zheng <Penny.Zheng@amd.com>
---
v8 -> v9:
- new commit
---
 xen/arch/x86/acpi/cpufreq/hwp.c    | 32 +++++++++++++-----------------
 xen/include/acpi/cpufreq/cpufreq.h |  6 ++++++
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/xen/arch/x86/acpi/cpufreq/hwp.c b/xen/arch/x86/acpi/cpufreq/hwp.c
index 240491c96a..5c98f3eb3e 100644
--- a/xen/arch/x86/acpi/cpufreq/hwp.c
+++ b/xen/arch/x86/acpi/cpufreq/hwp.c
@@ -67,7 +67,6 @@ struct hwp_drv_data
     uint8_t desired;
     uint8_t energy_perf;
 };
-static DEFINE_PER_CPU_READ_MOSTLY(struct hwp_drv_data *, hwp_drv_data);
 
 #define hwp_err(cpu, fmt, args...) \
     printk(XENLOG_ERR "HWP: CPU%u error: " fmt, cpu, ## args)
@@ -224,7 +223,7 @@ static bool __init hwp_available(void)
 
 static int cf_check hwp_cpufreq_verify(struct cpufreq_policy *policy)
 {
-    struct hwp_drv_data *data = per_cpu(hwp_drv_data, policy->cpu);
+    struct hwp_drv_data *data = policy->u.hwp;
 
     if ( !feature_hwp_activity_window && data->activity_window )
     {
@@ -239,7 +238,7 @@ static int cf_check hwp_cpufreq_verify(struct cpufreq_policy *policy)
 static void cf_check hwp_write_request(void *info)
 {
     const struct cpufreq_policy *policy = info;
-    struct hwp_drv_data *data = this_cpu(hwp_drv_data);
+    struct hwp_drv_data *data = policy->u.hwp;
     union hwp_request hwp_req = data->curr_req;
 
     data->ret = 0;
@@ -259,7 +258,7 @@ static int cf_check hwp_cpufreq_target(struct cpufreq_policy *policy,
                                        unsigned int relation)
 {
     unsigned int cpu = policy->cpu;
-    struct hwp_drv_data *data = per_cpu(hwp_drv_data, cpu);
+    struct hwp_drv_data *data = policy->u.hwp;
     /* Zero everything to ensure reserved bits are zero... */
     union hwp_request hwp_req = { .raw = 0 };
 
@@ -350,7 +349,7 @@ static void hwp_get_cpu_speeds(struct cpufreq_policy *policy)
 static void cf_check hwp_init_msrs(void *info)
 {
     struct cpufreq_policy *policy = info;
-    struct hwp_drv_data *data = this_cpu(hwp_drv_data);
+    struct hwp_drv_data *data = policy->u.hwp;
     uint64_t val;
 
     /*
@@ -426,15 +425,14 @@ static int cf_check hwp_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
     policy->governor = &cpufreq_gov_hwp;
 
-    per_cpu(hwp_drv_data, cpu) = data;
+    policy->u.hwp = data;
 
     on_selected_cpus(cpumask_of(cpu), hwp_init_msrs, policy, 1);
 
     if ( data->curr_req.raw == -1 )
     {
         hwp_err(cpu, "Could not initialize HWP properly\n");
-        per_cpu(hwp_drv_data, cpu) = NULL;
-        xfree(data);
+        XFREE(policy->u.hwp);
         return -ENODEV;
     }
 
@@ -462,10 +460,8 @@ static int cf_check hwp_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 static int cf_check hwp_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 {
-    struct hwp_drv_data *data = per_cpu(hwp_drv_data, policy->cpu);
-
-    per_cpu(hwp_drv_data, policy->cpu) = NULL;
-    xfree(data);
+    if ( policy->u.hwp )
+        XFREE(policy->u.hwp);
 
     return 0;
 }
@@ -480,7 +476,7 @@ static int cf_check hwp_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 static void cf_check hwp_set_misc_turbo(void *info)
 {
     const struct cpufreq_policy *policy = info;
-    struct hwp_drv_data *data = per_cpu(hwp_drv_data, policy->cpu);
+    struct hwp_drv_data *data = policy->u.hwp;
     uint64_t msr;
 
     data->ret = 0;
@@ -511,7 +507,7 @@ static int cf_check hwp_cpufreq_update(unsigned int cpu, struct cpufreq_policy *
 {
     on_selected_cpus(cpumask_of(cpu), hwp_set_misc_turbo, policy, 1);
 
-    return per_cpu(hwp_drv_data, cpu)->ret;
+    return policy->u.hwp->ret;
 }
 #endif /* CONFIG_PM_OP */
 
@@ -531,9 +527,10 @@ hwp_cpufreq_driver = {
 int get_hwp_para(unsigned int cpu,
                  struct xen_get_cppc_para *cppc_para)
 {
-    const struct hwp_drv_data *data = per_cpu(hwp_drv_data, cpu);
+    const struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_policy, cpu);
+    const struct hwp_drv_data *data;
 
-    if ( data == NULL )
+    if ( !policy || !(data = policy->u.hwp) )
         return -ENODATA;
 
     cppc_para->features         =
@@ -554,8 +551,7 @@ int get_hwp_para(unsigned int cpu,
 int set_hwp_para(struct cpufreq_policy *policy,
                  struct xen_set_cppc_para *set_cppc)
 {
-    unsigned int cpu = policy->cpu;
-    struct hwp_drv_data *data = per_cpu(hwp_drv_data, cpu);
+    struct hwp_drv_data *data = policy->u.hwp;
     bool cleared_act_window = false;
 
     if ( data == NULL )
diff --git a/xen/include/acpi/cpufreq/cpufreq.h b/xen/include/acpi/cpufreq/cpufreq.h
index 5d4881eea8..c0ecd690c5 100644
--- a/xen/include/acpi/cpufreq/cpufreq.h
+++ b/xen/include/acpi/cpufreq/cpufreq.h
@@ -62,6 +62,7 @@ struct perf_limits {
     uint32_t min_policy_pct;
 };
 
+struct hwp_drv_data;
 struct cpufreq_policy {
     cpumask_var_t       cpus;          /* affected CPUs */
     unsigned int        shared_type;   /* ANY or ALL affected CPUs
@@ -81,6 +82,11 @@ struct cpufreq_policy {
     int8_t              turbo;  /* tristate flag: 0 for unsupported
                                  * -1 for disable, 1 for enabled
                                  * See CPUFREQ_TURBO_* below for defines */
+    union {
+#ifdef CONFIG_INTEL
+        struct hwp_drv_data *hwp; /* Driver data for Intel HWP */
+#endif
+    } u;
 };
 DECLARE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_policy);
 
-- 
2.34.1
Re: [PATCH v9 1/8] xen/cpufreq: embed hwp into struct cpufreq_policy{}
Posted by Jan Beulich 1 week, 2 days ago
On 04.09.2025 08:35, Penny Zheng wrote:
> For cpus sharing one cpufreq domain, cpufreq_driver.init() is
> only invoked on the firstcpu, so current per-CPU hwp driver data
> struct hwp_drv_data{} actually fails to be allocated for cpus other than the
> first one. There is no need to make it per-CPU.
> We embed struct hwp_drv_data{} into struct cpufreq_policy{}, then cpus could
> share the hwp driver data allocated for the firstcpu, like the way they share
> struct cpufreq_policy{}. We also make it a union, with "hwp", and later
> "amd-cppc" as a sub-struct.

And ACPI, as per my patch (which then will need re-basing).

> Suggested-by: Jan Beulich <jbeulich@suse.com>

Not quite, this really is Reported-by: as it's a bug you fix, and in turn it
also wants to gain a Fixes: tag. This also will need backporting.

It would also have been nice if you had Cc-ed Jason right away, seeing that
this code was all written by him.

> @@ -259,7 +258,7 @@ static int cf_check hwp_cpufreq_target(struct cpufreq_policy *policy,
>                                         unsigned int relation)
>  {
>      unsigned int cpu = policy->cpu;
> -    struct hwp_drv_data *data = per_cpu(hwp_drv_data, cpu);
> +    struct hwp_drv_data *data = policy->u.hwp;
>      /* Zero everything to ensure reserved bits are zero... */
>      union hwp_request hwp_req = { .raw = 0 };

Further down in this same function we have

    on_selected_cpus(cpumask_of(cpu), hwp_write_request, policy, 1);

That's similarly problematic when the CPU denoted by policy->cpu isn't
online anymore. (It's not quite clear whether all related issues would
want fixing together, or in multiple patches.)

> @@ -350,7 +349,7 @@ static void hwp_get_cpu_speeds(struct cpufreq_policy *policy)
>  static void cf_check hwp_init_msrs(void *info)
>  {
>      struct cpufreq_policy *policy = info;
> -    struct hwp_drv_data *data = this_cpu(hwp_drv_data);
> +    struct hwp_drv_data *data = policy->u.hwp;
>      uint64_t val;
>  
>      /*
> @@ -426,15 +425,14 @@ static int cf_check hwp_cpufreq_cpu_init(struct cpufreq_policy *policy)
>  
>      policy->governor = &cpufreq_gov_hwp;
>  
> -    per_cpu(hwp_drv_data, cpu) = data;
> +    policy->u.hwp = data;
>  
>      on_selected_cpus(cpumask_of(cpu), hwp_init_msrs, policy, 1);

If multiple CPUs are in a domain, not all of them will make it here. By
implication the MSRs accessed by hwp_init_msrs() would need to have wider
than thread scope. The SDM, afaics, says nothing either way in this regard
in the Architectural MSRs section. Later model-specific tables have some
data.

Which gets me back to my original question: Is "sharing" actually possible
for HWP? Note further how there are both HWP_REQUEST and HWP_REQUEST_PKG
MSRs, for example. Which one is (to be) used looks to be controlled by
HWP_CTL.PKG_CTL_POLARITY.

> @@ -462,10 +460,8 @@ static int cf_check hwp_cpufreq_cpu_init(struct cpufreq_policy *policy)
>  
>  static int cf_check hwp_cpufreq_cpu_exit(struct cpufreq_policy *policy)
>  {
> -    struct hwp_drv_data *data = per_cpu(hwp_drv_data, policy->cpu);
> -
> -    per_cpu(hwp_drv_data, policy->cpu) = NULL;
> -    xfree(data);
> +    if ( policy->u.hwp )
> +        XFREE(policy->u.hwp);

No if() needed here.

> @@ -480,7 +476,7 @@ static int cf_check hwp_cpufreq_cpu_exit(struct cpufreq_policy *policy)
>  static void cf_check hwp_set_misc_turbo(void *info)
>  {
>      const struct cpufreq_policy *policy = info;
> -    struct hwp_drv_data *data = per_cpu(hwp_drv_data, policy->cpu);
> +    struct hwp_drv_data *data = policy->u.hwp;
>      uint64_t msr;
>  
>      data->ret = 0;
> @@ -511,7 +507,7 @@ static int cf_check hwp_cpufreq_update(unsigned int cpu, struct cpufreq_policy *
>  {
>      on_selected_cpus(cpumask_of(cpu), hwp_set_misc_turbo, policy, 1);
>  
> -    return per_cpu(hwp_drv_data, cpu)->ret;
> +    return policy->u.hwp->ret;
>  }
>  #endif /* CONFIG_PM_OP */

Same concern here wrt MSR scope. MISC_ENABLE.TURBO_DISENGAGE's scope is
package as per the few tables which have the bit explicitly explained;
whether that extends to all models is unclear.

> --- a/xen/include/acpi/cpufreq/cpufreq.h
> +++ b/xen/include/acpi/cpufreq/cpufreq.h
> @@ -62,6 +62,7 @@ struct perf_limits {
>      uint32_t min_policy_pct;
>  };
>  
> +struct hwp_drv_data;

This shouldn't be needed.

> @@ -81,6 +82,11 @@ struct cpufreq_policy {
>      int8_t              turbo;  /* tristate flag: 0 for unsupported
>                                   * -1 for disable, 1 for enabled
>                                   * See CPUFREQ_TURBO_* below for defines */
> +    union {
> +#ifdef CONFIG_INTEL
> +        struct hwp_drv_data *hwp; /* Driver data for Intel HWP */
> +#endif

While it may make for a smaller diff, ultimately I think we don't want
this to be a pointer, much like I've done in my patch for the ACPI driver.

> +    } u;

This wants to either not have a name at all, or be named e.g. drv_data.

Jan
Re: [PATCH v9 1/8] xen/cpufreq: embed hwp into struct cpufreq_policy{}
Posted by Jason Andryuk 1 week, 2 days ago
On 2025-09-04 07:50, Jan Beulich wrote:
> On 04.09.2025 08:35, Penny Zheng wrote:
>> For cpus sharing one cpufreq domain, cpufreq_driver.init() is
>> only invoked on the firstcpu, so current per-CPU hwp driver data
>> struct hwp_drv_data{} actually fails to be allocated for cpus other than the
>> first one.
 >> There is no need to make it per-CPU.>> We embed struct 
hwp_drv_data{} into struct cpufreq_policy{}, then cpus could
>> share the hwp driver data allocated for the firstcpu, like the way they share
>> struct cpufreq_policy{}. We also make it a union, with "hwp", and later
>> "amd-cppc" as a sub-struct.
> 
> And ACPI, as per my patch (which then will need re-basing).
> 
>> Suggested-by: Jan Beulich <jbeulich@suse.com>
> 
> Not quite, this really is Reported-by: as it's a bug you fix, and in turn it
> also wants to gain a Fixes: tag. This also will need backporting.
> 
> It would also have been nice if you had Cc-ed Jason right away, seeing that
> this code was all written by him.
> 
>> @@ -259,7 +258,7 @@ static int cf_check hwp_cpufreq_target(struct cpufreq_policy *policy,
>>                                          unsigned int relation)
>>   {
>>       unsigned int cpu = policy->cpu;
>> -    struct hwp_drv_data *data = per_cpu(hwp_drv_data, cpu);
>> +    struct hwp_drv_data *data = policy->u.hwp;
>>       /* Zero everything to ensure reserved bits are zero... */
>>       union hwp_request hwp_req = { .raw = 0 };
> 
> Further down in this same function we have
> 
>      on_selected_cpus(cpumask_of(cpu), hwp_write_request, policy, 1);
> 
> That's similarly problematic when the CPU denoted by policy->cpu isn't
> online anymore. (It's not quite clear whether all related issues would
> want fixing together, or in multiple patches.)
> 
>> @@ -350,7 +349,7 @@ static void hwp_get_cpu_speeds(struct cpufreq_policy *policy)
>>   static void cf_check hwp_init_msrs(void *info)
>>   {
>>       struct cpufreq_policy *policy = info;
>> -    struct hwp_drv_data *data = this_cpu(hwp_drv_data);
>> +    struct hwp_drv_data *data = policy->u.hwp;
>>       uint64_t val;
>>   
>>       /*
>> @@ -426,15 +425,14 @@ static int cf_check hwp_cpufreq_cpu_init(struct cpufreq_policy *policy)
>>   
>>       policy->governor = &cpufreq_gov_hwp;
>>   
>> -    per_cpu(hwp_drv_data, cpu) = data;
>> +    policy->u.hwp = data;
>>   
>>       on_selected_cpus(cpumask_of(cpu), hwp_init_msrs, policy, 1);
> 
> If multiple CPUs are in a domain, not all of them will make it here. By
> implication the MSRs accessed by hwp_init_msrs() would need to have wider
> than thread scope. The SDM, afaics, says nothing either way in this regard
> in the Architectural MSRs section. Later model-specific tables have some
> data.

When I wrote the HWP driver, I expected there to be per-cpu 
hwp_drv_data.  policy->cpu looked like the correct way to identify each 
CPU.  I was unaware of the idea of cpufreq_domains, and didn't intend 
there to be any sharing.

> Which gets me back to my original question: Is "sharing" actually possible
> for HWP? Note further how there are both HWP_REQUEST and HWP_REQUEST_PKG
> MSRs, for example. Which one is (to be) used looks to be controlled by
> HWP_CTL.PKG_CTL_POLARITY.

I was aware of the Package Level MSRs, but chose not to support them. 
Topology information didn't seem readily available to the driver, and 
using non-Package Level MSRs is needed for backwards compatibility anyway.

I don't have access to an HWP system, so I cannot check if processors 
share a domain.  I'd feel a little silly if I only ever wrote to CPU 0 :/

I have no proof, but I want to say that at some point I had debug 
statements and saw hwp_cpufreq_target() called for each CPU.

Maybe forcing hw_all=1 in cpufreq_add_cpu()/cpufreq_del_cpu() would 
ensure per-cpu policies?

Regards,
Jason
Re: [PATCH v9 1/8] xen/cpufreq: embed hwp into struct cpufreq_policy{}
Posted by Jan Beulich 5 days, 20 hours ago
On 04.09.2025 20:53, Jason Andryuk wrote:
> On 2025-09-04 07:50, Jan Beulich wrote:
>> On 04.09.2025 08:35, Penny Zheng wrote:
>>> @@ -259,7 +258,7 @@ static int cf_check hwp_cpufreq_target(struct cpufreq_policy *policy,
>>>                                          unsigned int relation)
>>>   {
>>>       unsigned int cpu = policy->cpu;
>>> -    struct hwp_drv_data *data = per_cpu(hwp_drv_data, cpu);
>>> +    struct hwp_drv_data *data = policy->u.hwp;
>>>       /* Zero everything to ensure reserved bits are zero... */
>>>       union hwp_request hwp_req = { .raw = 0 };
>>
>> Further down in this same function we have
>>
>>      on_selected_cpus(cpumask_of(cpu), hwp_write_request, policy, 1);
>>
>> That's similarly problematic when the CPU denoted by policy->cpu isn't
>> online anymore. (It's not quite clear whether all related issues would
>> want fixing together, or in multiple patches.)
>>
>>> @@ -350,7 +349,7 @@ static void hwp_get_cpu_speeds(struct cpufreq_policy *policy)
>>>   static void cf_check hwp_init_msrs(void *info)
>>>   {
>>>       struct cpufreq_policy *policy = info;
>>> -    struct hwp_drv_data *data = this_cpu(hwp_drv_data);
>>> +    struct hwp_drv_data *data = policy->u.hwp;
>>>       uint64_t val;
>>>   
>>>       /*
>>> @@ -426,15 +425,14 @@ static int cf_check hwp_cpufreq_cpu_init(struct cpufreq_policy *policy)
>>>   
>>>       policy->governor = &cpufreq_gov_hwp;
>>>   
>>> -    per_cpu(hwp_drv_data, cpu) = data;
>>> +    policy->u.hwp = data;
>>>   
>>>       on_selected_cpus(cpumask_of(cpu), hwp_init_msrs, policy, 1);
>>
>> If multiple CPUs are in a domain, not all of them will make it here. By
>> implication the MSRs accessed by hwp_init_msrs() would need to have wider
>> than thread scope. The SDM, afaics, says nothing either way in this regard
>> in the Architectural MSRs section. Later model-specific tables have some
>> data.
> 
> When I wrote the HWP driver, I expected there to be per-cpu 
> hwp_drv_data.  policy->cpu looked like the correct way to identify each 
> CPU.  I was unaware of the idea of cpufreq_domains, and didn't intend 
> there to be any sharing.
> 
>> Which gets me back to my original question: Is "sharing" actually possible
>> for HWP? Note further how there are both HWP_REQUEST and HWP_REQUEST_PKG
>> MSRs, for example. Which one is (to be) used looks to be controlled by
>> HWP_CTL.PKG_CTL_POLARITY.
> 
> I was aware of the Package Level MSRs, but chose not to support them. 
> Topology information didn't seem readily available to the driver, and 
> using non-Package Level MSRs is needed for backwards compatibility anyway.
> 
> I don't have access to an HWP system, so I cannot check if processors 
> share a domain.  I'd feel a little silly if I only ever wrote to CPU 0 :/
> 
> I have no proof, but I want to say that at some point I had debug 
> statements and saw hwp_cpufreq_target() called for each CPU.
> 
> Maybe forcing hw_all=1 in cpufreq_add_cpu()/cpufreq_del_cpu() would 
> ensure per-cpu policies?

No, domain info is handed to us from ACPI (_PSD). That's what
cpufreq_add_cpu() evaluates. Therefore if there was evidence that HWP (and
CPPC) can only ever have single-CPU domains, we could refuse such _PSD
being handed to us (ideally already in set_px_pminfo()). But I don't think
we can just go and override what we were told. I fear though that the mere
existence of a package-level (alternative) MSR suggests that such
configurations might be possible.

Jan