[PATCH 3/5] cpufreq/amd-cppc: move driver data into policy

Jan Beulich posted 5 patches 3 days, 22 hours ago
[PATCH 3/5] cpufreq/amd-cppc: move driver data into policy
Posted by Jan Beulich 3 days, 21 hours ago
Share space with the ACPI, powernow, and HWP drivers, avoiding a separate
allocation for each CPU.

This then also reduces the concern over amd_cppc_cpufreq_cpu_init() being
called for all CPUs, or a CPU going offline that's recorded in policy->cpu
(which would result in accesses of per-CPU data of offline CPUs).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
amd_cppc_cpufreq_target() (together with amd_cppc_write_request()) still
requires policy->cpu to be online, though.

--- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
+++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
@@ -31,81 +31,6 @@
 })
 
 /*
- * Field highest_perf, nominal_perf, lowest_nonlinear_perf, and lowest_perf
- * contain the values read from CPPC capability MSR. They represent the limits
- * of managed performance range as well as the dynamic capability, which may
- * change during processor operation
- * Field highest_perf represents highest performance, which is the absolute
- * maximum performance an individual processor may reach, assuming ideal
- * conditions. This performance level may not be sustainable for long
- * durations and may only be achievable if other platform components
- * are in a specific state; for example, it may require other processors be
- * in an idle state. This would be equivalent to the highest frequencies
- * supported by the processor.
- * Field nominal_perf represents maximum sustained performance level of the
- * processor, assuming ideal operating conditions. All cores/processors are
- * expected to be able to sustain their nominal performance state
- * simultaneously.
- * Field lowest_nonlinear_perf represents Lowest Nonlinear Performance, which
- * is the lowest performance level at which nonlinear power savings are
- * achieved. Above this threshold, lower performance levels should be
- * generally more energy efficient than higher performance levels. So in
- * traditional terms, this represents the P-state range of performance levels.
- * Field lowest_perf represents the absolute lowest performance level of the
- * platform. Selecting it may cause an efficiency penalty but should reduce
- * the instantaneous power consumption of the processor. So in traditional
- * terms, this represents the T-state range of performance levels.
- *
- * Field max_perf, min_perf, des_perf store the values for CPPC request MSR.
- * Software passes performance goals through these fields.
- * Field max_perf conveys the maximum performance level at which the platform
- * may run. And it may be set to any performance value in the range
- * [lowest_perf, highest_perf], inclusive.
- * Field min_perf conveys the minimum performance level at which the platform
- * may run. And it may be set to any performance value in the range
- * [lowest_perf, highest_perf], inclusive but must be less than or equal to
- * max_perf.
- * Field des_perf conveys performance level Xen governor is requesting. And it
- * may be set to any performance value in the range [min_perf, max_perf],
- * inclusive. In active mode, des_perf must be zero.
- * Field epp represents energy performance preference, which only has meaning
- * when active mode is enabled. The EPP is used in the CCLK DPM controller
- * to drive the frequency that a core is going to operate during short periods
- * of activity, called minimum active frequency, It could contatin a range of
- * values from 0 to 0xff. An EPP of zero sets the min active frequency to
- * maximum frequency, while an EPP of 0xff sets the min active frequency to
- * approxiately Idle frequency.
- */
-struct amd_cppc_drv_data
-{
-    const struct xen_processor_cppc *cppc_data;
-    union {
-        uint64_t raw;
-        struct {
-            unsigned int lowest_perf:8;
-            unsigned int lowest_nonlinear_perf:8;
-            unsigned int nominal_perf:8;
-            unsigned int highest_perf:8;
-            unsigned int :32;
-        };
-    } caps;
-    union {
-        uint64_t raw;
-        struct {
-            unsigned int max_perf:8;
-            unsigned int min_perf:8;
-            unsigned int des_perf:8;
-            unsigned int epp:8;
-            unsigned int :32;
-        };
-    } req;
-
-    int err;
-};
-
-static DEFINE_PER_CPU_READ_MOSTLY(struct amd_cppc_drv_data *,
-                                  amd_cppc_drv_data);
-/*
  * Core max frequency read from PstateDef as anchor point
  * for freq-to-perf transition
  */
@@ -279,11 +204,11 @@ static void cf_check amd_cppc_write_requ
     wrmsrl(MSR_AMD_CPPC_REQ, data->req.raw);
 }
 
-static void amd_cppc_write_request(unsigned int cpu, uint8_t min_perf,
-                                   uint8_t des_perf, uint8_t max_perf,
-                                   uint8_t epp)
+static void amd_cppc_write_request(struct cpufreq_policy *policy,
+                                   uint8_t min_perf, uint8_t des_perf,
+                                   uint8_t max_perf, uint8_t epp)
 {
-    struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
+    struct amd_cppc_drv_data *data = &policy->drv_data.amd_cppc;
     uint64_t prev = data->req.raw;
 
     data->req.min_perf = min_perf;
@@ -295,15 +220,15 @@ static void amd_cppc_write_request(unsig
     if ( prev == data->req.raw )
         return;
 
-    on_selected_cpus(cpumask_of(cpu), amd_cppc_write_request_msrs, data, 1);
+    on_selected_cpus(cpumask_of(policy->cpu), amd_cppc_write_request_msrs,
+                     data, 1);
 }
 
 static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
                                             unsigned int target_freq,
                                             unsigned int relation)
 {
-    unsigned int cpu = policy->cpu;
-    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
+    const struct amd_cppc_drv_data *data = &policy->drv_data.amd_cppc;
     uint8_t des_perf;
     int res;
 
@@ -320,7 +245,7 @@ static int cf_check amd_cppc_cpufreq_tar
      * may actually cause an efficiency penalty, So when deciding the min_perf
      * value, we prefer lowest nonlinear performance over lowest performance.
      */
-    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
+    amd_cppc_write_request(policy, data->caps.lowest_nonlinear_perf,
                            des_perf, data->caps.highest_perf,
                            /* Pre-defined BIOS value for passive mode */
                            per_cpu(epp_init, policy->cpu));
@@ -330,7 +255,7 @@ static int cf_check amd_cppc_cpufreq_tar
 static void cf_check amd_cppc_init_msrs(void *info)
 {
     struct cpufreq_policy *policy = info;
-    struct amd_cppc_drv_data *data = this_cpu(amd_cppc_drv_data);
+    struct amd_cppc_drv_data *data = &policy->drv_data.amd_cppc;
     uint64_t val;
     unsigned int min_freq = 0, nominal_freq = 0, max_freq;
 
@@ -431,24 +356,16 @@ static void amd_cppc_boost_init(struct c
 
 static int cf_check amd_cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 {
-    XVFREE(per_cpu(amd_cppc_drv_data, policy->cpu));
-
     return 0;
 }
 
 static int amd_cppc_cpufreq_init_perf(struct cpufreq_policy *policy)
 {
     unsigned int cpu = policy->cpu;
-    struct amd_cppc_drv_data *data;
-
-    data = xvzalloc(struct amd_cppc_drv_data);
-    if ( !data )
-        return -ENOMEM;
+    struct amd_cppc_drv_data *data = &policy->drv_data.amd_cppc;
 
     data->cppc_data = &processor_pminfo[cpu]->cppc_data;
 
-    per_cpu(amd_cppc_drv_data, cpu) = data;
-
     on_selected_cpus(cpumask_of(cpu), amd_cppc_init_msrs, policy, 1);
 
     /*
@@ -506,8 +423,7 @@ static void amd_cppc_prepare_policy(stru
                                     uint8_t *max_perf, uint8_t *min_perf,
                                     uint8_t *epp)
 {
-    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data,
-                                                   policy->cpu);
+    const struct amd_cppc_drv_data *data = &policy->drv_data.amd_cppc;
 
     /*
      * On default, set min_perf with lowest_nonlinear_perf, and max_perf
@@ -560,7 +476,7 @@ static int cf_check amd_cppc_epp_set_pol
 
     amd_cppc_prepare_policy(policy, &max_perf, &min_perf, &epp);
 
-    amd_cppc_write_request(policy->cpu, min_perf,
+    amd_cppc_write_request(policy, min_perf,
                            0 /* no des_perf in active mode */,
                            max_perf, epp);
     return 0;
--- a/xen/include/acpi/cpufreq/cpufreq.h
+++ b/xen/include/acpi/cpufreq/cpufreq.h
@@ -74,6 +74,78 @@ struct hwp_drv_data {
     uint8_t energy_perf;
 };
 
+/*
+ * Field highest_perf, nominal_perf, lowest_nonlinear_perf, and lowest_perf
+ * contain the values read from CPPC capability MSR. They represent the limits
+ * of managed performance range as well as the dynamic capability, which may
+ * change during processor operation.
+ * Field highest_perf represents highest performance, which is the absolute
+ * maximum performance an individual processor may reach, assuming ideal
+ * conditions. This performance level may not be sustainable for long
+ * durations and may only be achievable if other platform components
+ * are in a specific state; for example, it may require other processors be
+ * in an idle state. This would be equivalent to the highest frequencies
+ * supported by the processor.
+ * Field nominal_perf represents maximum sustained performance level of the
+ * processor, assuming ideal operating conditions. All cores/processors are
+ * expected to be able to sustain their nominal performance state
+ * simultaneously.
+ * Field lowest_nonlinear_perf represents Lowest Nonlinear Performance, which
+ * is the lowest performance level at which nonlinear power savings are
+ * achieved. Above this threshold, lower performance levels should be
+ * generally more energy efficient than higher performance levels. So in
+ * traditional terms, this represents the P-state range of performance levels.
+ * Field lowest_perf represents the absolute lowest performance level of the
+ * platform. Selecting it may cause an efficiency penalty but should reduce
+ * the instantaneous power consumption of the processor. So in traditional
+ * terms, this represents the T-state range of performance levels.
+ *
+ * Field max_perf, min_perf, des_perf store the values for CPPC request MSR.
+ * Software passes performance goals through these fields.
+ * Field max_perf conveys the maximum performance level at which the platform
+ * may run. And it may be set to any performance value in the range
+ * [lowest_perf, highest_perf], inclusive.
+ * Field min_perf conveys the minimum performance level at which the platform
+ * may run. And it may be set to any performance value in the range
+ * [lowest_perf, highest_perf], inclusive but must be less than or equal to
+ * max_perf.
+ * Field des_perf conveys performance level Xen governor is requesting. And it
+ * may be set to any performance value in the range [min_perf, max_perf],
+ * inclusive. In active mode, des_perf must be zero.
+ * Field epp represents energy performance preference, which only has meaning
+ * when active mode is enabled. The EPP is used in the CCLK DPM controller
+ * to drive the frequency that a core is going to operate during short periods
+ * of activity, called minimum active frequency. It could contain a range of
+ * values from 0 to 0xff. An EPP of zero sets the min active frequency to
+ * maximum frequency, while an EPP of 0xff sets the min active frequency to
+ * approximately idle frequency.
+ */
+struct amd_cppc_drv_data {
+    const struct xen_processor_cppc *cppc_data;
+    union {
+        uint64_t raw;
+        struct {
+            unsigned int lowest_perf:8;
+            unsigned int lowest_nonlinear_perf:8;
+            unsigned int nominal_perf:8;
+            unsigned int highest_perf:8;
+            unsigned int :32;
+        };
+    } caps;
+    union {
+        uint64_t raw;
+        struct {
+            unsigned int max_perf:8;
+            unsigned int min_perf:8;
+            unsigned int des_perf:8;
+            unsigned int epp:8;
+            unsigned int :32;
+        };
+    } req;
+
+    int err;
+};
+
 struct cpufreq_cpuinfo {
     unsigned int        max_freq;
     unsigned int        second_max_freq;    /* P1 if Turbo Mode is on */
@@ -120,6 +192,7 @@ struct cpufreq_policy {
     union {
         struct acpi_cpufreq_data acpi;
         struct hwp_drv_data hwp;
+        struct amd_cppc_drv_data amd_cppc;
     }                   drv_data;
 };
 DECLARE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_policy);
RE: [PATCH 3/5] cpufreq/amd-cppc: move driver data into policy
Posted by Penny, Zheng 25 minutes ago
[Public]

> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Thursday, January 22, 2026 5:43 PM
> To: xen-devel@lists.xenproject.org
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>; Roger Pau Monné
> <roger.pau@citrix.com>; Penny, Zheng <penny.zheng@amd.com>
> Subject: [PATCH 3/5] cpufreq/amd-cppc: move driver data into policy
>
> Share space with the ACPI, powernow, and HWP drivers, avoiding a separate
> allocation for each CPU.
>
> This then also reduces the concern over amd_cppc_cpufreq_cpu_init() being called
> for all CPUs, or a CPU going offline that's recorded in policy->cpu (which would
> result in accesses of per-CPU data of offline CPUs).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Penny Zheng <penny.zheng@amd.com>

Many thanks,
Penny Zheng