amd-cppc is the AMD CPU performance scaling driver that introduces a
new CPU frequency control mechanism, first available on AMD Zen based
CPU series. The new mechanism is based on Collaborative Processor
Performance Control (CPPC), which provides finer-grained frequency
management than legacy ACPI hardware P-states.
Current AMD CPU platforms use the ACPI P-states driver to manage CPU
frequency and clocks, switching among only 3 P-states.
The new amd-cppc driver provides a more flexible, low-latency interface
for Xen to communicate performance hints directly to hardware.
This first version of "amd-cppc" leverages common governors such as
*ondemand*, *performance*, etc. to manage the performance hints. In the
future, we will introduce an advanced active mode to enable autonomous
performance level selection.
Signed-off-by: Penny Zheng <Penny.Zheng@amd.com>
---
v1 -> v2:
- re-construct union caps and req to have anonymous struct instead
- avoid "else" when the earlier if() ends in an unconditional control flow statement
- Add check to avoid chopping off set bits from cast
- make pointers pointer-to-const wherever possible
- remove noisy log
- exclude families before 0x17 before CPPC-feature MSR op
- remove useless variable helpers
- use xvzalloc and XVFREE
- refactor error handling as ENABLE bit can only be cleared by reset
---
v2 -> v3:
- Move all MSR definitions to msr-index.h and follow the required style
- Refactor opening curly braces for struct/union
- Sort overlong lines throughout the series
- Make offset/res int covering underflow scenario
- Error out when amd_max_freq_mhz isn't set
- Introduce amd_get_freq(name) macro to decrease redundancy
- Supported CPU family checked ahead of smp-function
- Nominal freq shall be checked between the [min, max]
- Use APERF/MPERF to calculate current frequency
- Use amd_cppc_cpufreq_cpu_exit() to tidy error path
---
xen/arch/x86/acpi/cpufreq/amd-cppc.c | 370 +++++++++++++++++++++++++++
xen/arch/x86/include/asm/msr-index.h | 5 +
2 files changed, 375 insertions(+)
diff --git a/xen/arch/x86/acpi/cpufreq/amd-cppc.c b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
index 7d482140a2..bf30990c74 100644
--- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
+++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
@@ -14,7 +14,50 @@
#include <xen/domain.h>
#include <xen/init.h>
#include <xen/param.h>
+#include <xen/percpu.h>
+#include <xen/xvmalloc.h>
#include <acpi/cpufreq/cpufreq.h>
+#include <asm/amd.h>
+#include <asm/msr-index.h>
+
+#define amd_cppc_err(cpu, fmt, args...) \
+ printk(XENLOG_ERR "AMD_CPPC: CPU%u error: " fmt, cpu, ## args)
+#define amd_cppc_warn(cpu, fmt, args...) \
+ printk(XENLOG_WARNING "AMD_CPPC: CPU%u warning: " fmt, cpu, ## args)
+#define amd_cppc_verbose(fmt, args...) \
+({ \
+ if ( cpufreq_verbose ) \
+ printk(XENLOG_DEBUG "AMD_CPPC: " fmt, ## args); \
+})
+
+struct amd_cppc_drv_data
+{
+ const struct xen_processor_cppc *cppc_data;
+ union {
+ uint64_t raw;
+ struct {
+ unsigned int lowest_perf:8;
+ unsigned int lowest_nonlinear_perf:8;
+ unsigned int nominal_perf:8;
+ unsigned int highest_perf:8;
+ unsigned int :32;
+ };
+ } caps;
+ union {
+ uint64_t raw;
+ struct {
+ unsigned int max_perf:8;
+ unsigned int min_perf:8;
+ unsigned int des_perf:8;
+ unsigned int epp:8;
+ unsigned int :32;
+ };
+ } req;
+
+ int err;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct amd_cppc_drv_data *, amd_cppc_drv_data);
static bool __init amd_cppc_handle_option(const char *s, const char *end)
{
@@ -51,10 +94,337 @@ int __init amd_cppc_cmdline_parse(const char *s, const char *e)
return 0;
}
+/*
+ * If CPPC lowest_freq and nominal_freq registers are exposed then we can
+ * use them to convert perf to freq and vice versa. The conversion is
+ * extrapolated as a linear function passing through the 2 points:
+ * - (Low perf, Low freq)
+ * - (Nominal perf, Nominal freq)
+ */
+static int amd_cppc_khz_to_perf(const struct amd_cppc_drv_data *data,
+ unsigned int freq, uint8_t *perf)
+{
+ const struct xen_processor_cppc *cppc_data = data->cppc_data;
+ uint64_t mul, div;
+ int offset = 0, res;
+
+ if ( freq == (cppc_data->nominal_mhz * 1000) )
+ {
+ *perf = data->caps.nominal_perf;
+ return 0;
+ }
+
+ if ( freq == (cppc_data->lowest_mhz * 1000) )
+ {
+ *perf = data->caps.lowest_perf;
+ return 0;
+ }
+
+ if ( cppc_data->lowest_mhz && cppc_data->nominal_mhz )
+ {
+ mul = data->caps.nominal_perf - data->caps.lowest_perf;
+ div = cppc_data->nominal_mhz - cppc_data->lowest_mhz;
+ /*
+ * We don't need to convert to KHz for computing offset and can
+ * directly use nominal_mhz and lowest_mhz as the division
+ * will remove the frequency unit.
+ */
+ div = div ?: 1;
+ offset = data->caps.nominal_perf -
+ (mul * cppc_data->nominal_mhz) / div;
+ }
+ else
+ {
+ /* Read Processor Max Speed(mhz) as anchor point */
+ mul = data->caps.highest_perf;
+ div = this_cpu(amd_max_freq_mhz);
+ if ( !div )
+ return -EINVAL;
+ }
+
+ res = offset + (mul * freq) / (div * 1000);
+ if ( res > UINT8_MAX )
+ {
+ printk_once(XENLOG_WARNING
+ "Perf value exceeds maximum value 255: %d\n", res);
+ *perf = 0xff;
+ return 0;
+ }
+ *perf = (uint8_t)res;
+
+ return 0;
+}
+
+#define amd_get_freq(name) \
+ static int amd_get_##name##_freq(const struct amd_cppc_drv_data *data, \
+ unsigned int *freq) \
+ { \
+ const struct xen_processor_cppc *cppc_data = data->cppc_data; \
+ uint64_t mul, div, res; \
+ \
+ if ( cppc_data->name##_mhz ) \
+ { \
+ /* Switch to khz */ \
+ *freq = cppc_data->name##_mhz * 1000; \
+ return 0; \
+ } \
+ \
+ /* Read Processor Max Speed(mhz) as anchor point */ \
+ mul = this_cpu(amd_max_freq_mhz); \
+ if ( !mul ) \
+ return -EINVAL; \
+ div = data->caps.highest_perf; \
+ res = (mul * data->caps.name##_perf * 1000) / div; \
+ if ( res > UINT_MAX ) \
+ { \
+ printk(XENLOG_ERR \
+ "Frequeny exceeds maximum value UINT_MAX: %lu\n", res); \
+ return -EINVAL; \
+ } \
+ *freq = (unsigned int)res; \
+ \
+ return 0; \
+ } \
+
+amd_get_freq(lowest);
+amd_get_freq(nominal);
+
+static int amd_get_max_freq(const struct amd_cppc_drv_data *data,
+ unsigned int *max_freq)
+{
+ unsigned int nom_freq, boost_ratio;
+ int res;
+
+ res = amd_get_nominal_freq(data, &nom_freq);
+ if ( res )
+ return res;
+
+ boost_ratio = (unsigned int)(data->caps.highest_perf /
+ data->caps.nominal_perf);
+ *max_freq = nom_freq * boost_ratio;
+
+ return 0;
+}
+
+static int cf_check amd_cppc_cpufreq_verify(struct cpufreq_policy *policy)
+{
+ cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+ policy->cpuinfo.max_freq);
+
+ return 0;
+}
+
+static void amd_cppc_write_request_msrs(void *info)
+{
+ struct amd_cppc_drv_data *data = info;
+
+ if ( wrmsr_safe(MSR_AMD_CPPC_REQ, data->req.raw) )
+ {
+ data->err = -EINVAL;
+ return;
+ }
+}
+
+static int cf_check amd_cppc_write_request(unsigned int cpu, uint8_t min_perf,
+ uint8_t des_perf, uint8_t max_perf)
+{
+ struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
+ uint64_t prev = data->req.raw;
+
+ data->req.min_perf = min_perf;
+ data->req.max_perf = max_perf;
+ data->req.des_perf = des_perf;
+
+ if ( prev == data->req.raw )
+ return 0;
+
+ data->err = 0;
+ on_selected_cpus(cpumask_of(cpu), amd_cppc_write_request_msrs, data, 1);
+
+ return data->err;
+}
+
+static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
+ unsigned int target_freq,
+ unsigned int relation)
+{
+ unsigned int cpu = policy->cpu;
+ const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
+ uint8_t des_perf;
+ int res;
+
+ if ( unlikely(!target_freq) )
+ return 0;
+
+ res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
+ if ( res )
+ return res;
+
+ return amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
+ des_perf, data->caps.highest_perf);
+}
+
+static void cf_check amd_cppc_init_msrs(void *info)
+{
+ struct cpufreq_policy *policy = info;
+ struct amd_cppc_drv_data *data = this_cpu(amd_cppc_drv_data);
+ uint64_t val;
+ unsigned int min_freq, nominal_freq, max_freq;
+
+ /* Package level MSR */
+ if ( rdmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
+ {
+ amd_cppc_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_ENABLE)\n");
+ goto err;
+ }
+
+ /*
+ * Only when Enable bit is on, the hardware will calculate the processor’s
+ * performance capabilities and initialize the performance level fields in
+ * the CPPC capability registers.
+ */
+ if ( !(val & AMD_CPPC_ENABLE) )
+ {
+ val |= AMD_CPPC_ENABLE;
+ if ( wrmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
+ {
+ amd_cppc_err(policy->cpu,
+ "wrmsr_safe(MSR_AMD_CPPC_ENABLE, %lx)\n", val);
+ goto err;
+ }
+ }
+
+ if ( rdmsr_safe(MSR_AMD_CPPC_CAP1, data->caps.raw) )
+ {
+ amd_cppc_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_CAP1)\n");
+ goto err;
+ }
+
+ if ( data->caps.highest_perf == 0 || data->caps.lowest_perf == 0 ||
+ data->caps.nominal_perf == 0 || data->caps.lowest_nonlinear_perf == 0 )
+ {
+ amd_cppc_err(policy->cpu,
+ "Platform malfunction, read CPPC highest_perf: %u, lowest_perf: %u, nominal_perf: %u, lowest_nonlinear_perf: %u zero value\n",
+ data->caps.highest_perf, data->caps.lowest_perf,
+ data->caps.nominal_perf, data->caps.lowest_nonlinear_perf);
+ goto err;
+ }
+
+ data->err = amd_get_lowest_freq(data, &min_freq);
+ if ( data->err )
+ return;
+
+ data->err = amd_get_nominal_freq(data, &nominal_freq);
+ if ( data->err )
+ return;
+
+ data->err = amd_get_max_freq(data, &max_freq);
+ if ( data->err )
+ return;
+
+ if ( min_freq > max_freq || nominal_freq > max_freq ||
+ nominal_freq < min_freq )
+ {
+ amd_cppc_err(policy->cpu,
+ "min_freq(%u), or max_freq(%u), or nominal_freq(%u) value is incorrect\n",
+ min_freq, max_freq, nominal_freq);
+ goto err;
+ }
+
+ policy->min = min_freq;
+ policy->max = max_freq;
+
+ policy->cpuinfo.min_freq = min_freq;
+ policy->cpuinfo.max_freq = max_freq;
+ policy->cpuinfo.perf_freq = nominal_freq;
+ /*
+ * Set after policy->cpuinfo.perf_freq, as we are taking
+ * APERF/MPERF average frequency as current frequency.
+ */
+ policy->cur = cpufreq_driver_getavg(policy->cpu, GOV_GETAVG);
+
+ return;
+
+ err:
+ data->err = -EINVAL;
+}
+
+/*
+ * The new AMD CPPC driver is different than legacy ACPI hardware P-State,
+ * which has a finer grain frequency range between the highest and lowest
+ * frequency. And boost frequency is actually the frequency which is mapped on
+ * highest performance ratio. The legacy P0 frequency is actually mapped on
+ * nominal performance ratio.
+ */
+static void amd_cppc_boost_init(struct cpufreq_policy *policy,
+ const struct amd_cppc_drv_data *data)
+{
+ if ( data->caps.highest_perf <= data->caps.nominal_perf )
+ return;
+
+ policy->turbo = CPUFREQ_TURBO_ENABLED;
+}
+
+static int cf_check amd_cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+ XVFREE(per_cpu(amd_cppc_drv_data, policy->cpu));
+
+ return 0;
+}
+
+static int cf_check amd_cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+ unsigned int cpu = policy->cpu;
+ struct amd_cppc_drv_data *data;
+ const struct cpuinfo_x86 *c = cpu_data + cpu;
+
+ data = xvzalloc(struct amd_cppc_drv_data);
+ if ( !data )
+ return -ENOMEM;
+
+ data->cppc_data = &processor_pminfo[cpu]->cppc_data;
+
+ per_cpu(amd_cppc_drv_data, cpu) = data;
+
+ /* Feature CPPC is firstly introduced on Zen2 */
+ if ( c->x86 < 0x17 )
+ {
+ printk_once("Unsupported cpu family: %x\n", c->x86);
+ return -EOPNOTSUPP;
+ }
+
+ on_selected_cpus(cpumask_of(cpu), amd_cppc_init_msrs, policy, 1);
+
+ /*
+ * If error path takes effective, not only amd-cppc cpufreq driver fails
+ * to initialize, but also we could not fall back to legacy P-states
+ * driver nevertheless we specifies fall back option in cmdline.
+ */
+ if ( data->err )
+ {
+ amd_cppc_err(cpu, "Could not initialize AMD CPPC MSR properly\n");
+ amd_cppc_cpufreq_cpu_exit(policy);
+ return -ENODEV;
+ }
+
+ policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
+
+ amd_cppc_boost_init(policy, data);
+
+ amd_cppc_verbose("CPU %u initialized with amd-cppc passive mode\n",
+ policy->cpu);
+
+ return 0;
+}
+
static const struct cpufreq_driver __initconst_cf_clobber
amd_cppc_cpufreq_driver =
{
.name = XEN_AMD_CPPC_DRIVER_NAME,
+ .verify = amd_cppc_cpufreq_verify,
+ .target = amd_cppc_cpufreq_target,
+ .init = amd_cppc_cpufreq_cpu_init,
+ .exit = amd_cppc_cpufreq_cpu_exit,
};
int __init amd_cppc_register_driver(void)
diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h
index 22d9e76e55..985f33eca1 100644
--- a/xen/arch/x86/include/asm/msr-index.h
+++ b/xen/arch/x86/include/asm/msr-index.h
@@ -238,6 +238,11 @@
#define MSR_AMD_CSTATE_CFG 0xc0010296U
+#define MSR_AMD_CPPC_CAP1 0xc00102b0
+#define MSR_AMD_CPPC_ENABLE 0xc00102b1
+#define AMD_CPPC_ENABLE (_AC(1, ULL) << 0)
+#define MSR_AMD_CPPC_REQ 0xc00102b3
+
/*
* Legacy MSR constants in need of cleanup. No new MSRs below this comment.
*/
--
2.34.1
On 06.03.2025 09:39, Penny Zheng wrote:
> v2 -> v3:
> - Move all MSR definitions to msr-index.h and follow the required style
> - Refactor opening curly braces for struct/union
> - Sort overlong lines throughout the series
> - Make offset/res int covering underflow scenario
> - Error out when amd_max_freq_mhz isn't set
Given the issue with the patch filling amd_max_freq_mhz I wonder how you
successfully tested this patch here.
> - Introduce amd_get_freq(name) macro to decrease redundancy
Hmm, that's not quite what I was hoping for. I'll comment there in more
detail.
> --- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> +++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> @@ -14,7 +14,50 @@
> #include <xen/domain.h>
> #include <xen/init.h>
> #include <xen/param.h>
> +#include <xen/percpu.h>
> +#include <xen/xvmalloc.h>
> #include <acpi/cpufreq/cpufreq.h>
> +#include <asm/amd.h>
> +#include <asm/msr-index.h>
> +
> +#define amd_cppc_err(cpu, fmt, args...) \
> + printk(XENLOG_ERR "AMD_CPPC: CPU%u error: " fmt, cpu, ## args)
> +#define amd_cppc_warn(cpu, fmt, args...) \
> + printk(XENLOG_WARNING "AMD_CPPC: CPU%u warning: " fmt, cpu, ## args)
> +#define amd_cppc_verbose(fmt, args...) \
> +({ \
> + if ( cpufreq_verbose ) \
> + printk(XENLOG_DEBUG "AMD_CPPC: " fmt, ## args); \
> +})
Why would warning and error come with a CPU number at all times, but not
the verbose construct?
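If the verbose variant should match, the obvious shape would be along these
lines (an untested sketch; it assumes a suitable cpu is in scope at every
call site):
```
#define amd_cppc_verbose(cpu, fmt, args...)                          \
({                                                                   \
    if ( cpufreq_verbose )                                           \
        printk(XENLOG_DEBUG "AMD_CPPC: CPU%u: " fmt, cpu, ## args);  \
})
```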
> +struct amd_cppc_drv_data
> +{
> + const struct xen_processor_cppc *cppc_data;
> + union {
> + uint64_t raw;
> + struct {
> + unsigned int lowest_perf:8;
> + unsigned int lowest_nonlinear_perf:8;
> + unsigned int nominal_perf:8;
> + unsigned int highest_perf:8;
> + unsigned int :32;
> + };
> + } caps;
> + union {
> + uint64_t raw;
> + struct {
> + unsigned int max_perf:8;
> + unsigned int min_perf:8;
> + unsigned int des_perf:8;
> + unsigned int epp:8;
> + unsigned int :32;
> + };
> + } req;
> +
> + int err;
> +};
> +
> +static DEFINE_PER_CPU_READ_MOSTLY(struct amd_cppc_drv_data *, amd_cppc_drv_data);
Nit: Line length. I wonder what "Sort overlong lines throughout the series"
is meant to say in the revision log.
> @@ -51,10 +94,337 @@ int __init amd_cppc_cmdline_parse(const char *s, const char *e)
> return 0;
> }
>
> +/*
> + * If CPPC lowest_freq and nominal_freq registers are exposed then we can
> + * use them to convert perf to freq and vice versa. The conversion is
> + * extrapolated as a linear function passing through the 2 points:
> + * - (Low perf, Low freq)
> + * - (Nominal perf, Nominal freq)
> + */
> +static int amd_cppc_khz_to_perf(const struct amd_cppc_drv_data *data,
> + unsigned int freq, uint8_t *perf)
> +{
> + const struct xen_processor_cppc *cppc_data = data->cppc_data;
> + uint64_t mul, div;
> + int offset = 0, res;
> +
> + if ( freq == (cppc_data->nominal_mhz * 1000) )
> + {
> + *perf = data->caps.nominal_perf;
> + return 0;
> + }
> +
> + if ( freq == (cppc_data->lowest_mhz * 1000) )
> + {
> + *perf = data->caps.lowest_perf;
> + return 0;
> + }
> +
> + if ( cppc_data->lowest_mhz && cppc_data->nominal_mhz )
> + {
> + mul = data->caps.nominal_perf - data->caps.lowest_perf;
> + div = cppc_data->nominal_mhz - cppc_data->lowest_mhz;
> + /*
> + * We don't need to convert to KHz for computing offset and can
Nit: kHz (i.e. unlike MHz)
> + * directly use nominal_mhz and lowest_mhz as the division
> + * will remove the frequency unit.
> + */
> + div = div ?: 1;
Imo the cppc_data->lowest_mhz >= cppc_data->nominal_mhz case better
wouldn't make it here, but use the fallback path below. Or special-
case cppc_data->lowest_mhz == cppc_data->nominal_mhz: mul would
(hopefully) be zero (i.e. there would be the expectation that
data->caps.nominal_perf == data->caps.lowest_perf, yet no guarantee
without checking), and hence ...
> + offset = data->caps.nominal_perf -
> + (mul * cppc_data->nominal_mhz) / div;
... offset = data->caps.nominal_perf regardless of "div" (as long
as that's not zero). I.e. the "equal" case may still be fine to take
this path.
Or is there a check somewhere that lowest_mhz <= nominal_mhz and
lowest_perf <= nominal_perf, which I'm simply overlooking?
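For illustration, here is the arithmetic with made-up numbers (values are
hypothetical, chosen only to show the mapping):
```
/*
 * Assume lowest_perf = 40, nominal_perf = 160,
 *        lowest_mhz = 1000, nominal_mhz = 4000.
 *
 * mul    = 160 - 40    = 120
 * div    = 4000 - 1000 = 3000
 * offset = 160 - (120 * 4000) / 3000 = 160 - 160 = 0
 *
 * For target_freq = 2000000 (kHz):
 *   perf = 0 + (120 * 2000000) / (3000 * 1000) = 80
 *
 * If lowest_mhz > nominal_mhz (or lowest_perf > nominal_perf), the
 * unsigned subtractions wrap to huge values and both offset and the
 * resulting perf are garbage - hence the ordering check asked for above.
 */
```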
> + }
> + else
> + {
> + /* Read Processor Max Speed(mhz) as anchor point */
> + mul = data->caps.highest_perf;
> + div = this_cpu(amd_max_freq_mhz);
> + if ( !div )
> + return -EINVAL;
> + }
> +
> + res = offset + (mul * freq) / (div * 1000);
> + if ( res > UINT8_MAX )
I can't quite convince myself that res can't end up negative here, in
which case ...
> + {
> + printk_once(XENLOG_WARNING
> + "Perf value exceeds maximum value 255: %d\n", res);
> + *perf = 0xff;
> + return 0;
> + }
> + *perf = (uint8_t)res;
... a bogus value would be stored here.
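One possible shape for the missing clamping (a sketch only; whether 1 or
data->caps.lowest_perf is the right lower bound is an open question):
```
    res = offset + (mul * freq) / (div * 1000);
    if ( res < 1 )
    {
        printk_once(XENLOG_WARNING "Perf value underflows: %d\n", res);
        res = 1;                              /* assumed lower bound */
    }
    else if ( res > UINT8_MAX )
    {
        printk_once(XENLOG_WARNING
                    "Perf value exceeds maximum value 255: %d\n", res);
        res = UINT8_MAX;
    }
    *perf = res;

    return 0;
```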
> + return 0;
> +}
> +
> +#define amd_get_freq(name) \
The macro parameter is used just ...
> + static int amd_get_##name##_freq(const struct amd_cppc_drv_data *data, \
... here, ...
> + unsigned int *freq) \
> + { \
> + const struct xen_processor_cppc *cppc_data = data->cppc_data; \
> + uint64_t mul, div, res; \
> + \
> + if ( cppc_data->name##_mhz ) \
> + { \
> + /* Switch to khz */ \
> + *freq = cppc_data->name##_mhz * 1000; \
... twice here for the MHz value, and ...
> + return 0; \
> + } \
> + \
> + /* Read Processor Max Speed(mhz) as anchor point */ \
> + mul = this_cpu(amd_max_freq_mhz); \
> + if ( !mul ) \
> + return -EINVAL; \
> + div = data->caps.highest_perf; \
> + res = (mul * data->caps.name##_perf * 1000) / div; \
... here for the respective perf indicator. Why does it take ...
> + if ( res > UINT_MAX ) \
> + { \
> + printk(XENLOG_ERR \
> + "Frequeny exceeds maximum value UINT_MAX: %lu\n", res); \
> + return -EINVAL; \
> + } \
> + *freq = (unsigned int)res; \
> + \
> + return 0; \
> + } \
> +
> +amd_get_freq(lowest);
> +amd_get_freq(nominal);
... two almost identical functions, when one (with two extra input parameters)
would suffice?
In amd_cppc_khz_to_perf() you have a check to avoid division by zero. Why
not the same safeguarding here?
> +static int amd_get_max_freq(const struct amd_cppc_drv_data *data,
> + unsigned int *max_freq)
> +{
> + unsigned int nom_freq, boost_ratio;
> + int res;
> +
> + res = amd_get_nominal_freq(data, &nom_freq);
> + if ( res )
> + return res;
> +
> + boost_ratio = (unsigned int)(data->caps.highest_perf /
> + data->caps.nominal_perf);
Similarly here - I can't spot what would prevent division by zero.
> + *max_freq = nom_freq * boost_ratio;
Nor is it clear to me why (with bogus MSR contents) boost_ratio couldn't
end up being zero, and hence we'd report back ...
> + return 0;
... success with a frequency of 0.
> +}
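Both issues could be addressed by folding the ratio into a single 64-bit
scaling step and rejecting degenerate results, roughly (untested sketch):
```
static int amd_get_max_freq(const struct amd_cppc_drv_data *data,
                            unsigned int *max_freq)
{
    unsigned int nom_freq;
    uint64_t res;
    int rc = amd_get_nominal_freq(data, &nom_freq);

    if ( rc )
        return rc;

    /*
     * Scale in one step so e.g. highest/nominal = 170/160 doesn't
     * truncate to 1; nominal_perf is verified non-zero at init time.
     */
    res = (uint64_t)nom_freq * data->caps.highest_perf /
          data->caps.nominal_perf;
    if ( !res || res > UINT_MAX )
        return -EINVAL;

    *max_freq = res;

    return 0;
}
```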
> +
> +static int cf_check amd_cppc_cpufreq_verify(struct cpufreq_policy *policy)
> +{
> + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
> + policy->cpuinfo.max_freq);
> +
> + return 0;
> +}
> +
> +static void amd_cppc_write_request_msrs(void *info)
> +{
> + struct amd_cppc_drv_data *data = info;
> +
> + if ( wrmsr_safe(MSR_AMD_CPPC_REQ, data->req.raw) )
> + {
> + data->err = -EINVAL;
> + return;
> + }
> +}
> +
> +static int cf_check amd_cppc_write_request(unsigned int cpu, uint8_t min_perf,
> + uint8_t des_perf, uint8_t max_perf)
The cf_check looks to be misplaced here, and rather wants to go to
amd_cppc_write_request_msrs() because of ...
> +{
> + struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
> + uint64_t prev = data->req.raw;
> +
> + data->req.min_perf = min_perf;
> + data->req.max_perf = max_perf;
> + data->req.des_perf = des_perf;
> +
> + if ( prev == data->req.raw )
> + return 0;
> +
> + data->err = 0;
> + on_selected_cpus(cpumask_of(cpu), amd_cppc_write_request_msrs, data, 1);
... this use of a function pointer here.
> + return data->err;
> +}
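I.e. presumably:
```
/* Reached via function pointer (on_selected_cpus()) -> needs cf_check. */
static void cf_check amd_cppc_write_request_msrs(void *info)
{
    struct amd_cppc_drv_data *data = info;

    if ( wrmsr_safe(MSR_AMD_CPPC_REQ, data->req.raw) )
        data->err = -EINVAL;
}

/* Only ever called directly -> no cf_check needed. */
static int amd_cppc_write_request(unsigned int cpu, uint8_t min_perf,
                                  uint8_t des_perf, uint8_t max_perf)
```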
> +
> +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
> + unsigned int target_freq,
> + unsigned int relation)
> +{
> + unsigned int cpu = policy->cpu;
> + const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
> + uint8_t des_perf;
> + int res;
> +
> + if ( unlikely(!target_freq) )
> + return 0;
Checking other *_cpufreq_target() functions, none would silently ignore
a zero input. (HWP's ignores the input altogether though; Cc-ing Jason
for possible clarification: I would have expected this driver here and
the HWP one to be similar in this regard.)
> + res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
> + if ( res )
> + return res;
> +
> + return amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
> + des_perf, data->caps.highest_perf);
As before: the use of the "non-linear" value here wants to come with a
(perhaps brief) comment.
> +}
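E.g. something like this (wording mine, derived from the ACPI CPPC
description of the non-linear threshold):
```
    /*
     * Use lowest_nonlinear_perf as the floor: below this level,
     * further performance reduction yields disproportionately small
     * power savings, so it is rarely worth requesting.
     */
    return amd_cppc_write_request(policy->cpu,
                                  data->caps.lowest_nonlinear_perf,
                                  des_perf, data->caps.highest_perf);
```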
> +
> +static void cf_check amd_cppc_init_msrs(void *info)
> +{
> + struct cpufreq_policy *policy = info;
> + struct amd_cppc_drv_data *data = this_cpu(amd_cppc_drv_data);
> + uint64_t val;
> + unsigned int min_freq, nominal_freq, max_freq;
> +
> + /* Package level MSR */
> + if ( rdmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
> + {
> + amd_cppc_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_ENABLE)\n");
> + goto err;
> + }
> +
> + /*
> + * Only when Enable bit is on, the hardware will calculate the processor’s
> + * performance capabilities and initialize the performance level fields in
> + * the CPPC capability registers.
> + */
> + if ( !(val & AMD_CPPC_ENABLE) )
> + {
> + val |= AMD_CPPC_ENABLE;
> + if ( wrmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
> + {
> + amd_cppc_err(policy->cpu,
> + "wrmsr_safe(MSR_AMD_CPPC_ENABLE, %lx)\n", val);
> + goto err;
> + }
> + }
> +
> + if ( rdmsr_safe(MSR_AMD_CPPC_CAP1, data->caps.raw) )
> + {
> + amd_cppc_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_CAP1)\n");
> + goto err;
> + }
> +
> + if ( data->caps.highest_perf == 0 || data->caps.lowest_perf == 0 ||
> + data->caps.nominal_perf == 0 || data->caps.lowest_nonlinear_perf == 0 )
> + {
> + amd_cppc_err(policy->cpu,
> + "Platform malfunction, read CPPC highest_perf: %u, lowest_perf: %u, nominal_perf: %u, lowest_nonlinear_perf: %u zero value\n",
I don't think the _perf suffixes are overly relevant in the log message.
> + data->caps.highest_perf, data->caps.lowest_perf,
> + data->caps.nominal_perf, data->caps.lowest_nonlinear_perf);
> + goto err;
> + }
> +
> + data->err = amd_get_lowest_freq(data, &min_freq);
> + if ( data->err )
> + return;
> +
> + data->err = amd_get_nominal_freq(data, &nominal_freq);
> + if ( data->err )
> + return;
> +
> + data->err = amd_get_max_freq(data, &max_freq);
> + if ( data->err )
> + return;
> +
> + if ( min_freq > max_freq || nominal_freq > max_freq ||
> + nominal_freq < min_freq )
> + {
> + amd_cppc_err(policy->cpu,
> + "min_freq(%u), or max_freq(%u), or nominal_freq(%u) value is incorrect\n",
Along the lines of the above, while it wants making clear here it's frequencies,
I question the use of identifier names to express that. E.g.
"min (%u), or max (%u), or nominal (%u) freq value is incorrect\n"?
> + min_freq, max_freq, nominal_freq);
> + goto err;
> + }
> +
> + policy->min = min_freq;
> + policy->max = max_freq;
> +
> + policy->cpuinfo.min_freq = min_freq;
> + policy->cpuinfo.max_freq = max_freq;
> + policy->cpuinfo.perf_freq = nominal_freq;
> + /*
> + * Set after policy->cpuinfo.perf_freq, as we are taking
> + * APERF/MPERF average frequency as current frequency.
> + */
> + policy->cur = cpufreq_driver_getavg(policy->cpu, GOV_GETAVG);
> +
> + return;
> +
> + err:
> + data->err = -EINVAL;
If we make it here after having set the enable bit, we're hosed (afaict).
We can't fall back to another driver, and we also can't get this driver to
work. I think I did ask before that this be explained in a comment here.
(The only thing the user can do is, aiui, to change the command line option
and reboot.)
Oh, I see you have such a comment at the use site of this function. Please
have a brief comment here then, to refer there.
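Perhaps simply (wording mine):
```
 err:
    /*
     * See the comment in amd_cppc_cpufreq_cpu_init(): once
     * AMD_CPPC_ENABLE has been set it can only be cleared by reset,
     * so failing here leaves no cpufreq driver able to take over.
     */
    data->err = -EINVAL;
```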
> +}
> +
> +/*
> + * The new AMD CPPC driver is different than legacy ACPI hardware P-State,
Please omit "new" - that'll be stale rather sooner than later.
> + * which has a finer grain frequency range between the highest and lowest
> + * frequency. And boost frequency is actually the frequency which is mapped on
> + * highest performance ratio. The legacy P0 frequency is actually mapped on
> + * nominal performance ratio.
> + */
> +static void amd_cppc_boost_init(struct cpufreq_policy *policy,
> + const struct amd_cppc_drv_data *data)
> +{
> + if ( data->caps.highest_perf <= data->caps.nominal_perf )
> + return;
> +
> + policy->turbo = CPUFREQ_TURBO_ENABLED;
> +}
> +
> +static int cf_check amd_cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
> +{
> + XVFREE(per_cpu(amd_cppc_drv_data, policy->cpu));
> +
> + return 0;
> +}
> +
> +static int cf_check amd_cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
> +{
> + unsigned int cpu = policy->cpu;
> + struct amd_cppc_drv_data *data;
> + const struct cpuinfo_x86 *c = cpu_data + cpu;
> +
> + data = xvzalloc(struct amd_cppc_drv_data);
> + if ( !data )
> + return -ENOMEM;
> +
> + data->cppc_data = &processor_pminfo[cpu]->cppc_data;
> +
> + per_cpu(amd_cppc_drv_data, cpu) = data;
> +
> + /* Feature CPPC is firstly introduced on Zen2 */
> + if ( c->x86 < 0x17 )
> + {
> + printk_once("Unsupported cpu family: %x\n", c->x86);
> + return -EOPNOTSUPP;
> + }
> +
> + on_selected_cpus(cpumask_of(cpu), amd_cppc_init_msrs, policy, 1);
> +
> + /*
> + * If error path takes effective, not only amd-cppc cpufreq driver fails
> + * to initialize, but also we could not fall back to legacy P-states
> + * driver nevertheless we specifies fall back option in cmdline.
> + */
Nit: I'm not a native speaker, but I don't think "nevertheless" can be used here.
Maybe "... but we also cannot fall back to the legacy driver, irrespective of
the command line specifying a fallback option"?
Plus I think it would help to also explain why here, i.e. that the enable bit is
sticky.
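Taking both points together, the comment might become:
```
    /*
     * If the error path is taken, not only does the amd-cppc cpufreq
     * driver fail to initialize, but we also cannot fall back to the
     * legacy P-states driver, irrespective of the command line
     * specifying a fallback option: the AMD_CPPC_ENABLE bit, once
     * set, is sticky until reset.
     */
```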
> + if ( data->err )
> + {
> + amd_cppc_err(cpu, "Could not initialize AMD CPPC MSR properly\n");
> + amd_cppc_cpufreq_cpu_exit(policy);
> + return -ENODEV;
Why do you not use data->err here?
> --- a/xen/arch/x86/include/asm/msr-index.h
> +++ b/xen/arch/x86/include/asm/msr-index.h
> @@ -238,6 +238,11 @@
>
> #define MSR_AMD_CSTATE_CFG 0xc0010296U
>
> +#define MSR_AMD_CPPC_CAP1 0xc00102b0
> +#define MSR_AMD_CPPC_ENABLE 0xc00102b1
> +#define AMD_CPPC_ENABLE (_AC(1, ULL) << 0)
> +#define MSR_AMD_CPPC_REQ 0xc00102b3
As you can see from the pre-existing #define in context, there are U suffixes
missing here.
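I.e., presumably:
```
#define MSR_AMD_CPPC_CAP1 0xc00102b0U
#define MSR_AMD_CPPC_ENABLE 0xc00102b1U
#define AMD_CPPC_ENABLE (_AC(1, ULL) << 0)
#define MSR_AMD_CPPC_REQ 0xc00102b3U
```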
Jan
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Tuesday, March 25, 2025 5:58 PM
> To: Penny, Zheng <penny.zheng@amd.com>
> Cc: Huang, Ray <Ray.Huang@amd.com>; Andrew Cooper
> <andrew.cooper3@citrix.com>; Roger Pau Monné <roger.pau@citrix.com>; xen-
> devel@lists.xenproject.org; Jason Andryuk <jandryuk@gmail.com>
> Subject: Re: [PATCH v3 09/15] xen/x86: introduce a new amd cppc driver for
> cpufreq scaling
>
> > + * directly use nominal_mhz and lowest_mhz as the division
> > + * will remove the frequency unit.
> > + */
> > + div = div ?: 1;
>
> Imo the cppc_data->lowest_mhz >= cppc_data->nominal_mhz case better
> wouldn't make it here, but use the fallback path below. Or special- case
> cppc_data->lowest_mhz == cppc_data->nominal_mhz: mul would
> (hopefully) be zero (i.e. there would be the expectation that
> data->caps.nominal_perf == data->caps.lowest_perf, yet no guarantee
> without checking), and hence ...
Okay, I'll drop the "div = div ?: 1" and tighten the if() to
```
if ( cppc_data->cpc.lowest_mhz && cppc_data->cpc.nominal_mhz &&
(cppc_data->cpc.lowest_mhz != cppc_data->cpc.nominal_mhz) )
```
>
> > + offset = data->caps.nominal_perf -
> > + (mul * cppc_data->nominal_mhz) / div;
>
> ... offset = data->caps.nominal_perf regardless of "div" (as long as that's not zero).
> I.e. the "equal" case may still be fine to take this path.
>
> Or is there a check somewhere that lowest_mhz <= nominal_mhz and lowest_perf
> <= nominal_perf, which I'm simply overlooking?
>
Yes, I overlooked the scenario where lowest_mhz > nominal_mhz and lowest_perf > nominal_perf;
I'll add the check when the values are first read.
> > +#define amd_get_freq(name) \
>
> The macro parameter is used just ...
>
> > + static int amd_get_##name##_freq(const struct amd_cppc_drv_data
> > + *data, \
>
> ... here, ...
>
> > + unsigned int *freq) \
> > + { \
> > + const struct xen_processor_cppc *cppc_data = data->cppc_data; \
> > + uint64_t mul, div, res; \
> > + \
> > + if ( cppc_data->name##_mhz ) \
> > + { \
> > + /* Switch to khz */ \
> > + *freq = cppc_data->name##_mhz * 1000; \
>
> ... twice here for the MHz value, and ...
>
> > + return 0; \
> > + } \
> > + \
> > + /* Read Processor Max Speed(mhz) as anchor point */ \
> > + mul = this_cpu(amd_max_freq_mhz); \
> > + if ( !mul ) \
> > + return -EINVAL; \
> > + div = data->caps.highest_perf; \
> > + res = (mul * data->caps.name##_perf * 1000) / div; \
>
> ... here for the respective perf indicator. Why does it take ...
>
> > + if ( res > UINT_MAX ) \
> > + { \
> > + printk(XENLOG_ERR \
> > + "Frequeny exceeds maximum value UINT_MAX: %lu\n", res); \
> > + return -EINVAL; \
> > + } \
> > + *freq = (unsigned int)res; \
> > + \
> > + return 0; \
> > + } \
> > +
> > +amd_get_freq(lowest);
> > +amd_get_freq(nominal);
>
> ... two almost identical functions, when one (with two extra input parameters) would
> suffice?
>
I have a draft fix here; if it isn't what you were hoping for, please let me know
```
static int amd_get_lowest_and_nominal_freq(const struct amd_cppc_drv_data *data,
unsigned int *lowest_freq,
unsigned int *nominal_freq)
{
const struct xen_processor_cppc *cppc_data = data->cppc_data;
uint64_t mul, div, res;
uint8_t perf;
if ( !lowest_freq && !nominal_freq )
return -EINVAL;
if ( lowest_freq && cppc_data->cpc.lowest_mhz )
/* Switch to khz */
*lowest_freq = cppc_data->cpc.lowest_mhz * 1000;
if ( nominal_freq && cppc_data->cpc.nominal_mhz )
/* Switch to khz */
*nominal_freq = cppc_data->cpc.nominal_mhz * 1000;
/* Still have unresolved frequency */
if ( (lowest_freq && !(*lowest_freq)) ||
(nominal_freq && !(*nominal_freq)) )
{
do {
/* Calculate lowest frequency firstly if need */
if ( lowest_freq && !(*lowest_freq) )
perf = data->caps.lowest_perf;
else
perf = data->caps.nominal_perf;
/* Read Processor Max Speed(MHz) as anchor point */
mul = this_cpu(amd_max_pxfreq_mhz);
if ( mul == INVAL_FREQ_MHZ || !mul )
{
printk(XENLOG_ERR
"Failed to read valid processor max frequency as anchor point: %lu\n",
mul);
return -EINVAL;
}
div = data->caps.highest_perf;
res = (mul * perf * 1000) / div;
if ( res > UINT_MAX || !res )
{
printk(XENLOG_ERR
"Frequeny exceeds maximum value UINT_MAX or being zero value: %lu\n",
res);
return -EINVAL;
}
if ( lowest_freq && !(*lowest_freq) )
*lowest_freq = (unsigned int)res;
else
*nominal_freq = (unsigned int)res;
} while ( nominal_freq && !(*nominal_freq) );
}
return 0;
}
```
> In amd_cppc_khz_to_perf() you have a check to avoid division by zero. Why not
> the same safeguarding here?
>
Regarding `div = data->caps.highest_perf`: the non-zero check of highest_perf is already
added in amd_cppc_init_msrs().
> > +static int amd_get_max_freq(const struct amd_cppc_drv_data *data,
> > + unsigned int *max_freq) {
> > + unsigned int nom_freq, boost_ratio;
> > + int res;
> > +
> > + res = amd_get_nominal_freq(data, &nom_freq);
> > + if ( res )
> > + return res;
> > +
> > + boost_ratio = (unsigned int)(data->caps.highest_perf /
> > + data->caps.nominal_perf);
>
> Similarly here - I can't spot what would prevent division by zero.
>
In amd_cppc_init_msrs(), before calculating the frequencies, we already check
that none of the caps.xxx_perf values are zero.
I'll also extend the check to rule out "highest_perf < nominal_perf", ensuring
that the calculated boost_ratio cannot be zero.
```
if ( data->caps.highest_perf == 0 || data->caps.lowest_perf == 0 ||
data->caps.nominal_perf == 0 || data->caps.lowest_nonlinear_perf == 0 ||
data->caps.lowest_perf > data->caps.lowest_nonlinear_perf ||
data->caps.lowest_nonlinear_perf > data->caps.nominal_perf ||
data->caps.nominal_perf > data->caps.highest_perf )
```
>
> Jan
On 03.04.2025 09:40, Penny, Zheng wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Tuesday, March 25, 2025 5:58 PM
>>
>>> +#define amd_get_freq(name) \
>>
>> The macro parameter is used just ...
>>
>>> + static int amd_get_##name##_freq(const struct amd_cppc_drv_data
>>> + *data, \
>>
>> ... here, ...
>>
>>> + unsigned int *freq) \
>>> + { \
>>> + const struct xen_processor_cppc *cppc_data = data->cppc_data; \
>>> + uint64_t mul, div, res; \
>>> + \
>>> + if ( cppc_data->name##_mhz ) \
>>> + { \
>>> + /* Switch to khz */ \
>>> + *freq = cppc_data->name##_mhz * 1000; \
>>
>> ... twice here for the MHz value, and ...
>>
>>> + return 0; \
>>> + } \
>>> + \
>>> + /* Read Processor Max Speed(mhz) as anchor point */ \
>>> + mul = this_cpu(amd_max_freq_mhz); \
>>> + if ( !mul ) \
>>> + return -EINVAL; \
>>> + div = data->caps.highest_perf; \
>>> + res = (mul * data->caps.name##_perf * 1000) / div; \
>>
>> ... here for the respective perf indicator. Why does it take ...
>>
>>> + if ( res > UINT_MAX ) \
>>> + { \
>>> + printk(XENLOG_ERR \
>>> + "Frequeny exceeds maximum value UINT_MAX: %lu\n", res); \
>>> + return -EINVAL; \
>>> + } \
>>> + *freq = (unsigned int)res; \
>>> + \
>>> + return 0; \
>>> + } \
>>> +
>>> +amd_get_freq(lowest);
>>> +amd_get_freq(nominal);
>>
>> ... two almost identical functions, when one (with two extra input parameters) would
>> suffice?
>>
>
> I have a draft fix here; if it isn't what you were hoping for, please let me know
> ```
> static int amd_get_lowest_and_nominal_freq(const struct amd_cppc_drv_data *data,
> unsigned int *lowest_freq,
> unsigned int *nominal_freq)
Why two outputs now when there was just one in the macro-ized form? I was
rather expecting new inputs to appear, to account for the prior uses of
the macro parameter. (As a result the function is now also quite a bit
more complex than it was before. In particular there was no ...
> {
> const struct xen_processor_cppc *cppc_data = data->cppc_data;
> uint64_t mul, div, res;
> uint8_t perf;
>
> if ( !lowest_freq && !nominal_freq )
> return -EINVAL;
>
> if ( lowest_freq && cppc_data->cpc.lowest_mhz )
> /* Switch to khz */
> *lowest_freq = cppc_data->cpc.lowest_mhz * 1000;
>
> if ( nominal_freq && cppc_data->cpc.nominal_mhz )
> /* Switch to khz */
> *nominal_freq = cppc_data->cpc.nominal_mhz * 1000;
>
> /* Still have unresolved frequency */
> if ( (lowest_freq && !(*lowest_freq)) ||
> (nominal_freq && !(*nominal_freq)) )
> {
> do {
> /* Calculate lowest frequency firstly if need */
> if ( lowest_freq && !(*lowest_freq) )
> perf = data->caps.lowest_perf;
> else
> perf = data->caps.nominal_perf;
>
> /* Read Processor Max Speed(MHz) as anchor point */
> mul = this_cpu(amd_max_pxfreq_mhz);
> if ( mul == INVAL_FREQ_MHZ || !mul )
> {
> printk(XENLOG_ERR
> "Failed to read valid processor max frequency as anchor point: %lu\n",
> mul);
> return -EINVAL;
> }
> div = data->caps.highest_perf;
> res = (mul * perf * 1000) / div;
>
> if ( res > UINT_MAX || !res )
> {
> printk(XENLOG_ERR
> "Frequeny exceeds maximum value UINT_MAX or being zero value: %lu\n",
> res);
> return -EINVAL;
> }
>
> if ( lowest_freq && !(*lowest_freq) )
> *lowest_freq = (unsigned int)res;
> else
> *nominal_freq = (unsigned int)res;
> } while ( nominal_freq && !(*nominal_freq) );
... loop there.)
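For concreteness, the single helper with two extra inputs might look roughly
like this (an untested sketch using the v3/draft names, including
amd_max_pxfreq_mhz and INVAL_FREQ_MHZ from the draft above):
```
static int amd_get_freq(const struct amd_cppc_drv_data *data,
                        unsigned int mhz, uint8_t perf, unsigned int *freq)
{
    uint64_t mul, res;

    if ( mhz )
    {
        /* Switch to kHz */
        *freq = mhz * 1000;
        return 0;
    }

    /* Read Processor Max Speed (MHz) as anchor point */
    mul = this_cpu(amd_max_pxfreq_mhz);
    if ( mul == INVAL_FREQ_MHZ || !mul )
        return -EINVAL;

    /* highest_perf is verified non-zero in amd_cppc_init_msrs(). */
    res = (mul * perf * 1000) / data->caps.highest_perf;
    if ( !res || res > UINT_MAX )
        return -EINVAL;

    *freq = res;

    return 0;
}
```
with the call sites then collapsing to:
```
rc = amd_get_freq(data, cppc_data->cpc.lowest_mhz,
                  data->caps.lowest_perf, &min_freq);
rc = amd_get_freq(data, cppc_data->cpc.nominal_mhz,
                  data->caps.nominal_perf, &nominal_freq);
```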
Jan
> }
>
> return 0;
> }
> ```
On 2025-03-25 05:57, Jan Beulich wrote:
> On 06.03.2025 09:39, Penny Zheng wrote:
>> +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
>> + unsigned int target_freq,
>> + unsigned int relation)
>> +{
>> + unsigned int cpu = policy->cpu;
>> + const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
>> + uint8_t des_perf;
>> + int res;
>> +
>> + if ( unlikely(!target_freq) )
>> + return 0;
>
> Checking other *_cpufreq_target() functions, none would silently ignore
> a zero input. (HWP's ignores the input altogether though; Cc-ing Jason
> for possible clarification: I would have expected this driver here and
> the HWP one to be similar in this regard.)
Yes, for HWP, the target and relation are ignored. All control is done
by writing MSR_HWP_REQUEST, whose fields are "continuous, abstract,
unit-less performance scale" values. Those are applied by set_hwp_para()
from `xenpm set-cpufreq-cppc`.
I think the difference is that this CPPC driver supports both autonomous
and active mode. The HWP driver I wrote only supports the equivalent of
autonomous mode - write the MSR and let the processor figure it out.
I think Penny's implementation also uses the existing governors, whereas
HWP only uses the dedicated hwp_governor.
Hopefully that gives some context.
Regards,
Jason