amd-cppc is the AMD CPU performance scaling driver that introduces a
new CPU frequency control mechanism. The new mechanism is based on
Collaborative Processor Performance Control (CPPC) which is a finer grain
frequency management than legacy ACPI hardware P-States.
Current AMD CPU platforms are using the ACPI P-states driver to
manage CPU frequency and clocks with switching only in 3 P-states, while the
new amd-cppc allows a more flexible, low-latency interface for Xen
to directly communicate the performance hints to hardware.
Default "amd-cppc" driver still leverages Xen governors such as
*ondemand*, *performance*, etc, to calculate the performance hints. In the
future, we will introduce an advanced active mode to enable autonomous
performence level selection.
Signed-off-by: Penny Zheng <Penny.Zheng@amd.com>
---
v1 -> v2:
- re-construct union caps and req to have anonymous struct instead
- avoid "else" when the earlier if() ends in an unconditional control flow statement
- Add check to avoid chopping off set bits from cast
- make pointers pointer-to-const wherever possible
- remove noisy log
- exclude families before 0x17 before CPPC-feature MSR op
- remove useless variable helpers
- use xvzalloc and XVFREE
- refactor error handling as ENABLE bit can only be cleared by reset
---
v2 -> v3:
- Move all MSR-definations to msr-index.h and follow the required style
- Refactor opening figure braces for struct/union
- Sort overlong lines throughout the series
- Make offset/res int covering underflow scenario
- Error out when amd_max_freq_mhz isn't set
- Introduce amd_get_freq(name) macro to decrease redundancy
- Supported CPU family checked ahead of smp-function
- Nominal freq shall be checked between the [min, max]
- Use APERF/MPREF to calculate current frequency
- Use amd_cppc_cpufreq_cpu_exit() to tidy error path
---
v3 -> v4:
- verbose print shall come with a CPU number
- deal with res <= 0 in amd_cppc_khz_to_perf()
- introduce a single helper amd_get_lowest_or_nominal_freq() to cover both
lowest and nominal scenario
- reduce abuse of wrmsr_safe()/rdmsr_safe() with wrmsrl()/rdmsrl()
- move cf_check from amd_cppc_write_request() to amd_cppc_write_request_msrs()
- add comment to explain why setting non_linear_lowest in passive mode
- add check to ensure perf values in
lowest <= non_linear_lowest <= nominal <= highset
- refactor comment for "data->err != 0" scenario
- use "data->err" instead of -ENODEV
- add U suffixes for all msr macro
---
v4 -> v5:
- all freq-values shall be unsigned int type
- remove shortcuts as it is rarely taken
- checking cpc.nominal_mhz and cpc.lowest_mhz are non-zero values is enough
- drop the explicit type cast
- null pointer check is in no need for internal functions
- change amd_get_lowest_or_nominal_freq() to amd_get_cpc_freq()
- clarifying function-wide that the calculated frequency result is to be in kHz
- use array notation
- with cpu_has_cppc check, no need to do cpu family check
---
 xen/arch/x86/acpi/cpufreq/amd-cppc.c | 362 +++++++++++++++++++++++++++
 xen/arch/x86/cpu/amd.c               |   8 +-
 xen/arch/x86/include/asm/amd.h       |   2 +
 xen/arch/x86/include/asm/msr-index.h |   5 +
 4 files changed, 373 insertions(+), 4 deletions(-)
diff --git a/xen/arch/x86/acpi/cpufreq/amd-cppc.c b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
index 9e64bf957a..8f0a30c19d 100644
--- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
+++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
@@ -14,7 +14,56 @@
 #include <xen/domain.h>
 #include <xen/init.h>
 #include <xen/param.h>
+#include <xen/percpu.h>
+#include <xen/xvmalloc.h>
 #include <acpi/cpufreq/cpufreq.h>
+#include <asm/amd.h>
+#include <asm/msr-index.h>
+
+#define amd_cppc_err(cpu, fmt, args...)                             \
+    printk(XENLOG_ERR "AMD_CPPC: CPU%u error: " fmt, cpu, ## args)
+#define amd_cppc_warn(cpu, fmt, args...)                            \
+    printk(XENLOG_WARNING "AMD_CPPC: CPU%u warning: " fmt, cpu, ## args)
+#define amd_cppc_verbose(cpu, fmt, args...)                         \
+({                                                                  \
+    if ( cpufreq_verbose )                                          \
+        printk(XENLOG_DEBUG "AMD_CPPC: CPU%u " fmt, cpu, ## args);  \
+})
+
+struct amd_cppc_drv_data
+{
+    const struct xen_processor_cppc *cppc_data;
+    union {
+        uint64_t raw;
+        struct {
+            unsigned int lowest_perf:8;
+            unsigned int lowest_nonlinear_perf:8;
+            unsigned int nominal_perf:8;
+            unsigned int highest_perf:8;
+            unsigned int :32;
+        };
+    } caps;
+    union {
+        uint64_t raw;
+        struct {
+            unsigned int max_perf:8;
+            unsigned int min_perf:8;
+            unsigned int des_perf:8;
+            unsigned int epp:8;
+            unsigned int :32;
+        };
+    } req;
+
+    int err;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct amd_cppc_drv_data *,
+                                  amd_cppc_drv_data);
+/*
+ * Core max frequency read from PstateDef as anchor point
+ * for freq-to-perf transition
+ */
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, pxfreq_mhz);
 
 static bool __init amd_cppc_handle_option(const char *s, const char *end)
 {
@@ -50,10 +99,323 @@ int __init amd_cppc_cmdline_parse(const char *s, const char *e)
     return 0;
 }
 
+/*
+ * If CPPC lowest_freq and nominal_freq registers are exposed then we can
+ * use them to convert perf to freq and vice versa. The conversion is
+ * extrapolated as an linear function passing by the 2 points:
+ *  - (Low perf, Low freq)
+ *  - (Nominal perf, Nominal freq)
+ * Parameter freq is always in kHz.
+ */
+static int amd_cppc_khz_to_perf(const struct amd_cppc_drv_data *data,
+                                unsigned int freq, uint8_t *perf)
+{
+    const struct xen_processor_cppc *cppc_data = data->cppc_data;
+    unsigned int mul, div;
+    int offset = 0, res;
+
+    if ( cppc_data->cpc.lowest_mhz && cppc_data->cpc.nominal_mhz )
+    {
+        mul = data->caps.nominal_perf - data->caps.lowest_perf;
+        div = cppc_data->cpc.nominal_mhz - cppc_data->cpc.lowest_mhz;
+
+        /*
+         * We don't need to convert to kHz for computing offset and can
+         * directly use nominal_mhz and lowest_mhz as the division
+         * will remove the frequency unit.
+         */
+        offset = data->caps.nominal_perf -
+                 (mul * cppc_data->cpc.nominal_mhz) / div;
+    }
+    else
+    {
+        /* Read Processor Max Speed(MHz) as anchor point */
+        mul = data->caps.highest_perf;
+        div = this_cpu(pxfreq_mhz);
+        if ( !div )
+            return -EINVAL;
+    }
+
+    res = offset + (mul * freq) / (div * 1000);
+    if ( res > UINT8_MAX )
+    {
+        printk_once(XENLOG_WARNING
+                    "Perf value exceeds maximum value 255: %d\n", res);
+        *perf = 0xff;
+        return 0;
+    }
+    if ( res < 0 )
+    {
+        printk_once(XENLOG_WARNING
+                    "Perf value smaller than minimum value 0: %d\n", res);
+        *perf = 0;
+        return 0;
+    }
+    *perf = res;
+
+    return 0;
+}
+
+/*
+ * _CPC may define nominal frequecy and lowest frequency, if not, use
+ * Processor Max Speed as anchor point to calculate.
+ * Output freq stores cpc frequency in kHz
+ */
+static int amd_get_cpc_freq(const struct amd_cppc_drv_data *data,
+                            uint32_t cpc_mhz, uint8_t perf, unsigned int *freq)
+{
+    unsigned int mul, div, res;
+
+    if ( cpc_mhz )
+    {
+        /* Switch to kHz */
+        *freq = cpc_mhz * 1000;
+        return 0;
+    }
+
+    /* Read Processor Max Speed(MHz) as anchor point */
+    mul = this_cpu(pxfreq_mhz);
+    if ( !mul )
+    {
+        printk_once(XENLOG_ERR
+                    "Failed to read valid processor max frequency as anchor point: %u\n",
+                    mul);
+        return -EINVAL;
+    }
+    div = data->caps.highest_perf;
+    res = (mul * perf * 1000) / div;
+    if ( !res )
+        return -EINVAL;
+    *freq = res;
+
+    return 0;
+}
+
+/* Output max_freq stores calculated maximum frequency in kHz */
+static int amd_get_max_freq(const struct amd_cppc_drv_data *data,
+                            unsigned int *max_freq)
+{
+    unsigned int nom_freq = 0;
+    int res;
+
+    res = amd_get_cpc_freq(data, data->cppc_data->cpc.nominal_mhz,
+                           data->caps.nominal_perf, &nom_freq);
+    if ( res )
+        return res;
+
+    *max_freq = (data->caps.highest_perf * nom_freq) / data->caps.nominal_perf;
+
+    return 0;
+}
+
+static int cf_check amd_cppc_cpufreq_verify(struct cpufreq_policy *policy)
+{
+    cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+                                 policy->cpuinfo.max_freq);
+
+    return 0;
+}
+
+static void cf_check amd_cppc_write_request_msrs(void *info)
+{
+    const struct amd_cppc_drv_data *data = info;
+
+    wrmsrl(MSR_AMD_CPPC_REQ, data->req.raw);
+}
+
+static void amd_cppc_write_request(unsigned int cpu, uint8_t min_perf,
+                                   uint8_t des_perf, uint8_t max_perf)
+{
+    struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
+    uint64_t prev = data->req.raw;
+
+    data->req.min_perf = min_perf;
+    data->req.max_perf = max_perf;
+    data->req.des_perf = des_perf;
+
+    if ( prev == data->req.raw )
+        return;
+
+    on_selected_cpus(cpumask_of(cpu), amd_cppc_write_request_msrs, data, 1);
+}
+
+static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
+                                            unsigned int target_freq,
+                                            unsigned int relation)
+{
+    unsigned int cpu = policy->cpu;
+    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
+    uint8_t des_perf;
+    int res;
+
+    if ( unlikely(!target_freq) )
+        return 0;
+
+    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
+    if ( res )
+        return res;
+
+    /*
+     * Setting with "lowest_nonlinear_perf" to ensure governoring
+     * performance in P-state range.
+     */
+    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
+                           des_perf, data->caps.highest_perf);
+    return 0;
+}
+
+static void cf_check amd_cppc_init_msrs(void *info)
+{
+    struct cpufreq_policy *policy = info;
+    struct amd_cppc_drv_data *data = this_cpu(amd_cppc_drv_data);
+    uint64_t val;
+    unsigned int min_freq = 0, nominal_freq = 0, max_freq;
+
+    /* Package level MSR */
+    rdmsrl(MSR_AMD_CPPC_ENABLE, val);
+    /*
+     * Only when Enable bit is on, the hardware will calculate the processor’s
+     * performance capabilities and initialize the performance level fields in
+     * the CPPC capability registers.
+     */
+    if ( !(val & AMD_CPPC_ENABLE) )
+    {
+        val |= AMD_CPPC_ENABLE;
+        wrmsrl(MSR_AMD_CPPC_ENABLE, val);
+    }
+
+    rdmsrl(MSR_AMD_CPPC_CAP1, data->caps.raw);
+
+    if ( data->caps.highest_perf == 0 || data->caps.lowest_perf == 0 ||
+         data->caps.nominal_perf == 0 || data->caps.lowest_nonlinear_perf == 0 ||
+         data->caps.lowest_perf > data->caps.nominal_perf ||
+         data->caps.lowest_nonlinear_perf > data->caps.nominal_perf ||
+         data->caps.nominal_perf > data->caps.highest_perf )
+    {
+        amd_cppc_err(policy->cpu,
+                     "Out of range values: highest(%u), lowest(%u), nominal(%u), lowest_nonlinear(%u)\n",
+                     data->caps.highest_perf, data->caps.lowest_perf,
+                     data->caps.nominal_perf, data->caps.lowest_nonlinear_perf);
+        goto err;
+    }
+
+    amd_process_freq(&cpu_data[policy->cpu],
+                     NULL, NULL, &this_cpu(pxfreq_mhz));
+
+    data->err = amd_get_cpc_freq(data, data->cppc_data->cpc.lowest_mhz,
+                                 data->caps.lowest_perf, &min_freq);
+    if ( data->err )
+        return;
+
+    data->err = amd_get_cpc_freq(data, data->cppc_data->cpc.nominal_mhz,
+                                 data->caps.nominal_perf, &nominal_freq);
+    if ( data->err )
+        return;
+
+    data->err = amd_get_max_freq(data, &max_freq);
+    if ( data->err )
+        return;
+
+    if ( min_freq > nominal_freq || nominal_freq > max_freq )
+    {
+        amd_cppc_err(policy->cpu,
+                     "min(%u), or max(%u), or nominal(%u) freq value is incorrect\n",
+                     min_freq, max_freq, nominal_freq);
+        goto err;
+    }
+
+    policy->min = min_freq;
+    policy->max = max_freq;
+
+    policy->cpuinfo.min_freq = min_freq;
+    policy->cpuinfo.max_freq = max_freq;
+    policy->cpuinfo.perf_freq = nominal_freq;
+    /*
+     * Set after policy->cpuinfo.perf_freq, as we are taking
+     * APERF/MPERF average frequency as current frequency.
+     */
+    policy->cur = cpufreq_driver_getavg(policy->cpu, GOV_GETAVG);
+
+    return;
+
+ err:
+    /*
+     * No fallback shceme is available here, see more explanation at call
+     * site in amd_cppc_cpufreq_cpu_init().
+     */
+    data->err = -EINVAL;
+}
+
+/*
+ * AMD CPPC driver is different than legacy ACPI hardware P-State,
+ * which has a finer grain frequency range between the highest and lowest
+ * frequency. And boost frequency is actually the frequency which is mapped on
+ * highest performance ratio. The legacy P0 frequency is actually mapped on
+ * nominal performance ratio.
+ */
+static void amd_cppc_boost_init(struct cpufreq_policy *policy,
+                                const struct amd_cppc_drv_data *data)
+{
+    if ( data->caps.highest_perf <= data->caps.nominal_perf )
+        return;
+
+    policy->turbo = CPUFREQ_TURBO_ENABLED;
+}
+
+static int cf_check amd_cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+    XVFREE(per_cpu(amd_cppc_drv_data, policy->cpu));
+
+    return 0;
+}
+
+static int cf_check amd_cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+    unsigned int cpu = policy->cpu;
+    struct amd_cppc_drv_data *data;
+
+    data = xvzalloc(struct amd_cppc_drv_data);
+    if ( !data )
+        return -ENOMEM;
+
+    data->cppc_data = &processor_pminfo[cpu]->cppc_data;
+
+    per_cpu(amd_cppc_drv_data, cpu) = data;
+
+    on_selected_cpus(cpumask_of(cpu), amd_cppc_init_msrs, policy, 1);
+
+    /*
+     * The enable bit is sticky, as we need to enable it at the very first
+     * begining, before CPPC capability values sanity check.
+     * If error path takes effective, not only amd-cppc cpufreq core fails
+     * to initialize, but also we could not fall back to legacy P-states
+     * driver, irrespective of the command line specifying a fallback option.
+     */
+    if ( data->err )
+    {
+        amd_cppc_err(cpu, "Could not initialize cpufreq cores in CPPC mode\n");
+        amd_cppc_cpufreq_cpu_exit(policy);
+        return data->err;
+    }
+
+    policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
+
+    amd_cppc_boost_init(policy, data);
+
+    amd_cppc_verbose(policy->cpu,
+                     "CPU initialized with amd-cppc passive mode\n");
+
+    return 0;
+}
+
 static const struct cpufreq_driver __initconst_cf_clobber
 amd_cppc_cpufreq_driver =
 {
     .name   = XEN_AMD_CPPC_DRIVER_NAME,
+    .verify = amd_cppc_cpufreq_verify,
+    .target = amd_cppc_cpufreq_target,
+    .init   = amd_cppc_cpufreq_cpu_init,
+    .exit   = amd_cppc_cpufreq_cpu_exit,
 };
 
 int __init amd_cppc_register_driver(void)
diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 8c985466fa..7ec154e9e7 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -611,10 +611,10 @@ static unsigned int attr_const amd_parse_freq(unsigned int family,
 	return freq;
 }
 
-static void amd_process_freq(const struct cpuinfo_x86 *c,
-			     unsigned int *low_mhz,
-			     unsigned int *nom_mhz,
-			     unsigned int *hi_mhz)
+void amd_process_freq(const struct cpuinfo_x86 *c,
+		      unsigned int *low_mhz,
+		      unsigned int *nom_mhz,
+		      unsigned int *hi_mhz)
 {
 	unsigned int idx = 0, h;
 	uint64_t hi, lo, val;
diff --git a/xen/arch/x86/include/asm/amd.h b/xen/arch/x86/include/asm/amd.h
index 9c9599a622..72df42a6f6 100644
--- a/xen/arch/x86/include/asm/amd.h
+++ b/xen/arch/x86/include/asm/amd.h
@@ -173,5 +173,7 @@ extern bool amd_virt_spec_ctrl;
 bool amd_setup_legacy_ssbd(void);
 void amd_set_legacy_ssbd(bool enable);
 void amd_set_cpuid_user_dis(bool enable);
+void amd_process_freq(const struct cpuinfo_x86 *c, unsigned int *low_mhz,
+                      unsigned int *nom_mhz, unsigned int *hi_mhz);
 
 #endif /* __AMD_H__ */
diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h
index 22d9e76e55..e4401186c7 100644
--- a/xen/arch/x86/include/asm/msr-index.h
+++ b/xen/arch/x86/include/asm/msr-index.h
@@ -238,6 +238,11 @@
 
 #define MSR_AMD_CSTATE_CFG                  0xc0010296U
 
+#define MSR_AMD_CPPC_CAP1                   0xc00102b0U
+#define MSR_AMD_CPPC_ENABLE                 0xc00102b1U
+#define  AMD_CPPC_ENABLE                    (_AC(1, ULL) << 0)
+#define MSR_AMD_CPPC_REQ                    0xc00102b3U
+
 /*
  * Legacy MSR constants in need of cleanup.  No new MSRs below this comment.
  */
-- 
2.34.1
On 27.05.2025 10:48, Penny Zheng wrote:
> --- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> +++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> @@ -14,7 +14,56 @@
>  #include <xen/domain.h>
>  #include <xen/init.h>
>  #include <xen/param.h>
> +#include <xen/percpu.h>
> +#include <xen/xvmalloc.h>
>  #include <acpi/cpufreq/cpufreq.h>
> +#include <asm/amd.h>
> +#include <asm/msr-index.h>
> +
> +#define amd_cppc_err(cpu, fmt, args...)                             \
> +    printk(XENLOG_ERR "AMD_CPPC: CPU%u error: " fmt, cpu, ## args)
> +#define amd_cppc_warn(cpu, fmt, args...)                            \
> +    printk(XENLOG_WARNING "AMD_CPPC: CPU%u warning: " fmt, cpu, ## args)
> +#define amd_cppc_verbose(cpu, fmt, args...)                         \
> +({                                                                  \
> +    if ( cpufreq_verbose )                                          \
> +        printk(XENLOG_DEBUG "AMD_CPPC: CPU%u " fmt, cpu, ## args);  \
> +})
Nit: Much like in the file name, would you mind using AMD-CPPC in favor of
AMD_CPPC here, too?
> @@ -50,10 +99,323 @@ int __init amd_cppc_cmdline_parse(const char *s, const char *e)
>      return 0;
>  }
>  
> +/*
> + * If CPPC lowest_freq and nominal_freq registers are exposed then we can
> + * use them to convert perf to freq and vice versa. The conversion is
> + * extrapolated as an linear function passing by the 2 points:
> + *  - (Low perf, Low freq)
> + *  - (Nominal perf, Nominal freq)
> + * Parameter freq is always in kHz.
> + */
> +static int amd_cppc_khz_to_perf(const struct amd_cppc_drv_data *data,
> +                                unsigned int freq, uint8_t *perf)
> +{
> +    const struct xen_processor_cppc *cppc_data = data->cppc_data;
> +    unsigned int mul, div;
> +    int offset = 0, res;
> +
> +    if ( cppc_data->cpc.lowest_mhz && cppc_data->cpc.nominal_mhz )
> +    {
> +        mul = data->caps.nominal_perf - data->caps.lowest_perf;
> +        div = cppc_data->cpc.nominal_mhz - cppc_data->cpc.lowest_mhz;
What guarantees both of these values to be non-zero?
> +        /*
> +         * We don't need to convert to kHz for computing offset and can
> +         * directly use nominal_mhz and lowest_mhz as the division
> +         * will remove the frequency unit.
> +         */
> +        offset = data->caps.nominal_perf -
> +                 (mul * cppc_data->cpc.nominal_mhz) / div;
> +    }
> +    else
> +    {
> +        /* Read Processor Max Speed(MHz) as anchor point */
> +        mul = data->caps.highest_perf;
> +        div = this_cpu(pxfreq_mhz);
> +        if ( !div )
> +            return -EINVAL;
What's wrong about the function arguments in this case? (Same question again
on further uses of EINVAL below.)
> +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
> +                                            unsigned int target_freq,
> +                                            unsigned int relation)
> +{
> +    unsigned int cpu = policy->cpu;
> +    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
> +    uint8_t des_perf;
> +    int res;
> +
> +    if ( unlikely(!target_freq) )
> +        return 0;
> +
> +    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
> +    if ( res )
> +        return res;
> +
> +    /*
> +     * Setting with "lowest_nonlinear_perf" to ensure governoring
> +     * performance in P-state range.
> +     */
> +    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
> +                           des_perf, data->caps.highest_perf);
I fear I don't understand the comment, and hence it remains unclear to me
why lowest_nonlinear_perf is being used here.
> +static int cf_check amd_cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
> +{
> +    unsigned int cpu = policy->cpu;
> +    struct amd_cppc_drv_data *data;
> +
> +    data = xvzalloc(struct amd_cppc_drv_data);
> +    if ( !data )
> +        return -ENOMEM;
> +
> +    data->cppc_data = &processor_pminfo[cpu]->cppc_data;
> +
> +    per_cpu(amd_cppc_drv_data, cpu) = data;
> +
> +    on_selected_cpus(cpumask_of(cpu), amd_cppc_init_msrs, policy, 1);
> +
> +    /*
> +     * The enable bit is sticky, as we need to enable it at the very first
> +     * begining, before CPPC capability values sanity check.
> +     * If error path takes effective, not only amd-cppc cpufreq core fails
Nit: "takes effect" or "is taken".
> +     * to initialize, but also we could not fall back to legacy P-states
> +     * driver, irrespective of the command line specifying a fallback option.
> +     */
> +    if ( data->err )
> +    {
> +        amd_cppc_err(cpu, "Could not initialize cpufreq cores in CPPC mode\n");
Why "cores" (plural)?
Jan
                
            [Public]
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Tuesday, June 17, 2025 12:00 AM
> To: Penny, Zheng <penny.zheng@amd.com>
> Cc: Huang, Ray <Ray.Huang@amd.com>; Andrew Cooper
> <andrew.cooper3@citrix.com>; Roger Pau Monné <roger.pau@citrix.com>; xen-
> devel@lists.xenproject.org
> Subject: Re: [PATCH v5 10/18] xen/cpufreq: introduce a new amd cppc driver for
> cpufreq scaling
>
> On 27.05.2025 10:48, Penny Zheng wrote:
> > --- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> > +++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> > +        /*
> > +         * We don't need to convert to kHz for computing offset and can
> > +         * directly use nominal_mhz and lowest_mhz as the division
> > +         * will remove the frequency unit.
> > +         */
> > +        offset = data->caps.nominal_perf -
> > +                 (mul * cppc_data->cpc.nominal_mhz) / div;
> > +    }
> > +    else
> > +    {
> > +        /* Read Processor Max Speed(MHz) as anchor point */
> > +        mul = data->caps.highest_perf;
> > +        div = this_cpu(pxfreq_mhz);
> > +        if ( !div )
> > +            return -EINVAL;
>
> What's wrong about the function arguments in this case? (Same question again on
> further uses of EINVAL below.)
>
If we could not get processor max frequency, the whole function is useless...
Maybe -EOPNOTSUPP is more suitable than -EINVAL;
> > +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
> > +                                            unsigned int target_freq,
> > +                                            unsigned int relation) {
> > +    unsigned int cpu = policy->cpu;
> > +    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
> > +    uint8_t des_perf;
> > +    int res;
> > +
> > +    if ( unlikely(!target_freq) )
> > +        return 0;
> > +
> > +    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
> > +    if ( res )
> > +        return res;
> > +
> > +    /*
> > +     * Setting with "lowest_nonlinear_perf" to ensure governoring
> > +     * performance in P-state range.
> > +     */
> > +    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
> > +                           des_perf, data->caps.highest_perf);
>
> I fear I don't understand the comment, and hence it remains unclear to me why
> lowest_nonlinear_perf is being used here.
>
How about
```
Choose lowest nonlinear performance as the minimum performance level at which the platform may run.
Lowest nonlinear performance is the lowest performance level at which nonlinear power savings are achieved,
Above this threshold, lower performance levels should be generally more energy efficient than higher performance levels.
```
>
> Jan
                
            On 02.07.2025 11:49, Penny, Zheng wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Tuesday, June 17, 2025 12:00 AM
>> To: Penny, Zheng <penny.zheng@amd.com>
>>
>> On 27.05.2025 10:48, Penny Zheng wrote:
>>> --- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
>>> +++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
>>> +        /*
>>> +         * We don't need to convert to kHz for computing offset and can
>>> +         * directly use nominal_mhz and lowest_mhz as the division
>>> +         * will remove the frequency unit.
>>> +         */
>>> +        offset = data->caps.nominal_perf -
>>> +                 (mul * cppc_data->cpc.nominal_mhz) / div;
>>> +    }
>>> +    else
>>> +    {
>>> +        /* Read Processor Max Speed(MHz) as anchor point */
>>> +        mul = data->caps.highest_perf;
>>> +        div = this_cpu(pxfreq_mhz);
>>> +        if ( !div )
>>> +            return -EINVAL;
>>
>> What's wrong about the function arguments in this case? (Same question again on
>> further uses of EINVAL below.)
> 
> If we could not get processor max frequency, the whole function is useless...
> Maybe -EOPNOTSUPP is more suitable than -EINVAL;
I don't like EOPNOTSUPP very much either for the purpose, but it's surely better
than EINVAL.
>>> +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
>>> +                                            unsigned int target_freq,
>>> +                                            unsigned int relation) {
>>> +    unsigned int cpu = policy->cpu;
>>> +    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
>>> +    uint8_t des_perf;
>>> +    int res;
>>> +
>>> +    if ( unlikely(!target_freq) )
>>> +        return 0;
>>> +
>>> +    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
>>> +    if ( res )
>>> +        return res;
>>> +
>>> +    /*
>>> +     * Setting with "lowest_nonlinear_perf" to ensure governoring
>>> +     * performance in P-state range.
>>> +     */
>>> +    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
>>> +                           des_perf, data->caps.highest_perf);
>>
>> I fear I don't understand the comment, and hence it remains unclear to me why
>> lowest_nonlinear_perf is being used here.
> 
> How about
> ```
> Choose lowest nonlinear performance as the minimum performance level at which the platform may run.
> Lowest nonlinear performance is the lowest performance level at which nonlinear power savings are achieved,
> Above this threshold, lower performance levels should be generally more energy efficient than higher performance levels.
> ```
I finally had to go to the ACPI spec to understand what this is about. There looks
to be an implication that lowest <= lowest_nonlinear, and states in that range
would correspond more to T-states than to P-states. With that I think I agree with
the use of lowest_nonlinear here. The comment, however, could do with moving
farther away from merely quoting the pretty abstract text in the ACPI spec, as
such quoting doesn't help in clarifying terminology used, when that terminology
also isn't explained anywhere else in the code base.
Jan
                
            [Public]
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Wednesday, July 2, 2025 6:48 PM
> To: Penny, Zheng <penny.zheng@amd.com>
> Cc: Huang, Ray <Ray.Huang@amd.com>; Andrew Cooper
> <andrew.cooper3@citrix.com>; Roger Pau Monné <roger.pau@citrix.com>; xen-
> devel@lists.xenproject.org
> Subject: Re: [PATCH v5 10/18] xen/cpufreq: introduce a new amd cppc driver for
> cpufreq scaling
>
> On 02.07.2025 11:49, Penny, Zheng wrote:
> >> -----Original Message-----
> >> From: Jan Beulich <jbeulich@suse.com>
> >> Sent: Tuesday, June 17, 2025 12:00 AM
> >> To: Penny, Zheng <penny.zheng@amd.com>
> >>
> >> On 27.05.2025 10:48, Penny Zheng wrote:
> >>> --- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> >>> +++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> >>> +        /*
> >>> +         * We don't need to convert to kHz for computing offset and can
> >>> +         * directly use nominal_mhz and lowest_mhz as the division
> >>> +         * will remove the frequency unit.
> >>> +         */
> >>> +        offset = data->caps.nominal_perf -
> >>> +                 (mul * cppc_data->cpc.nominal_mhz) / div;
> >>> +    }
> >>> +    else
> >>> +    {
> >>> +        /* Read Processor Max Speed(MHz) as anchor point */
> >>> +        mul = data->caps.highest_perf;
> >>> +        div = this_cpu(pxfreq_mhz);
> >>> +        if ( !div )
> >>> +            return -EINVAL;
> >>
> >> What's wrong about the function arguments in this case? (Same
> >> question again on further uses of EINVAL below.)
> >
> > If we could not get processor max frequency, the whole function is useless...
> > Maybe -EOPNOTSUPP is more suitable than -EINVAL;
>
> I don't like EOPNOTSUPP very much either for the purpose, but it's surely better
> than EINVAL.
>
> >>> +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
> >>> +                                            unsigned int target_freq,
> >>> +                                            unsigned int relation) {
> >>> +    unsigned int cpu = policy->cpu;
> >>> +    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data,
> cpu);
> >>> +    uint8_t des_perf;
> >>> +    int res;
> >>> +
> >>> +    if ( unlikely(!target_freq) )
> >>> +        return 0;
> >>> +
> >>> +    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
> >>> +    if ( res )
> >>> +        return res;
> >>> +
> >>> +    /*
> >>> +     * Setting with "lowest_nonlinear_perf" to ensure governoring
> >>> +     * performance in P-state range.
> >>> +     */
> >>> +    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
> >>> +                           des_perf, data->caps.highest_perf);
> >>
> >> I fear I don't understand the comment, and hence it remains unclear
> >> to me why lowest_nonlinear_perf is being used here.
> >
> > How about
> > ```
> > Choose lowest nonlinear performance as the minimum performance level at which
> the platform may run.
> > Lowest nonlinear performance is the lowest performance level at which
> > nonlinear power savings are achieved, Above this threshold, lower performance
> levels should be generally more energy efficient than higher performance levels.
> > ```
>
> I finally had to go to the ACPI spec to understand what this is about. There looks to
> be an implication that lowest <= lowest_nonlinear, and states in that range would
> correspond more to T-states than to P-states. With that I think I agree with the use
Yes, It doesn't have definitive conclusion about relation between lowest and lowest_nonlinear
In our internal FW designed spec, it always shows lowest_nonlinear corresponds to P2
> of lowest_nonlinear here. The comment, however, could do with moving farther
> away from merely quoting the pretty abstract text in the ACPI spec, as such
> quoting doesn't help in clarifying terminology used, when that terminology also isn't
> explained anywhere else in the code base.
How about we add detailed explanations about each terminology in the beginning
declaration , see:
```
+/*
+ * Field highest_perf, nominal_perf, lowest_nonlinear_perf, and lowest_perf
+ * contain the values read from CPPC capability MSR.
+ * Field highest_perf represents highest performance, which is the absolute
+ * maximum performance an individual processor may reach, assuming ideal
+ * conditions
+ * Field nominal_perf represents maximum sustained performance level of the
+ * processor, assuming ideal operating conditions.
+ * Field lowest_nonlinear_perf represents Lowest Nonlinear Performance, which
+ * is the lowest performance level at which nonlinear power savings are
+ * achieved. Above this threshold, lower performance levels should be
+ * generally more energy efficient than higher performance levels.
+ * Field lowest_perf represents the absolute lowest performance level of the
+ * platform.
+ *
+ * Field max_perf, min_perf, des_perf store the values for CPPC request MSR.
+ * Field max_perf conveys the maximum performance level at which the platform
+ * may run. And it may be set to any performance value in the range
+ * [lowest_perf, highest_perf], inclusive.
+ * Field min_perf conveys the minimum performance level at which the platform
+ * may run. And it may be set to any performance value in the range
+ * [lowest_perf, highest_perf], inclusive but must be less than or equal to
+ * max_perf.
+ * Field des_perf conveys performance level Xen is requesting. And it may be
+ * set to any performance value in the range [min_perf, max_perf], inclusive.
+ */
+struct amd_cppc_drv_data
+{
+    const struct xen_processor_cppc *cppc_data;
+    union {
+        uint64_t raw;
+        struct {
+            unsigned int lowest_perf:8;
+            unsigned int lowest_nonlinear_perf:8;
+            unsigned int nominal_perf:8;
+            unsigned int highest_perf:8;
+            unsigned int :32;
+        };
+    } caps;
+    union {
+        uint64_t raw;
+        struct {
+            unsigned int max_perf:8;
+            unsigned int min_perf:8;
+            unsigned int des_perf:8;
+            unsigned int epp:8;
+            unsigned int :32;
+        };
+    } req;
+
+    int err;
+};
``
Then here, we could elaborate the reason why we choose lowest_nonlinear_perf over lowest_perf:
```
+    /*
+     * Having a performance level lower than the lowest nonlinear
+     * performance level, such as, lowest_perf <= perf <= lowest_nonliner_perf,
+     * may actually cause an efficiency penalty, So when deciding the min_perf
+     * value, we prefer lowest nonlinear performance over lowest performance
+     */
+    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
+                           des_perf, data->caps.highest_perf);
```
>
> Jan
                
            On 04.07.2025 05:40, Penny, Zheng wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Wednesday, July 2, 2025 6:48 PM
>>
>> On 02.07.2025 11:49, Penny, Zheng wrote:
>>>> -----Original Message-----
>>>> From: Jan Beulich <jbeulich@suse.com>
>>>> Sent: Tuesday, June 17, 2025 12:00 AM
>>>> To: Penny, Zheng <penny.zheng@amd.com>
>>>>
>>>> On 27.05.2025 10:48, Penny Zheng wrote:
>>>>> +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
>>>>> +                                            unsigned int target_freq,
>>>>> +                                            unsigned int relation) {
>>>>> +    unsigned int cpu = policy->cpu;
>>>>> +    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data,
>> cpu);
>>>>> +    uint8_t des_perf;
>>>>> +    int res;
>>>>> +
>>>>> +    if ( unlikely(!target_freq) )
>>>>> +        return 0;
>>>>> +
>>>>> +    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
>>>>> +    if ( res )
>>>>> +        return res;
>>>>> +
>>>>> +    /*
>>>>> +     * Setting with "lowest_nonlinear_perf" to ensure governoring
>>>>> +     * performance in P-state range.
>>>>> +     */
>>>>> +    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
>>>>> +                           des_perf, data->caps.highest_perf);
>>>>
>>>> I fear I don't understand the comment, and hence it remains unclear
>>>> to me why lowest_nonlinear_perf is being used here.
>>>
>>> How about
>>> ```
>>> Choose lowest nonlinear performance as the minimum performance level at which
>> the platform may run.
>>> Lowest nonlinear performance is the lowest performance level at which
>>> nonlinear power savings are achieved, Above this threshold, lower performance
>> levels should be generally more energy efficient than higher performance levels.
>>> ```
>>
>> I finally had to go to the ACPI spec to understand what this is about. There looks to
>> be an implication that lowest <= lowest_nonlinear, and states in that range would
>> correspond more to T-states than to P-states. With that I think I agree with the use
> 
> Yes, It doesn't have definitive conclusion about relation between lowest and lowest_nonlinear
> In our internal FW designed spec, it always shows lowest_nonlinear corresponds to P2
> 
>> of lowest_nonlinear here. The comment, however, could do with moving farther
>> away from merely quoting the pretty abstract text in the ACPI spec, as such
>> quoting doesn't help in clarifying terminology used, when that terminology also isn't
>> explained anywhere else in the code base.
> 
> 
> How about we add detailed explanations about each terminology in the beginning
> declaration , see:
> ```
> +/*
> + * Field highest_perf, nominal_perf, lowest_nonlinear_perf, and lowest_perf
> + * contain the values read from CPPC capability MSR.
> + * Field highest_perf represents highest performance, which is the absolute
> + * maximum performance an individual processor may reach, assuming ideal
> + * conditions
> + * Field nominal_perf represents maximum sustained performance level of the
> + * processor, assuming ideal operating conditions.
> + * Field lowest_nonlinear_perf represents Lowest Nonlinear Performance, which
> + * is the lowest performance level at which nonlinear power savings are
> + * achieved. Above this threshold, lower performance levels should be
> + * generally more energy efficient than higher performance levels.
Which is still only the vague statement also found in the spec. This says nothing
about what happens below that level, or what the relationship to other fields is.
> + * Field lowest_perf represents the absolute lowest performance level of the
> + * platform.
> + *
> + * Field max_perf, min_perf, des_perf store the values for CPPC request MSR.
> + * Field max_perf conveys the maximum performance level at which the platform
> + * may run. And it may be set to any performance value in the range
> + * [lowest_perf, highest_perf], inclusive.
> + * Field min_perf conveys the minimum performance level at which the platform
> + * may run. And it may be set to any performance value in the range
> + * [lowest_perf, highest_perf], inclusive but must be less than or equal to
> + * max_perf.
> + * Field des_perf conveys performance level Xen is requesting. And it may be
> + * set to any performance value in the range [min_perf, max_perf], inclusive.
> + */
> +struct amd_cppc_drv_data
> +{
> +    const struct xen_processor_cppc *cppc_data;
> +    union {
> +        uint64_t raw;
> +        struct {
> +            unsigned int lowest_perf:8;
> +            unsigned int lowest_nonlinear_perf:8;
> +            unsigned int nominal_perf:8;
> +            unsigned int highest_perf:8;
> +            unsigned int :32;
> +        };
> +    } caps;
> +    union {
> +        uint64_t raw;
> +        struct {
> +            unsigned int max_perf:8;
> +            unsigned int min_perf:8;
> +            unsigned int des_perf:8;
> +            unsigned int epp:8;
> +            unsigned int :32;
> +        };
> +    } req;
> +
> +    int err;
> +};
> ``
> Then here, we could elaborate the reason why we choose lowest_nonlinear_perf over lowest_perf:
> ```
> +    /*
> +     * Having a performance level lower than the lowest nonlinear
> +     * performance level, such as, lowest_perf <= perf <= lowest_nonliner_perf,
> +     * may actually cause an efficiency penalty, So when deciding the min_perf
> +     * value, we prefer lowest nonlinear performance over lowest performance
> +     */
> +    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
> +                           des_perf, data->caps.highest_perf);
> ```
This reads fine to me.
Question then is though: Is setting lowest_perf as the low boundary a good
idea in any of the places? (Iirc it is used in one or two places. Or am I
misremembering?)
Jan
                
            [Public]
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Friday, July 4, 2025 2:21 PM
> To: Penny, Zheng <penny.zheng@amd.com>
> Cc: Huang, Ray <Ray.Huang@amd.com>; Andrew Cooper
> <andrew.cooper3@citrix.com>; Roger Pau Monné <roger.pau@citrix.com>; xen-
> devel@lists.xenproject.org
> Subject: Re: [PATCH v5 10/18] xen/cpufreq: introduce a new amd cppc driver for
> cpufreq scaling
>
> On 04.07.2025 05:40, Penny, Zheng wrote:
> >> -----Original Message-----
> >> From: Jan Beulich <jbeulich@suse.com>
> >> Sent: Wednesday, July 2, 2025 6:48 PM
> >>
> >> On 02.07.2025 11:49, Penny, Zheng wrote:
> >>>> -----Original Message-----
> >>>> From: Jan Beulich <jbeulich@suse.com>
> >>>> Sent: Tuesday, June 17, 2025 12:00 AM
> >>>> To: Penny, Zheng <penny.zheng@amd.com>
> >>>>
> >>>> On 27.05.2025 10:48, Penny Zheng wrote:
> >>>>> +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
> >>>>> +                                            unsigned int target_freq,
> >>>>> +                                            unsigned int relation) {
> >>>>> +    unsigned int cpu = policy->cpu;
> >>>>> +    const struct amd_cppc_drv_data *data =
> >>>>> +per_cpu(amd_cppc_drv_data,
> >> cpu);
> >>>>> +    uint8_t des_perf;
> >>>>> +    int res;
> >>>>> +
> >>>>> +    if ( unlikely(!target_freq) )
> >>>>> +        return 0;
> >>>>> +
> >>>>> +    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
> >>>>> +    if ( res )
> >>>>> +        return res;
> >>>>> +
> >>>>> +    /*
> >>>>> +     * Setting with "lowest_nonlinear_perf" to ensure governoring
> >>>>> +     * performance in P-state range.
> >>>>> +     */
> >>>>> +    amd_cppc_write_request(policy->cpu, data-
> >caps.lowest_nonlinear_perf,
> >>>>> +                           des_perf, data->caps.highest_perf);
> >>>>
> >>>> I fear I don't understand the comment, and hence it remains unclear
> >>>> to me why lowest_nonlinear_perf is being used here.
> >>>
> >>> How about
> >>> ```
> >>> Choose lowest nonlinear performance as the minimum performance level
> >>> at which
> >> the platform may run.
> >>> Lowest nonlinear performance is the lowest performance level at
> >>> which nonlinear power savings are achieved, Above this threshold,
> >>> lower performance
> >> levels should be generally more energy efficient than higher performance levels.
> >>> ```
> >>
> >> I finally had to go to the ACPI spec to understand what this is
> >> about. There looks to be an implication that lowest <=
> >> lowest_nonlinear, and states in that range would correspond more to
> >> T-states than to P-states. With that I think I agree with the use
> >
> > Yes, It doesn't have definitive conclusion about relation between
> > lowest and lowest_nonlinear In our internal FW designed spec, it
> > always shows lowest_nonlinear corresponds to P2
> >
> >> of lowest_nonlinear here. The comment, however, could do with moving
> >> farther away from merely quoting the pretty abstract text in the ACPI
> >> spec, as such quoting doesn't help in clarifying terminology used,
> >> when that terminology also isn't explained anywhere else in the code base.
> >
> >
> > How about we add detailed explanations about each terminology in the
> > beginning declaration , see:
> > ```
> > +/*
> > + * Field highest_perf, nominal_perf, lowest_nonlinear_perf, and
> > +lowest_perf
> > + * contain the values read from CPPC capability MSR.
> > + * Field highest_perf represents highest performance, which is the
> > +absolute
> > + * maximum performance an individual processor may reach, assuming
> > +ideal
> > + * conditions
> > + * Field nominal_perf represents maximum sustained performance level
> > +of the
> > + * processor, assuming ideal operating conditions.
> > + * Field lowest_nonlinear_perf represents Lowest Nonlinear
> > +Performance, which
> > + * is the lowest performance level at which nonlinear power savings
> > +are
> > + * achieved. Above this threshold, lower performance levels should be
> > + * generally more energy efficient than higher performance levels.
>
> Which is still only the vague statement also found in the spec. This says nothing
> about what happens below that level, or what the relationship to other fields is.
>
> > + * Field lowest_perf represents the absolute lowest performance level
> > +of the
> > + * platform.
> > + *
> > + * Field max_perf, min_perf, des_perf store the values for CPPC request MSR.
> > + * Field max_perf conveys the maximum performance level at which the
> > +platform
> > + * may run. And it may be set to any performance value in the range
> > + * [lowest_perf, highest_perf], inclusive.
> > + * Field min_perf conveys the minimum performance level at which the
> > +platform
> > + * may run. And it may be set to any performance value in the range
> > + * [lowest_perf, highest_perf], inclusive but must be less than or
> > +equal to
> > + * max_perf.
> > + * Field des_perf conveys performance level Xen is requesting. And it
> > +may be
> > + * set to any performance value in the range [min_perf, max_perf], inclusive.
> > + */
> > +struct amd_cppc_drv_data
> > +{
> > +    const struct xen_processor_cppc *cppc_data;
> > +    union {
> > +        uint64_t raw;
> > +        struct {
> > +            unsigned int lowest_perf:8;
> > +            unsigned int lowest_nonlinear_perf:8;
> > +            unsigned int nominal_perf:8;
> > +            unsigned int highest_perf:8;
> > +            unsigned int :32;
> > +        };
> > +    } caps;
> > +    union {
> > +        uint64_t raw;
> > +        struct {
> > +            unsigned int max_perf:8;
> > +            unsigned int min_perf:8;
> > +            unsigned int des_perf:8;
> > +            unsigned int epp:8;
> > +            unsigned int :32;
> > +        };
> > +    } req;
> > +
> > +    int err;
> > +};
> > ``
> > Then here, we could elaborate the reason why we choose lowest_nonlinear_perf
> over lowest_perf:
> > ```
> > +    /*
> > +     * Having a performance level lower than the lowest nonlinear
> > +     * performance level, such as, lowest_perf <= perf <= lowest_nonliner_perf,
> > +     * may actually cause an efficiency penalty, So when deciding the min_perf
> > +     * value, we prefer lowest nonlinear performance over lowest performance
> > +     */
> > +    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
> > +                           des_perf, data->caps.highest_perf);
> > ```
>
> This reads fine to me.
>
> Question then is though: Is setting lowest_perf as the low boundary a good idea in
> any of the places? (Iirc it is used in one or two places. Or am I
> misremembering?)
Yes, in active mode, I choose lowest_perf as min_perf to try to extend the limitation for active(autonomous) mode
Maybe it is not a good choice. Maybe cpufreq driver is limited to do performance tuning in P-states range.
>
> Jan
                
            On 04.07.2025 09:23, Penny, Zheng wrote:
> [Public]
> 
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Friday, July 4, 2025 2:21 PM
>> To: Penny, Zheng <penny.zheng@amd.com>
>> Cc: Huang, Ray <Ray.Huang@amd.com>; Andrew Cooper
>> <andrew.cooper3@citrix.com>; Roger Pau Monné <roger.pau@citrix.com>; xen-
>> devel@lists.xenproject.org
>> Subject: Re: [PATCH v5 10/18] xen/cpufreq: introduce a new amd cppc driver for
>> cpufreq scaling
>>
>> On 04.07.2025 05:40, Penny, Zheng wrote:
>>>> -----Original Message-----
>>>> From: Jan Beulich <jbeulich@suse.com>
>>>> Sent: Wednesday, July 2, 2025 6:48 PM
>>>>
>>>> On 02.07.2025 11:49, Penny, Zheng wrote:
>>>>>> -----Original Message-----
>>>>>> From: Jan Beulich <jbeulich@suse.com>
>>>>>> Sent: Tuesday, June 17, 2025 12:00 AM
>>>>>> To: Penny, Zheng <penny.zheng@amd.com>
>>>>>>
>>>>>> On 27.05.2025 10:48, Penny Zheng wrote:
>>>>>>> +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
>>>>>>> +                                            unsigned int target_freq,
>>>>>>> +                                            unsigned int relation) {
>>>>>>> +    unsigned int cpu = policy->cpu;
>>>>>>> +    const struct amd_cppc_drv_data *data =
>>>>>>> +per_cpu(amd_cppc_drv_data,
>>>> cpu);
>>>>>>> +    uint8_t des_perf;
>>>>>>> +    int res;
>>>>>>> +
>>>>>>> +    if ( unlikely(!target_freq) )
>>>>>>> +        return 0;
>>>>>>> +
>>>>>>> +    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
>>>>>>> +    if ( res )
>>>>>>> +        return res;
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * Setting with "lowest_nonlinear_perf" to ensure governoring
>>>>>>> +     * performance in P-state range.
>>>>>>> +     */
>>>>>>> +    amd_cppc_write_request(policy->cpu, data-
>>> caps.lowest_nonlinear_perf,
>>>>>>> +                           des_perf, data->caps.highest_perf);
>>>>>>
>>>>>> I fear I don't understand the comment, and hence it remains unclear
>>>>>> to me why lowest_nonlinear_perf is being used here.
>>>>>
>>>>> How about
>>>>> ```
>>>>> Choose lowest nonlinear performance as the minimum performance level
>>>>> at which
>>>> the platform may run.
>>>>> Lowest nonlinear performance is the lowest performance level at
>>>>> which nonlinear power savings are achieved, Above this threshold,
>>>>> lower performance
>>>> levels should be generally more energy efficient than higher performance levels.
>>>>> ```
>>>>
>>>> I finally had to go to the ACPI spec to understand what this is
>>>> about. There looks to be an implication that lowest <=
>>>> lowest_nonlinear, and states in that range would correspond more to
>>>> T-states than to P-states. With that I think I agree with the use
>>>
>>> Yes, It doesn't have definitive conclusion about relation between
>>> lowest and lowest_nonlinear In our internal FW designed spec, it
>>> always shows lowest_nonlinear corresponds to P2
>>>
>>>> of lowest_nonlinear here. The comment, however, could do with moving
>>>> farther away from merely quoting the pretty abstract text in the ACPI
>>>> spec, as such quoting doesn't help in clarifying terminology used,
>>>> when that terminology also isn't explained anywhere else in the code base.
>>>
>>>
>>> How about we add detailed explanations about each terminology in the
>>> beginning declaration , see:
>>> ```
>>> +/*
>>> + * Field highest_perf, nominal_perf, lowest_nonlinear_perf, and
>>> +lowest_perf
>>> + * contain the values read from CPPC capability MSR.
>>> + * Field highest_perf represents highest performance, which is the
>>> +absolute
>>> + * maximum performance an individual processor may reach, assuming
>>> +ideal
>>> + * conditions
>>> + * Field nominal_perf represents maximum sustained performance level
>>> +of the
>>> + * processor, assuming ideal operating conditions.
>>> + * Field lowest_nonlinear_perf represents Lowest Nonlinear
>>> +Performance, which
>>> + * is the lowest performance level at which nonlinear power savings
>>> +are
>>> + * achieved. Above this threshold, lower performance levels should be
>>> + * generally more energy efficient than higher performance levels.
>>
>> Which is still only the vague statement also found in the spec. This says nothing
>> about what happens below that level, or what the relationship to other fields is.
>>
>>> + * Field lowest_perf represents the absolute lowest performance level
>>> +of the
>>> + * platform.
>>> + *
>>> + * Field max_perf, min_perf, des_perf store the values for CPPC request MSR.
>>> + * Field max_perf conveys the maximum performance level at which the
>>> +platform
>>> + * may run. And it may be set to any performance value in the range
>>> + * [lowest_perf, highest_perf], inclusive.
>>> + * Field min_perf conveys the minimum performance level at which the
>>> +platform
>>> + * may run. And it may be set to any performance value in the range
>>> + * [lowest_perf, highest_perf], inclusive but must be less than or
>>> +equal to
>>> + * max_perf.
>>> + * Field des_perf conveys performance level Xen is requesting. And it
>>> +may be
>>> + * set to any performance value in the range [min_perf, max_perf], inclusive.
>>> + */
>>> +struct amd_cppc_drv_data
>>> +{
>>> +    const struct xen_processor_cppc *cppc_data;
>>> +    union {
>>> +        uint64_t raw;
>>> +        struct {
>>> +            unsigned int lowest_perf:8;
>>> +            unsigned int lowest_nonlinear_perf:8;
>>> +            unsigned int nominal_perf:8;
>>> +            unsigned int highest_perf:8;
>>> +            unsigned int :32;
>>> +        };
>>> +    } caps;
>>> +    union {
>>> +        uint64_t raw;
>>> +        struct {
>>> +            unsigned int max_perf:8;
>>> +            unsigned int min_perf:8;
>>> +            unsigned int des_perf:8;
>>> +            unsigned int epp:8;
>>> +            unsigned int :32;
>>> +        };
>>> +    } req;
>>> +
>>> +    int err;
>>> +};
>>> ``
>>> Then here, we could elaborate the reason why we choose lowest_nonlinear_perf
>> over lowest_perf:
>>> ```
>>> +    /*
>>> +     * Having a performance level lower than the lowest nonlinear
>>> +     * performance level, such as, lowest_perf <= perf <= lowest_nonliner_perf,
>>> +     * may actually cause an efficiency penalty, So when deciding the min_perf
>>> +     * value, we prefer lowest nonlinear performance over lowest performance
>>> +     */
>>> +    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
>>> +                           des_perf, data->caps.highest_perf);
>>> ```
>>
>> This reads fine to me.
>>
>> Question then is though: Is setting lowest_perf as the low boundary a good idea in
>> any of the places? (Iirc it is used in one or two places. Or am I
>> misremembering?)
> 
> Yes, in active mode, I choose lowest_perf as min_perf to try to extend the limitation for active(autonomous) mode
> Maybe it is not a good choice. Maybe cpufreq driver is limited to do performance tuning in P-states range.
Indeed I think we should limit ourselves to P-state management; management of T-states
was never introduced into Xen, so far. But please be sure to make the connection to P-
and T-states in the commentary you add.
Jan
                
            [Public]
> -----Original Message-----
> From: Penny, Zheng
> Sent: Wednesday, July 2, 2025 5:49 PM
> To: Jan Beulich <jbeulich@suse.com>
> Cc: Huang, Ray <Ray.Huang@amd.com>; Andrew Cooper
> <andrew.cooper3@citrix.com>; Roger Pau Monné <roger.pau@citrix.com>; xen-
> devel@lists.xenproject.org
> Subject: RE: [PATCH v5 10/18] xen/cpufreq: introduce a new amd cppc driver for
> cpufreq scaling
>
>
>
> > -----Original Message-----
> > From: Jan Beulich <jbeulich@suse.com>
> > Sent: Tuesday, June 17, 2025 12:00 AM
> > To: Penny, Zheng <penny.zheng@amd.com>
> > Cc: Huang, Ray <Ray.Huang@amd.com>; Andrew Cooper
> > <andrew.cooper3@citrix.com>; Roger Pau Monné <roger.pau@citrix.com>;
> > xen- devel@lists.xenproject.org
> > Subject: Re: [PATCH v5 10/18] xen/cpufreq: introduce a new amd cppc
> > driver for cpufreq scaling
> >
> > On 27.05.2025 10:48, Penny Zheng wrote:
> > > --- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> > > +++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> > > +        /*
> > > +         * We don't need to convert to kHz for computing offset and can
> > > +         * directly use nominal_mhz and lowest_mhz as the division
> > > +         * will remove the frequency unit.
> > > +         */
> > > +        offset = data->caps.nominal_perf -
> > > +                 (mul * cppc_data->cpc.nominal_mhz) / div;
> > > +    }
> > > +    else
> > > +    {
> > > +        /* Read Processor Max Speed(MHz) as anchor point */
> > > +        mul = data->caps.highest_perf;
> > > +        div = this_cpu(pxfreq_mhz);
> > > +        if ( !div )
> > > +            return -EINVAL;
> >
> > What's wrong about the function arguments in this case? (Same question
> > again on further uses of EINVAL below.)
> >
>
> If we could not get processor max frequency, the whole function is useless...
> Maybe -EOPNOTSUPP is more suitable than -EINVAL;
>
> > > +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
> > > +                                            unsigned int target_freq,
> > > +                                            unsigned int relation) {
> > > +    unsigned int cpu = policy->cpu;
> > > +    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data,
> cpu);
> > > +    uint8_t des_perf;
> > > +    int res;
> > > +
> > > +    if ( unlikely(!target_freq) )
> > > +        return 0;
> > > +
> > > +    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
> > > +    if ( res )
> > > +        return res;
> > > +
> > > +    /*
> > > +     * Setting with "lowest_nonlinear_perf" to ensure governoring
> > > +     * performance in P-state range.
> > > +     */
> > > +    amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
> > > +                           des_perf, data->caps.highest_perf);
> >
> > I fear I don't understand the comment, and hence it remains unclear to
> > me why lowest_nonlinear_perf is being used here.
> >
>
> How about
> ```
> Choose lowest nonlinear performance as the minimum performance level at which
> the platform may run.
> Lowest nonlinear performance is the lowest performance level at which nonlinear
> power savings are achieved, Above this threshold, lower performance levels should
> be generally more energy efficient than higher performance levels.
> ```
I'm adding a few more explanation on highest performance too
```
Choose lowest nonlinear performance as the minimum performance level, and highest performance as the maximum performance level.
Lowest nonlinear performance is the lowest performance level at which nonlinear power savings are achieved, Above this threshold, lower
performance levels should be generally more energy efficient than higher performance levels.
And highest performance is the absolute maximum performance an individual processor may reach, assuming ideal conditions.
```
> >
> > Jan
                
            © 2016 - 2025 Red Hat, Inc.