From: Aaron Kling <webgeek1234@gmail.com>
Add support to use OPP table from DT in Tegra186 cpufreq driver.
Tegra SoC's receive the frequency lookup table (LUT) from BPMP-FW.
Cross check the OPP's present in DT against the LUT from BPMP-FW
and enable only those DT OPP's which are present in LUT also.
The OPP table in DT has CPU Frequency to bandwidth mapping where
the bandwidth value is per MC channel. DRAM bandwidth depends on the
number of MC channels which can vary as per the boot configuration.
This per channel bandwidth from OPP table will be later converted by
MC driver to final bandwidth value by multiplying with number of
channels before being handled in the EMC driver.
If OPP table is not present in DT, then use the LUT from BPMP-FW
directy as the CPU frequency table and not do the DRAM frequency
scaling which is same as the current behavior.
Signed-off-by: Aaron Kling <webgeek1234@gmail.com>
---
drivers/cpufreq/tegra186-cpufreq.c | 152 +++++++++++++++++++++++++++++++++++--
1 file changed, 145 insertions(+), 7 deletions(-)
diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c
index bd94beebc4cc2fe6870e13ca55343cedb9729e99..f0abb44e2ed00a301161565e4c4f62cfed4a5814 100644
--- a/drivers/cpufreq/tegra186-cpufreq.c
+++ b/drivers/cpufreq/tegra186-cpufreq.c
@@ -18,6 +18,7 @@
#define EDVD_CORE_VOLT_FREQ_F_SHIFT 0
#define EDVD_CORE_VOLT_FREQ_F_MASK 0xffff
#define EDVD_CORE_VOLT_FREQ_V_SHIFT 16
+#define KHZ 1000
struct tegra186_cpufreq_cpu {
unsigned int bpmp_cluster_id;
@@ -58,7 +59,7 @@ static const struct tegra186_cpufreq_cpu tegra186_cpus[] = {
};
struct tegra186_cpufreq_cluster {
- struct cpufreq_frequency_table *table;
+ struct cpufreq_frequency_table *bpmp_lut;
u32 ref_clk_khz;
u32 div;
};
@@ -66,16 +67,121 @@ struct tegra186_cpufreq_cluster {
struct tegra186_cpufreq_data {
void __iomem *regs;
const struct tegra186_cpufreq_cpu *cpus;
+ bool icc_dram_bw_scaling;
struct tegra186_cpufreq_cluster clusters[];
};
+static int tegra_cpufreq_set_bw(struct cpufreq_policy *policy, unsigned long freq_khz)
+{
+ struct tegra186_cpufreq_data *data = cpufreq_get_driver_data();
+ struct dev_pm_opp *opp;
+ struct device *dev;
+ int ret;
+
+ dev = get_cpu_device(policy->cpu);
+ if (!dev)
+ return -ENODEV;
+
+ opp = dev_pm_opp_find_freq_exact(dev, freq_khz * KHZ, true);
+ if (IS_ERR(opp))
+ return PTR_ERR(opp);
+
+ ret = dev_pm_opp_set_opp(dev, opp);
+ if (ret)
+ data->icc_dram_bw_scaling = false;
+
+ dev_pm_opp_put(opp);
+ return ret;
+}
+
+static int tegra_cpufreq_init_cpufreq_table(struct cpufreq_policy *policy,
+ struct cpufreq_frequency_table *bpmp_lut,
+ struct cpufreq_frequency_table **opp_table)
+{
+ struct tegra186_cpufreq_data *data = cpufreq_get_driver_data();
+ struct cpufreq_frequency_table *freq_table = NULL;
+ struct cpufreq_frequency_table *pos;
+ struct device *cpu_dev;
+ struct dev_pm_opp *opp;
+ unsigned long rate;
+ int ret, max_opps;
+ int j = 0;
+
+ cpu_dev = get_cpu_device(policy->cpu);
+ if (!cpu_dev) {
+ pr_err("%s: failed to get cpu%d device\n", __func__, policy->cpu);
+ return -ENODEV;
+ }
+
+ /* Initialize OPP table mentioned in operating-points-v2 property in DT */
+ ret = dev_pm_opp_of_add_table_indexed(cpu_dev, 0);
+ if (!ret) {
+ max_opps = dev_pm_opp_get_opp_count(cpu_dev);
+ if (max_opps <= 0) {
+ dev_err(cpu_dev, "Failed to add OPPs\n");
+ return max_opps;
+ }
+
+ /* Disable all opps and cross-validate against LUT later */
+ for (rate = 0; ; rate++) {
+ opp = dev_pm_opp_find_freq_ceil(cpu_dev, &rate);
+ if (IS_ERR(opp))
+ break;
+
+ dev_pm_opp_put(opp);
+ dev_pm_opp_disable(cpu_dev, rate);
+ }
+ } else {
+ dev_err(cpu_dev, "Invalid or empty opp table in device tree\n");
+ data->icc_dram_bw_scaling = false;
+ return ret;
+ }
+
+ freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_KERNEL);
+ if (!freq_table)
+ return -ENOMEM;
+
+ /*
+ * Cross check the frequencies from BPMP-FW LUT against the OPP's present in DT.
+ * Enable only those DT OPP's which are present in LUT also.
+ */
+ cpufreq_for_each_valid_entry(pos, bpmp_lut) {
+ opp = dev_pm_opp_find_freq_exact(cpu_dev, pos->frequency * KHZ, false);
+ if (IS_ERR(opp))
+ continue;
+
+ dev_pm_opp_put(opp);
+
+ ret = dev_pm_opp_enable(cpu_dev, pos->frequency * KHZ);
+ if (ret < 0)
+ return ret;
+
+ freq_table[j].driver_data = pos->driver_data;
+ freq_table[j].frequency = pos->frequency;
+ j++;
+ }
+
+ freq_table[j].driver_data = pos->driver_data;
+ freq_table[j].frequency = CPUFREQ_TABLE_END;
+
+ *opp_table = &freq_table[0];
+
+ dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus);
+
+ tegra_cpufreq_set_bw(policy, freq_table[j - 1].frequency);
+
+ return ret;
+}
+
static int tegra186_cpufreq_init(struct cpufreq_policy *policy)
{
struct tegra186_cpufreq_data *data = cpufreq_get_driver_data();
unsigned int cluster = data->cpus[policy->cpu].bpmp_cluster_id;
+ struct cpufreq_frequency_table *freq_table;
+ struct cpufreq_frequency_table *bpmp_lut;
u32 cpu;
+ int ret;
- policy->freq_table = data->clusters[cluster].table;
policy->cpuinfo.transition_latency = 300 * 1000;
policy->driver_data = NULL;
@@ -85,6 +191,20 @@ static int tegra186_cpufreq_init(struct cpufreq_policy *policy)
cpumask_set_cpu(cpu, policy->cpus);
}
+ bpmp_lut = data->clusters[cluster].bpmp_lut;
+
+ if (data->icc_dram_bw_scaling) {
+ ret = tegra_cpufreq_init_cpufreq_table(policy, bpmp_lut, &freq_table);
+ if (!ret) {
+ policy->freq_table = freq_table;
+ return 0;
+ }
+ }
+
+ data->icc_dram_bw_scaling = false;
+ policy->freq_table = bpmp_lut;
+ pr_info("OPP tables missing from DT, EMC frequency scaling disabled\n");
+
return 0;
}
@@ -102,6 +222,10 @@ static int tegra186_cpufreq_set_target(struct cpufreq_policy *policy,
writel(edvd_val, data->regs + edvd_offset);
}
+ if (data->icc_dram_bw_scaling)
+ tegra_cpufreq_set_bw(policy, tbl->frequency);
+
+
return 0;
}
@@ -136,7 +260,7 @@ static struct cpufreq_driver tegra186_cpufreq_driver = {
.init = tegra186_cpufreq_init,
};
-static struct cpufreq_frequency_table *init_vhint_table(
+static struct cpufreq_frequency_table *tegra_cpufreq_bpmp_read_lut(
struct platform_device *pdev, struct tegra_bpmp *bpmp,
struct tegra186_cpufreq_cluster *cluster, unsigned int cluster_id,
int *num_rates)
@@ -231,6 +355,7 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev)
{
struct tegra186_cpufreq_data *data;
struct tegra_bpmp *bpmp;
+ struct device *cpu_dev;
unsigned int i = 0, err, edvd_offset;
int num_rates = 0;
u32 edvd_val, cpu;
@@ -256,9 +381,9 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev)
for (i = 0; i < TEGRA186_NUM_CLUSTERS; i++) {
struct tegra186_cpufreq_cluster *cluster = &data->clusters[i];
- cluster->table = init_vhint_table(pdev, bpmp, cluster, i, &num_rates);
- if (IS_ERR(cluster->table)) {
- err = PTR_ERR(cluster->table);
+ cluster->bpmp_lut = tegra_cpufreq_bpmp_read_lut(pdev, bpmp, cluster, i, &num_rates);
+ if (IS_ERR(cluster->bpmp_lut)) {
+ err = PTR_ERR(cluster->bpmp_lut);
goto put_bpmp;
} else if (!num_rates) {
err = -EINVAL;
@@ -267,7 +392,7 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev)
for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) {
if (data->cpus[cpu].bpmp_cluster_id == i) {
- edvd_val = cluster->table[num_rates - 1].driver_data;
+ edvd_val = cluster->bpmp_lut[num_rates - 1].driver_data;
edvd_offset = data->cpus[cpu].edvd_offset;
writel(edvd_val, data->regs + edvd_offset);
}
@@ -276,6 +401,19 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev)
tegra186_cpufreq_driver.driver_data = data;
+ /* Check for optional OPPv2 and interconnect paths on CPU0 to enable ICC scaling */
+ cpu_dev = get_cpu_device(0);
+ if (!cpu_dev) {
+ err = -EPROBE_DEFER;
+ goto put_bpmp;
+ }
+
+ if (dev_pm_opp_of_get_opp_desc_node(cpu_dev)) {
+ err = dev_pm_opp_of_find_icc_paths(cpu_dev, NULL);
+ if (!err)
+ data->icc_dram_bw_scaling = true;
+ }
+
err = cpufreq_register_driver(&tegra186_cpufreq_driver);
put_bpmp:
--
2.50.1
On 31-08-25, 22:33, Aaron Kling via B4 Relay wrote: > diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c > index bd94beebc4cc2fe6870e13ca55343cedb9729e99..f0abb44e2ed00a301161565e4c4f62cfed4a5814 100644 > --- a/drivers/cpufreq/tegra186-cpufreq.c > +++ b/drivers/cpufreq/tegra186-cpufreq.c > @@ -18,6 +18,7 @@ > #define EDVD_CORE_VOLT_FREQ_F_SHIFT 0 > #define EDVD_CORE_VOLT_FREQ_F_MASK 0xffff > #define EDVD_CORE_VOLT_FREQ_V_SHIFT 16 > +#define KHZ 1000 Can reuse: include/linux/units.h:#define HZ_PER_KHZ 1000UL > +static int tegra_cpufreq_set_bw(struct cpufreq_policy *policy, unsigned long freq_khz) > +{ > + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); > + struct dev_pm_opp *opp; > + struct device *dev; > + int ret; > + > + dev = get_cpu_device(policy->cpu); > + if (!dev) > + return -ENODEV; > + > + opp = dev_pm_opp_find_freq_exact(dev, freq_khz * KHZ, true); > + if (IS_ERR(opp)) > + return PTR_ERR(opp); > + > + ret = dev_pm_opp_set_opp(dev, opp); Won't it be easier to use dev_pm_opp_set_rate() instead ? > + if (ret) > + data->icc_dram_bw_scaling = false; > + > + dev_pm_opp_put(opp); The OPP core supports scope based cleanup helpers now, maybe use them to remove all these put calls. > + return ret; > +} > + > +static int tegra_cpufreq_init_cpufreq_table(struct cpufreq_policy *policy, > + struct cpufreq_frequency_table *bpmp_lut, > + struct cpufreq_frequency_table **opp_table) > +{ > + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); > + struct cpufreq_frequency_table *freq_table = NULL; > + struct cpufreq_frequency_table *pos; > + struct device *cpu_dev; > + struct dev_pm_opp *opp; > + unsigned long rate; > + int ret, max_opps; > + int j = 0; > + > + cpu_dev = get_cpu_device(policy->cpu); > + if (!cpu_dev) { > + pr_err("%s: failed to get cpu%d device\n", __func__, policy->cpu); > + return -ENODEV; > + } > + > + /* Initialize OPP table mentioned in operating-points-v2 property in DT */ > + ret = dev_pm_opp_of_add_table_indexed(cpu_dev, 0); > + if (!ret) { If you handle the error case here, then the below can move out of the if/else block. > + max_opps = dev_pm_opp_get_opp_count(cpu_dev); > + if (max_opps <= 0) { > + dev_err(cpu_dev, "Failed to add OPPs\n"); > + return max_opps; > + } > + > + /* Disable all opps and cross-validate against LUT later */ > + for (rate = 0; ; rate++) { Maybe using while(1) would be more readable ? > + opp = dev_pm_opp_find_freq_ceil(cpu_dev, &rate); > + if (IS_ERR(opp)) > + break; > + > + dev_pm_opp_put(opp); > + dev_pm_opp_disable(cpu_dev, rate); > + } > + } else { > + dev_err(cpu_dev, "Invalid or empty opp table in device tree\n"); > + data->icc_dram_bw_scaling = false; > + return ret; > + } > + > + freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_KERNEL); > + if (!freq_table) > + return -ENOMEM; > + > + /* > + * Cross check the frequencies from BPMP-FW LUT against the OPP's present in DT. > + * Enable only those DT OPP's which are present in LUT also. > + */ > + cpufreq_for_each_valid_entry(pos, bpmp_lut) { > + opp = dev_pm_opp_find_freq_exact(cpu_dev, pos->frequency * KHZ, false); > + if (IS_ERR(opp)) > + continue; > + > + dev_pm_opp_put(opp); > + > + ret = dev_pm_opp_enable(cpu_dev, pos->frequency * KHZ); > + if (ret < 0) > + return ret; > + > + freq_table[j].driver_data = pos->driver_data; > + freq_table[j].frequency = pos->frequency; > + j++; > + } > + > + freq_table[j].driver_data = pos->driver_data; > + freq_table[j].frequency = CPUFREQ_TABLE_END; > + > + *opp_table = &freq_table[0]; > + > + dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus); > + > + tegra_cpufreq_set_bw(policy, freq_table[j - 1].frequency); Maybe a comment on why exactly you are changing the freq here ? > + > + return ret; > +} > + > static int tegra186_cpufreq_init(struct cpufreq_policy *policy) > { > struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); > unsigned int cluster = data->cpus[policy->cpu].bpmp_cluster_id; > + struct cpufreq_frequency_table *freq_table; > + struct cpufreq_frequency_table *bpmp_lut; > u32 cpu; > + int ret; > > - policy->freq_table = data->clusters[cluster].table; > policy->cpuinfo.transition_latency = 300 * 1000; > policy->driver_data = NULL; > > @@ -85,6 +191,20 @@ static int tegra186_cpufreq_init(struct cpufreq_policy *policy) > cpumask_set_cpu(cpu, policy->cpus); > } > > + bpmp_lut = data->clusters[cluster].bpmp_lut; > + > + if (data->icc_dram_bw_scaling) { > + ret = tegra_cpufreq_init_cpufreq_table(policy, bpmp_lut, &freq_table); > + if (!ret) { > + policy->freq_table = freq_table; > + return 0; > + } > + } > + > + data->icc_dram_bw_scaling = false; > + policy->freq_table = bpmp_lut; > + pr_info("OPP tables missing from DT, EMC frequency scaling disabled\n"); > + > return 0; > } > > @@ -102,6 +222,10 @@ static int tegra186_cpufreq_set_target(struct cpufreq_policy *policy, > writel(edvd_val, data->regs + edvd_offset); > } > > + if (data->icc_dram_bw_scaling) > + tegra_cpufreq_set_bw(policy, tbl->frequency); > + > + > return 0; > } > > @@ -136,7 +260,7 @@ static struct cpufreq_driver tegra186_cpufreq_driver = { > .init = tegra186_cpufreq_init, > }; > > -static struct cpufreq_frequency_table *init_vhint_table( > +static struct cpufreq_frequency_table *tegra_cpufreq_bpmp_read_lut( > struct platform_device *pdev, struct tegra_bpmp *bpmp, > struct tegra186_cpufreq_cluster *cluster, unsigned int cluster_id, > int *num_rates) > @@ -231,6 +355,7 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) > { > struct tegra186_cpufreq_data *data; > struct tegra_bpmp *bpmp; > + struct device *cpu_dev; > unsigned int i = 0, err, edvd_offset; > int num_rates = 0; > u32 edvd_val, cpu; > @@ -256,9 +381,9 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) > for (i = 0; i < TEGRA186_NUM_CLUSTERS; i++) { > struct tegra186_cpufreq_cluster *cluster = &data->clusters[i]; > > - cluster->table = init_vhint_table(pdev, bpmp, cluster, i, &num_rates); > - if (IS_ERR(cluster->table)) { > - err = PTR_ERR(cluster->table); > + cluster->bpmp_lut = tegra_cpufreq_bpmp_read_lut(pdev, bpmp, cluster, i, &num_rates); > + if (IS_ERR(cluster->bpmp_lut)) { > + err = PTR_ERR(cluster->bpmp_lut); > goto put_bpmp; > } else if (!num_rates) { > err = -EINVAL; > @@ -267,7 +392,7 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) > > for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) { > if (data->cpus[cpu].bpmp_cluster_id == i) { > - edvd_val = cluster->table[num_rates - 1].driver_data; > + edvd_val = cluster->bpmp_lut[num_rates - 1].driver_data; > edvd_offset = data->cpus[cpu].edvd_offset; > writel(edvd_val, data->regs + edvd_offset); > } > @@ -276,6 +401,19 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) > > tegra186_cpufreq_driver.driver_data = data; > > + /* Check for optional OPPv2 and interconnect paths on CPU0 to enable ICC scaling */ > + cpu_dev = get_cpu_device(0); > + if (!cpu_dev) { > + err = -EPROBE_DEFER; > + goto put_bpmp; > + } > + > + if (dev_pm_opp_of_get_opp_desc_node(cpu_dev)) { > + err = dev_pm_opp_of_find_icc_paths(cpu_dev, NULL); > + if (!err) > + data->icc_dram_bw_scaling = true; > + } > + > err = cpufreq_register_driver(&tegra186_cpufreq_driver); > > put_bpmp: > > -- > 2.50.1 > -- viresh
On Mon, Sep 1, 2025 at 12:53 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > On 31-08-25, 22:33, Aaron Kling via B4 Relay wrote: > > diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c > > index bd94beebc4cc2fe6870e13ca55343cedb9729e99..f0abb44e2ed00a301161565e4c4f62cfed4a5814 100644 > > --- a/drivers/cpufreq/tegra186-cpufreq.c > > +++ b/drivers/cpufreq/tegra186-cpufreq.c > > @@ -18,6 +18,7 @@ > > #define EDVD_CORE_VOLT_FREQ_F_SHIFT 0 > > #define EDVD_CORE_VOLT_FREQ_F_MASK 0xffff > > #define EDVD_CORE_VOLT_FREQ_V_SHIFT 16 > > +#define KHZ 1000 > > Can reuse: > > include/linux/units.h:#define HZ_PER_KHZ 1000UL Will do. > > > +static int tegra_cpufreq_set_bw(struct cpufreq_policy *policy, unsigned long freq_khz) > > +{ > > + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); > > + struct dev_pm_opp *opp; > > + struct device *dev; > > + int ret; > > + > > + dev = get_cpu_device(policy->cpu); > > + if (!dev) > > + return -ENODEV; > > + > > + opp = dev_pm_opp_find_freq_exact(dev, freq_khz * KHZ, true); > > + if (IS_ERR(opp)) > > + return PTR_ERR(opp); > > + > > + ret = dev_pm_opp_set_opp(dev, opp); > > Won't it be easier to use dev_pm_opp_set_rate() instead ? I'm not very familiar with the opp system. If I read correctly, dev_pm_opp_set_rate() will round to the closest rate while this code will fail if the exact rate isn't found. This code is based on the existing tegra194-cpufreq driver. And I'm unsure if this was done for a reason. I have seen unexpected rates returned from clk_round_rate in the development of this topic, so that could be related. > > > + if (ret) > > + data->icc_dram_bw_scaling = false; > > + > > + dev_pm_opp_put(opp); > > The OPP core supports scope based cleanup helpers now, maybe use them > to remove all these put calls. I will look into this. > > > + return ret; > > +} > > + > > +static int tegra_cpufreq_init_cpufreq_table(struct cpufreq_policy *policy, > > + struct cpufreq_frequency_table *bpmp_lut, > > + struct cpufreq_frequency_table **opp_table) > > +{ > > + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); > > + struct cpufreq_frequency_table *freq_table = NULL; > > + struct cpufreq_frequency_table *pos; > > + struct device *cpu_dev; > > + struct dev_pm_opp *opp; > > + unsigned long rate; > > + int ret, max_opps; > > + int j = 0; > > + > > + cpu_dev = get_cpu_device(policy->cpu); > > + if (!cpu_dev) { > > + pr_err("%s: failed to get cpu%d device\n", __func__, policy->cpu); > > + return -ENODEV; > > + } > > + > > + /* Initialize OPP table mentioned in operating-points-v2 property in DT */ > > + ret = dev_pm_opp_of_add_table_indexed(cpu_dev, 0); > > + if (!ret) { > > If you handle the error case here, then the below can move out of the > if/else block. I'd prefer not to deviate too much from the tegra194-cpufreq code this is based on, so the drivers can be more easily kept in parity to each other. But I will look at making this a bit cleaner as per this and the next comment. > > > + max_opps = dev_pm_opp_get_opp_count(cpu_dev); > > + if (max_opps <= 0) { > > + dev_err(cpu_dev, "Failed to add OPPs\n"); > > + return max_opps; > > + } > > + > > + /* Disable all opps and cross-validate against LUT later */ > > + for (rate = 0; ; rate++) { > > Maybe using while(1) would be more readable ? > > > + opp = dev_pm_opp_find_freq_ceil(cpu_dev, &rate); > > + if (IS_ERR(opp)) > > + break; > > + > > + dev_pm_opp_put(opp); > > + dev_pm_opp_disable(cpu_dev, rate); > > + } > > + } else { > > + dev_err(cpu_dev, "Invalid or empty opp table in device tree\n"); > > + data->icc_dram_bw_scaling = false; > > + return ret; > > + } > > + > > + freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_KERNEL); > > + if (!freq_table) > > + return -ENOMEM; > > + > > + /* > > + * Cross check the frequencies from BPMP-FW LUT against the OPP's present in DT. > > + * Enable only those DT OPP's which are present in LUT also. > > + */ > > + cpufreq_for_each_valid_entry(pos, bpmp_lut) { > > + opp = dev_pm_opp_find_freq_exact(cpu_dev, pos->frequency * KHZ, false); > > + if (IS_ERR(opp)) > > + continue; > > + > > + dev_pm_opp_put(opp); > > + > > + ret = dev_pm_opp_enable(cpu_dev, pos->frequency * KHZ); > > + if (ret < 0) > > + return ret; > > + > > + freq_table[j].driver_data = pos->driver_data; > > + freq_table[j].frequency = pos->frequency; > > + j++; > > + } > > + > > + freq_table[j].driver_data = pos->driver_data; > > + freq_table[j].frequency = CPUFREQ_TABLE_END; > > + > > + *opp_table = &freq_table[0]; > > + > > + dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus); > > + > > + tegra_cpufreq_set_bw(policy, freq_table[j - 1].frequency); > > Maybe a comment on why exactly you are changing the freq here ? To my knowledge, this does not change any clocks. The intent here is to prime the interconnect data. In the pre-req series, there's a change that sets all clocks to max frequency during probe. Then my use case involves setting performance governor by default on some boots. During testing, I noticed that the interconnect data provided by this driver was all zeroes. Which led me to notice that set_bw is only called when the target frequency changes. Which wasn't happening because clocks were already set to max. If my understanding here is wrong or there's a better way to handle this, I will fix it. Aaron
+Sumit On 02-09-25, 12:21, Aaron Kling wrote: > On Mon, Sep 1, 2025 at 12:53 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > On 31-08-25, 22:33, Aaron Kling via B4 Relay wrote: > > > +static int tegra_cpufreq_set_bw(struct cpufreq_policy *policy, unsigned long freq_khz) > > > +{ > > > + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); > > > + struct dev_pm_opp *opp; > > > + struct device *dev; > > > + int ret; > > > + > > > + dev = get_cpu_device(policy->cpu); > > > + if (!dev) > > > + return -ENODEV; > > > + > > > + opp = dev_pm_opp_find_freq_exact(dev, freq_khz * KHZ, true); > > > + if (IS_ERR(opp)) > > > + return PTR_ERR(opp); > > > + > > > + ret = dev_pm_opp_set_opp(dev, opp); > > > > Won't it be easier to use dev_pm_opp_set_rate() instead ? > > I'm not very familiar with the opp system. If I read correctly, > dev_pm_opp_set_rate() will round to the closest rate while this code > will fail if the exact rate isn't found. This code is based on the > existing tegra194-cpufreq driver. And I'm unsure if this was done for > a reason. Sumit, any explanation for this ? > I have seen unexpected rates returned from clk_round_rate in > the development of this topic, so that could be related. Right, but we should end up configuring a valid rate from the OPP table. > > > +static int tegra_cpufreq_init_cpufreq_table(struct cpufreq_policy *policy, > > > + struct cpufreq_frequency_table *bpmp_lut, > > > + struct cpufreq_frequency_table **opp_table) > > > +{ > > > + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); > > > + struct cpufreq_frequency_table *freq_table = NULL; > > > + struct cpufreq_frequency_table *pos; > > > + struct device *cpu_dev; > > > + struct dev_pm_opp *opp; > > > + unsigned long rate; > > > + int ret, max_opps; > > > + int j = 0; > > > + > > > + cpu_dev = get_cpu_device(policy->cpu); > > > + if (!cpu_dev) { > > > + pr_err("%s: failed to get cpu%d device\n", __func__, policy->cpu); > > > + return -ENODEV; > > > + } > > > + > > > + /* Initialize OPP table mentioned in operating-points-v2 property in DT */ > > > + ret = dev_pm_opp_of_add_table_indexed(cpu_dev, 0); > > > + if (!ret) { > > > > If you handle the error case here, then the below can move out of the > > if/else block. > > I'd prefer not to deviate too much from the tegra194-cpufreq code this > is based on, so the drivers can be more easily kept in parity to each > other. I am not sure if that is really important here. The kernel normally contains code in this format: if (err) return; keep-working; If you want both the drivers to have similar code, then maybe that code should be moved to another file and used by both. But we shouldn't keep them same when we feel that we can do better. > But I will look at making this a bit cleaner as per this and > the next comment. Thanks. > > > + max_opps = dev_pm_opp_get_opp_count(cpu_dev); > > > + if (max_opps <= 0) { > > > + dev_err(cpu_dev, "Failed to add OPPs\n"); > > > + return max_opps; > > > + } > > > + > > > + /* Disable all opps and cross-validate against LUT later */ > > > + for (rate = 0; ; rate++) { > > > > Maybe using while(1) would be more readable ? > > > > > + opp = dev_pm_opp_find_freq_ceil(cpu_dev, &rate); > > > + if (IS_ERR(opp)) > > > + break; > > > + > > > + dev_pm_opp_put(opp); > > > + dev_pm_opp_disable(cpu_dev, rate); > > > + } > > > + } else { > > > + dev_err(cpu_dev, "Invalid or empty opp table in device tree\n"); > > > + data->icc_dram_bw_scaling = false; > > > + return ret; > > > + } > > > + > > > + freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_KERNEL); > > > + if (!freq_table) > > > + return -ENOMEM; > > > + > > > + /* > > > + * Cross check the frequencies from BPMP-FW LUT against the OPP's present in DT. > > > + * Enable only those DT OPP's which are present in LUT also. > > > + */ > > > + cpufreq_for_each_valid_entry(pos, bpmp_lut) { > > > + opp = dev_pm_opp_find_freq_exact(cpu_dev, pos->frequency * KHZ, false); > > > + if (IS_ERR(opp)) > > > + continue; > > > + > > > + dev_pm_opp_put(opp); > > > + > > > + ret = dev_pm_opp_enable(cpu_dev, pos->frequency * KHZ); > > > + if (ret < 0) > > > + return ret; > > > + > > > + freq_table[j].driver_data = pos->driver_data; > > > + freq_table[j].frequency = pos->frequency; > > > + j++; > > > + } > > > + > > > + freq_table[j].driver_data = pos->driver_data; > > > + freq_table[j].frequency = CPUFREQ_TABLE_END; > > > + > > > + *opp_table = &freq_table[0]; > > > + > > > + dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus); > > > + > > > + tegra_cpufreq_set_bw(policy, freq_table[j - 1].frequency); > > > > Maybe a comment on why exactly you are changing the freq here ? I meant bandwidth here. > To my knowledge, this does not change any clocks. The intent here is > to prime the interconnect data. In the pre-req series, there's a > change that sets all clocks to max frequency during probe. Then my use > case involves setting performance governor by default on some boots. > During testing, I noticed that the interconnect data provided by this > driver was all zeroes. Which led me to notice that set_bw is only > called when the target frequency changes. Which wasn't happening > because clocks were already set to max. If my understanding here is > wrong or there's a better way to handle this, I will fix it. There are a lot of synchronization issues here because we are trying to set clk and bw separately. I guess other variables, like regulator, level, etc. (if used) will also be out of sync here. I think the right way to fix this would be to call set-opp for the device, so all the variables are configured properly. -- viresh
On Wed, Sep 3, 2025 at 12:01 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > +Sumit > > On 02-09-25, 12:21, Aaron Kling wrote: > > On Mon, Sep 1, 2025 at 12:53 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > > On 31-08-25, 22:33, Aaron Kling via B4 Relay wrote: > > > > +static int tegra_cpufreq_set_bw(struct cpufreq_policy *policy, unsigned long freq_khz) > > > > +{ > > > > + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); > > > > + struct dev_pm_opp *opp; > > > > + struct device *dev; > > > > + int ret; > > > > + > > > > + dev = get_cpu_device(policy->cpu); > > > > + if (!dev) > > > > + return -ENODEV; > > > > + > > > > + opp = dev_pm_opp_find_freq_exact(dev, freq_khz * KHZ, true); > > > > + if (IS_ERR(opp)) > > > > + return PTR_ERR(opp); > > > > + > > > > + ret = dev_pm_opp_set_opp(dev, opp); > > > > > > Won't it be easier to use dev_pm_opp_set_rate() instead ? > > > > I'm not very familiar with the opp system. If I read correctly, > > dev_pm_opp_set_rate() will round to the closest rate while this code > > will fail if the exact rate isn't found. This code is based on the > > existing tegra194-cpufreq driver. And I'm unsure if this was done for > > a reason. > > Sumit, any explanation for this ? > > > I have seen unexpected rates returned from clk_round_rate in > > the development of this topic, so that could be related. > > Right, but we should end up configuring a valid rate from the OPP > table. > > > > > +static int tegra_cpufreq_init_cpufreq_table(struct cpufreq_policy *policy, > > > > + struct cpufreq_frequency_table *bpmp_lut, > > > > + struct cpufreq_frequency_table **opp_table) > > > > +{ > > > > + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); > > > > + struct cpufreq_frequency_table *freq_table = NULL; > > > > + struct cpufreq_frequency_table *pos; > > > > + struct device *cpu_dev; > > > > + struct dev_pm_opp *opp; > > > > + unsigned long rate; > > > > + int ret, max_opps; > > > > + int j = 0; > > > > + > > > > + cpu_dev = get_cpu_device(policy->cpu); > > > > + if (!cpu_dev) { > > > > + pr_err("%s: failed to get cpu%d device\n", __func__, policy->cpu); > > > > + return -ENODEV; > > > > + } > > > > + > > > > + /* Initialize OPP table mentioned in operating-points-v2 property in DT */ > > > > + ret = dev_pm_opp_of_add_table_indexed(cpu_dev, 0); > > > > + if (!ret) { > > > > > > If you handle the error case here, then the below can move out of the > > > if/else block. > > > > I'd prefer not to deviate too much from the tegra194-cpufreq code this > > is based on, so the drivers can be more easily kept in parity to each > > other. > > I am not sure if that is really important here. The kernel normally > contains code in this format: > > if (err) > return; > > keep-working; > > If you want both the drivers to have similar code, then maybe that > code should be moved to another file and used by both. But we > shouldn't keep them same when we feel that we can do better. > > > But I will look at making this a bit cleaner as per this and > > the next comment. > > Thanks. > > > > > + max_opps = dev_pm_opp_get_opp_count(cpu_dev); > > > > + if (max_opps <= 0) { > > > > + dev_err(cpu_dev, "Failed to add OPPs\n"); > > > > + return max_opps; > > > > + } > > > > + > > > > + /* Disable all opps and cross-validate against LUT later */ > > > > + for (rate = 0; ; rate++) { > > > > > > Maybe using while(1) would be more readable ? > > > > > > > + opp = dev_pm_opp_find_freq_ceil(cpu_dev, &rate); > > > > + if (IS_ERR(opp)) > > > > + break; > > > > + > > > > + dev_pm_opp_put(opp); > > > > + dev_pm_opp_disable(cpu_dev, rate); > > > > + } > > > > + } else { > > > > + dev_err(cpu_dev, "Invalid or empty opp table in device tree\n"); > > > > + data->icc_dram_bw_scaling = false; > > > > + return ret; > > > > + } > > > > + > > > > + freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_KERNEL); > > > > + if (!freq_table) > > > > + return -ENOMEM; > > > > + > > > > + /* > > > > + * Cross check the frequencies from BPMP-FW LUT against the OPP's present in DT. > > > > + * Enable only those DT OPP's which are present in LUT also. > > > > + */ > > > > + cpufreq_for_each_valid_entry(pos, bpmp_lut) { > > > > + opp = dev_pm_opp_find_freq_exact(cpu_dev, pos->frequency * KHZ, false); > > > > + if (IS_ERR(opp)) > > > > + continue; > > > > + > > > > + dev_pm_opp_put(opp); > > > > + > > > > + ret = dev_pm_opp_enable(cpu_dev, pos->frequency * KHZ); > > > > + if (ret < 0) > > > > + return ret; > > > > + > > > > + freq_table[j].driver_data = pos->driver_data; > > > > + freq_table[j].frequency = pos->frequency; > > > > + j++; > > > > + } > > > > + > > > > + freq_table[j].driver_data = pos->driver_data; > > > > + freq_table[j].frequency = CPUFREQ_TABLE_END; > > > > + > > > > + *opp_table = &freq_table[0]; > > > > + > > > > + dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus); > > > > + > > > > + tegra_cpufreq_set_bw(policy, freq_table[j - 1].frequency); > > > > > > Maybe a comment on why exactly you are changing the freq here ? > > I meant bandwidth here. > > > To my knowledge, this does not change any clocks. The intent here is > > to prime the interconnect data. In the pre-req series, there's a > > change that sets all clocks to max frequency during probe. Then my use > > case involves setting performance governor by default on some boots. > > During testing, I noticed that the interconnect data provided by this > > driver was all zeroes. Which led me to notice that set_bw is only > > called when the target frequency changes. Which wasn't happening > > because clocks were already set to max. If my understanding here is > > wrong or there's a better way to handle this, I will fix it. > > There are a lot of synchronization issues here because we are trying > to set clk and bw separately. I guess other variables, like regulator, > level, etc. (if used) will also be out of sync here. > > I think the right way to fix this would be to call set-opp for the > device, so all the variables are configured properly. That's what the tegra_cpufreq_set_bw function does. Matches the passed frequency in the opp table and calls dev_pm_opp_set_opp. Aaron
On 03/09/25 10:31, Viresh Kumar wrote: > External email: Use caution opening links or attachments > > > +Sumit > > On 02-09-25, 12:21, Aaron Kling wrote: >> On Mon, Sep 1, 2025 at 12:53 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: >>> On 31-08-25, 22:33, Aaron Kling via B4 Relay wrote: >>>> +static int tegra_cpufreq_set_bw(struct cpufreq_policy *policy, unsigned long freq_khz) >>>> +{ >>>> + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); >>>> + struct dev_pm_opp *opp; >>>> + struct device *dev; >>>> + int ret; >>>> + >>>> + dev = get_cpu_device(policy->cpu); >>>> + if (!dev) >>>> + return -ENODEV; >>>> + >>>> + opp = dev_pm_opp_find_freq_exact(dev, freq_khz * KHZ, true); >>>> + if (IS_ERR(opp)) >>>> + return PTR_ERR(opp); >>>> + >>>> + ret = dev_pm_opp_set_opp(dev, opp); >>> Won't it be easier to use dev_pm_opp_set_rate() instead ? >> I'm not very familiar with the opp system. If I read correctly, >> dev_pm_opp_set_rate() will round to the closest rate while this code >> will fail if the exact rate isn't found. This code is based on the >> existing tegra194-cpufreq driver. And I'm unsure if this was done for >> a reason. > Sumit, any explanation for this ? dev_pm_opp_set_rate() is additionally checking clock. In Tegra194/234 the clock is handled by BPMP-FW and CPU nodes don't have clocks property. So, the clk_round_rate() API causes NULL pointer. I see same in Tegra186/194 dtsi. Tegra210 and previous SoCs have clocks property in CPU node. Thank you, Sumit Gupta >> I have seen unexpected rates returned from clk_round_rate in >> the development of this topic, so that could be related. > Right, but we should end up configuring a valid rate from the OPP > table. > >>>> +static int tegra_cpufreq_init_cpufreq_table(struct cpufreq_policy *policy, >>>> + struct cpufreq_frequency_table *bpmp_lut, >>>> + struct cpufreq_frequency_table **opp_table) >>>> +{ >>>> + struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); >>>> + struct cpufreq_frequency_table *freq_table = NULL; >>>> + struct cpufreq_frequency_table *pos; >>>> + struct device *cpu_dev; >>>> + struct dev_pm_opp *opp; >>>> + unsigned long rate; >>>> + int ret, max_opps; >>>> + int j = 0; >>>> + >>>> + cpu_dev = get_cpu_device(policy->cpu); >>>> + if (!cpu_dev) { >>>> + pr_err("%s: failed to get cpu%d device\n", __func__, policy->cpu); >>>> + return -ENODEV; >>>> + } >>>> + >>>> + /* Initialize OPP table mentioned in operating-points-v2 property in DT */ >>>> + ret = dev_pm_opp_of_add_table_indexed(cpu_dev, 0); >>>> + if (!ret) { >>> If you handle the error case here, then the below can move out of the >>> if/else block. >> I'd prefer not to deviate too much from the tegra194-cpufreq code this >> is based on, so the drivers can be more easily kept in parity to each >> other. > I am not sure if that is really important here. The kernel normally > contains code in this format: > > if (err) > return; > > keep-working; > > If you want both the drivers to have similar code, then maybe that > code should be moved to another file and used by both. But we > shouldn't keep them same when we feel that we can do better. > >> But I will look at making this a bit cleaner as per this and >> the next comment. > Thanks. > >>>> + max_opps = dev_pm_opp_get_opp_count(cpu_dev); >>>> + if (max_opps <= 0) { >>>> + dev_err(cpu_dev, "Failed to add OPPs\n"); >>>> + return max_opps; >>>> + } >>>> + >>>> + /* Disable all opps and cross-validate against LUT later */ >>>> + for (rate = 0; ; rate++) { >>> Maybe using while(1) would be more readable ? >>> >>>> + opp = dev_pm_opp_find_freq_ceil(cpu_dev, &rate); >>>> + if (IS_ERR(opp)) >>>> + break; >>>> + >>>> + dev_pm_opp_put(opp); >>>> + dev_pm_opp_disable(cpu_dev, rate); >>>> + } >>>> + } else { >>>> + dev_err(cpu_dev, "Invalid or empty opp table in device tree\n"); >>>> + data->icc_dram_bw_scaling = false; >>>> + return ret; >>>> + } >>>> + >>>> + freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_KERNEL); >>>> + if (!freq_table) >>>> + return -ENOMEM; >>>> + >>>> + /* >>>> + * Cross check the frequencies from BPMP-FW LUT against the OPP's present in DT. >>>> + * Enable only those DT OPP's which are present in LUT also. >>>> + */ >>>> + cpufreq_for_each_valid_entry(pos, bpmp_lut) { >>>> + opp = dev_pm_opp_find_freq_exact(cpu_dev, pos->frequency * KHZ, false); >>>> + if (IS_ERR(opp)) >>>> + continue; >>>> + >>>> + dev_pm_opp_put(opp); >>>> + >>>> + ret = dev_pm_opp_enable(cpu_dev, pos->frequency * KHZ); >>>> + if (ret < 0) >>>> + return ret; >>>> + >>>> + freq_table[j].driver_data = pos->driver_data; >>>> + freq_table[j].frequency = pos->frequency; >>>> + j++; >>>> + } >>>> + >>>> + freq_table[j].driver_data = pos->driver_data; >>>> + freq_table[j].frequency = CPUFREQ_TABLE_END; >>>> + >>>> + *opp_table = &freq_table[0]; >>>> + >>>> + dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus); >>>> + >>>> + tegra_cpufreq_set_bw(policy, freq_table[j - 1].frequency); >>> Maybe a comment on why exactly you are changing the freq here ? > I meant bandwidth here. > >> To my knowledge, this does not change any clocks. The intent here is >> to prime the interconnect data. In the pre-req series, there's a >> change that sets all clocks to max frequency during probe. Then my use >> case involves setting performance governor by default on some boots. >> During testing, I noticed that the interconnect data provided by this >> driver was all zeroes. Which led me to notice that set_bw is only >> called when the target frequency changes. Which wasn't happening >> because clocks were already set to max. If my understanding here is >> wrong or there's a better way to handle this, I will fix it. > There are a lot of synchronization issues here because we are trying > to set clk and bw separately. I guess other variables, like regulator, > level, etc. (if used) will also be out of sync here. > > I think the right way to fix this would be to call set-opp for the > device, so all the variables are configured properly. > > -- > viresh
© 2016 - 2025 Red Hat, Inc.