It is necessary to refer to a specific performance domain from
userspace, for example, when the energy model of a particular
performance domain is updated.
To this end, assign a unique ID to each performance domain so it can be
addressed, and manage the domains in a global linked list so a specific
one can be looked up by matching its ID. An IDA is used for ID
assignment, and a mutex protects the global list from concurrent
access.
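For illustration only, a lookup helper built on top of this list could
look roughly like the sketch below; the name em_pd_get_by_id() is
hypothetical and is not added by this patch:

    /* Sketch: look up a performance domain by its ID (hypothetical helper). */
    static struct em_perf_domain *em_pd_get_by_id(int id)
    {
            struct em_perf_domain *pd, *found = NULL;

            mutex_lock(&em_pd_list_mutex);
            list_for_each_entry(pd, &em_pd_list, node) {
                    if (pd->id == id) {
                            found = pd;
                            break;
                    }
            }
            mutex_unlock(&em_pd_list_mutex);

            /* The caller must ensure the domain is not unregistered concurrently. */
            return found;
    }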
Note that the mutex (em_pd_list_mutex) is not supposed to be held while
holding em_pd_mutex, to avoid an ABBA deadlock.
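As a sketch of that rule (assuming lockdep_assert_not_held() is
available in the target tree), a list walk could assert the ordering
explicitly before taking the list lock:

    /* Sketch: em_pd_mutex must not already be held when taking the list lock. */
    lockdep_assert_not_held(&em_pd_mutex);
    mutex_lock(&em_pd_list_mutex);
    /* ... iterate em_pd_list ... */
    mutex_unlock(&em_pd_list_mutex);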
Signed-off-by: Changwoo Min <changwoo@igalia.com>
---
include/linux/energy_model.h | 4 ++++
kernel/power/energy_model.c | 33 ++++++++++++++++++++++++++++++++-
2 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 61d50571ad88..43aa6153dc57 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -54,6 +54,8 @@ struct em_perf_table {
/**
* struct em_perf_domain - Performance domain
* @em_table: Pointer to the runtime modifiable em_perf_table
+ * @node: node in em_pd_list (in energy_model.c)
+ * @id: A unique ID number for each performance domain
* @nr_perf_states: Number of performance states
* @min_perf_state: Minimum allowed Performance State index
* @max_perf_state: Maximum allowed Performance State index
@@ -71,6 +73,8 @@ struct em_perf_table {
*/
struct em_perf_domain {
struct em_perf_table __rcu *em_table;
+ struct list_head node;
+ int id;
int nr_perf_states;
int min_perf_state;
int max_perf_state;
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 8df55397414a..3fe562b6230e 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -23,6 +23,16 @@
*/
static DEFINE_MUTEX(em_pd_mutex);
+/*
+ * Manage performance domains with IDs. One can iterate the performance domains
+ * through the list and pick one with their associated ID. The mutex serializes
+ * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be
+ * taken to avoid potential deadlock.
+ */
+static DEFINE_IDA(em_pd_ida);
+static LIST_HEAD(em_pd_list);
+static DEFINE_MUTEX(em_pd_list_mutex);
+
static void em_cpufreq_update_efficiencies(struct device *dev,
struct em_perf_state *table);
static void em_check_capacity_update(void);
@@ -396,7 +406,7 @@ static int em_create_pd(struct device *dev, int nr_states,
struct em_perf_table *em_table;
struct em_perf_domain *pd;
struct device *cpu_dev;
- int cpu, ret, num_cpus;
+ int cpu, ret, num_cpus, id;
if (_is_cpu_device(dev)) {
num_cpus = cpumask_weight(cpus);
@@ -420,6 +430,13 @@ static int em_create_pd(struct device *dev, int nr_states,
pd->nr_perf_states = nr_states;
+ INIT_LIST_HEAD(&pd->node);
+
+ id = ida_alloc(&em_pd_ida, GFP_KERNEL);
+ if (id < 0)
+ return -ENOMEM;
+ pd->id = id;
+
em_table = em_table_alloc(pd);
if (!em_table)
goto free_pd;
@@ -444,6 +461,7 @@ static int em_create_pd(struct device *dev, int nr_states,
kfree(em_table);
free_pd:
kfree(pd);
+ ida_free(&em_pd_ida, id);
return -EINVAL;
}
@@ -660,6 +678,13 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
unlock:
mutex_unlock(&em_pd_mutex);
+ if (_is_cpu_device(dev))
+ em_check_capacity_update();
+
+ mutex_lock(&em_pd_list_mutex);
+ list_add_tail(&dev->em_pd->node, &em_pd_list);
+ mutex_unlock(&em_pd_list_mutex);
+
return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);
@@ -678,6 +703,10 @@ void em_dev_unregister_perf_domain(struct device *dev)
if (_is_cpu_device(dev))
return;
+ mutex_lock(&em_pd_list_mutex);
+ list_del_init(&dev->em_pd->node);
+ mutex_unlock(&em_pd_list_mutex);
+
/*
* The mutex separates all register/unregister requests and protects
* from potential clean-up/setup issues in the debugfs directories.
@@ -689,6 +718,8 @@ void em_dev_unregister_perf_domain(struct device *dev)
em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
lockdep_is_held(&em_pd_mutex)));
+ ida_free(&em_pd_ida, dev->em_pd->id);
+
kfree(dev->em_pd);
dev->em_pd = NULL;
mutex_unlock(&em_pd_mutex);
--
2.51.0
Hi Chanwoo,
My apologies to delay on this topic.
On 9/21/25 04:19, Changwoo Min wrote:
> It is necessary to refer to a specific performance domain from a
> userspace. For example, the energy model of a particular performance
> domain is updated.
>
> To this end, assign a unique ID to each performance domain to address it,
Is this related to the sched_ext view on the EM that we cannot re-use
the allocated ID for the given domain?
> and manage them in a global linked list to look up a specific one by
> matching ID. IDA is used for ID assignment, and the mutex is used to
> protect the global list from concurrent access.
>
> Note that the mutex (em_pd_list_mutex) is not supposed to hold while
> holding em_pd_mutex to avoid ABBA deadlock.
This might be a tricky design, but I have seen that in some other
patches you've added lockdep, so we might have some safety net.
>
> Signed-off-by: Changwoo Min <changwoo@igalia.com>
> ---
> include/linux/energy_model.h | 4 ++++
> kernel/power/energy_model.c | 33 ++++++++++++++++++++++++++++++++-
> 2 files changed, 36 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
> index 61d50571ad88..43aa6153dc57 100644
> --- a/include/linux/energy_model.h
> +++ b/include/linux/energy_model.h
> @@ -54,6 +54,8 @@ struct em_perf_table {
> /**
> * struct em_perf_domain - Performance domain
> * @em_table: Pointer to the runtime modifiable em_perf_table
> + * @node: node in em_pd_list (in energy_model.c)
> + * @id: A unique ID number for each performance domain
> * @nr_perf_states: Number of performance states
> * @min_perf_state: Minimum allowed Performance State index
> * @max_perf_state: Maximum allowed Performance State index
> @@ -71,6 +73,8 @@ struct em_perf_table {
> */
> struct em_perf_domain {
> struct em_perf_table __rcu *em_table;
> + struct list_head node;
> + int id;
> int nr_perf_states;
> int min_perf_state;
> int max_perf_state;
> diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
> index 8df55397414a..3fe562b6230e 100644
> --- a/kernel/power/energy_model.c
> +++ b/kernel/power/energy_model.c
> @@ -23,6 +23,16 @@
> */
> static DEFINE_MUTEX(em_pd_mutex);
>
> +/*
> + * Manage performance domains with IDs. One can iterate the performance domains
> + * through the list and pick one with their associated ID. The mutex serializes
> + * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be
> + * taken to avoid potential deadlock.
> + */
> +static DEFINE_IDA(em_pd_ida);
> +static LIST_HEAD(em_pd_list);
> +static DEFINE_MUTEX(em_pd_list_mutex);
> +
> static void em_cpufreq_update_efficiencies(struct device *dev,
> struct em_perf_state *table);
> static void em_check_capacity_update(void);
> @@ -396,7 +406,7 @@ static int em_create_pd(struct device *dev, int nr_states,
> struct em_perf_table *em_table;
> struct em_perf_domain *pd;
> struct device *cpu_dev;
> - int cpu, ret, num_cpus;
> + int cpu, ret, num_cpus, id;
>
> if (_is_cpu_device(dev)) {
> num_cpus = cpumask_weight(cpus);
> @@ -420,6 +430,13 @@ static int em_create_pd(struct device *dev, int nr_states,
>
> pd->nr_perf_states = nr_states;
>
> + INIT_LIST_HEAD(&pd->node);
> +
> + id = ida_alloc(&em_pd_ida, GFP_KERNEL);
> + if (id < 0)
> + return -ENOMEM;
> + pd->id = id;
> +
> em_table = em_table_alloc(pd);
> if (!em_table)
> goto free_pd;
> @@ -444,6 +461,7 @@ static int em_create_pd(struct device *dev, int nr_states,
> kfree(em_table);
> free_pd:
> kfree(pd);
> + ida_free(&em_pd_ida, id);
> return -EINVAL;
> }
>
> @@ -660,6 +678,13 @@ int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
> unlock:
> mutex_unlock(&em_pd_mutex);
>
> + if (_is_cpu_device(dev))
> + em_check_capacity_update();
> +
> + mutex_lock(&em_pd_list_mutex);
> + list_add_tail(&dev->em_pd->node, &em_pd_list);
> + mutex_unlock(&em_pd_list_mutex);
> +
> return ret;
> }
> EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);
> @@ -678,6 +703,10 @@ void em_dev_unregister_perf_domain(struct device *dev)
> if (_is_cpu_device(dev))
> return;
>
> + mutex_lock(&em_pd_list_mutex);
> + list_del_init(&dev->em_pd->node);
> + mutex_unlock(&em_pd_list_mutex);
> +
> /*
> * The mutex separates all register/unregister requests and protects
> * from potential clean-up/setup issues in the debugfs directories.
> @@ -689,6 +718,8 @@ void em_dev_unregister_perf_domain(struct device *dev)
> em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
> lockdep_is_held(&em_pd_mutex)));
>
> + ida_free(&em_pd_ida, dev->em_pd->id);
> +
> kfree(dev->em_pd);
> dev->em_pd = NULL;
> mutex_unlock(&em_pd_mutex);
Apart from that, the code itself looks sane.
Regards,
Lukasz
On 10/6/25 09:17, Lukasz Luba wrote:
> Hi Chanwoo,
>
> My apologies to delay on this topic.
>
> On 9/21/25 04:19, Changwoo Min wrote:
>> It is necessary to refer to a specific performance domain from a
>> userspace. For example, the energy model of a particular performance
>> domain is updated.
>>
>> To this end, assign a unique ID to each performance domain to address it,
>
> Is this related to the sched_ext view on the EM that we cannot re-use
> the allocated ID for the given domain?

Ignore that comment, I know the need now.

Although, there is a small code sneak below...

[..]

>> @@ -660,6 +678,13 @@ int em_dev_register_pd_no_update(struct device
>> *dev, unsigned int nr_states,
>>   unlock:
>>       mutex_unlock(&em_pd_mutex);
>> +    if (_is_cpu_device(dev))
>> +        em_check_capacity_update();
>> +

It doesn't belong to this $subject at all. It looks like
it was left from some of your local changes, isn't it?
Hi Lukasz,

On 10/6/25 21:24, Lukasz Luba wrote:
> On 10/6/25 09:17, Lukasz Luba wrote:
>> Hi Chanwoo,
>>
>> My apologies to delay on this topic.

Thank you for finding time and making an effort for the review!
I understand that it is not always possible to make time for review. :-)

>> On 9/21/25 04:19, Changwoo Min wrote:
>>> It is necessary to refer to a specific performance domain from a
>>> userspace. For example, the energy model of a particular performance
>>> domain is updated.
>>>
>>> To this end, assign a unique ID to each performance domain to address
>>> it,
>>
>> Is this related to the sched_ext view on the EM that we cannot re-use
>> the allocated ID for the given domain?
>
> Ignore that comment, I know the need now.
>
> Although, there is a small code sneak below...
>
> [..]
>
>>> @@ -660,6 +678,13 @@ int em_dev_register_pd_no_update(struct device
>>> *dev, unsigned int nr_states,
>>>   unlock:
>>>       mutex_unlock(&em_pd_mutex);
>>> +    if (_is_cpu_device(dev))
>>> +        em_check_capacity_update();
>>> +
>
> It doesn't belong to this $subject at all. It looks like
> it was left from some of your local changes, isn't it?

You are right. The code is redundant since the same check is done at
em_dev_register_perf_domain(). It is the side-effect of a bad rebase.
I will remove this in the next version.

Regards,
Changwoo Min