Some platforms support multiple low-power states for CPUs that can be used
when entering system-wide suspend. Currently we always select the deepest
possible state for the CPUs, which can break the system-wakeup latency
constraint that may be required for some use-cases.

Let's take the first step towards addressing this problem by introducing an
interface for user-space that allows us to specify the CPU system-wakeup
QoS limit. Subsequent changes will start taking the new QoS limit into
account.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
Changes in v2:
- Renamings to reflect that the QoS is limited to CPUs.
- Move code inside "CONFIG_CPU_IDLE".
---
include/linux/pm_qos.h | 5 ++
kernel/power/qos.c | 102 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 107 insertions(+)
diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
index 4a69d4af3ff8..bf7524d38933 100644
--- a/include/linux/pm_qos.h
+++ b/include/linux/pm_qos.h
@@ -149,6 +149,7 @@ bool cpu_latency_qos_request_active(struct pm_qos_request *req);
void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value);
void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value);
void cpu_latency_qos_remove_request(struct pm_qos_request *req);
+s32 cpu_wakeup_latency_qos_limit(void);
#else
static inline s32 cpu_latency_qos_limit(void) { return INT_MAX; }
static inline bool cpu_latency_qos_request_active(struct pm_qos_request *req)
@@ -160,6 +161,10 @@ static inline void cpu_latency_qos_add_request(struct pm_qos_request *req,
static inline void cpu_latency_qos_update_request(struct pm_qos_request *req,
s32 new_value) {}
static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {}
+static inline s32 cpu_wakeup_latency_qos_limit(void)
+{
+ return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
+}
#endif
#ifdef CONFIG_PM
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 4244b069442e..8c024d7dc43e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -415,6 +415,103 @@ static struct miscdevice cpu_latency_qos_miscdev = {
.fops = &cpu_latency_qos_fops,
};
+/* The CPU system wakeup latency QoS. */
+static struct pm_qos_constraints cpu_wakeup_latency_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list),
+ .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+ .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+ .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+ .type = PM_QOS_MIN,
+};
+
+/**
+ * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit.
+ *
+ * Returns the current CPU system wakeup latency QoS limit that may have been
+ * requested by user-space.
+ */
+s32 cpu_wakeup_latency_qos_limit(void)
+{
+ return pm_qos_read_value(&cpu_wakeup_latency_constraints);
+}
+
+static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp)
+{
+ struct pm_qos_request *req;
+
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->qos = &cpu_wakeup_latency_constraints;
+ pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ,
+ PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
+ filp->private_data = req;
+
+ return 0;
+}
+
+static int cpu_wakeup_latency_qos_release(struct inode *inode,
+ struct file *filp)
+{
+ struct pm_qos_request *req = filp->private_data;
+
+ filp->private_data = NULL;
+ pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ,
+ PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
+ kfree(req);
+
+ return 0;
+}
+
+static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints);
+
+ return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
+}
+
+static ssize_t cpu_wakeup_latency_qos_write(struct file *filp,
+ const char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ struct pm_qos_request *req = filp->private_data;
+ s32 value;
+
+ if (count == sizeof(s32)) {
+ if (copy_from_user(&value, buf, sizeof(s32)))
+ return -EFAULT;
+ } else {
+ int ret;
+
+ ret = kstrtos32_from_user(buf, count, 16, &value);
+ if (ret)
+ return ret;
+ }
+
+ if (value < 0)
+ return -EINVAL;
+
+ pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value);
+
+ return count;
+}
+
+static const struct file_operations cpu_wakeup_latency_qos_fops = {
+ .open = cpu_wakeup_latency_qos_open,
+ .release = cpu_wakeup_latency_qos_release,
+ .read = cpu_wakeup_latency_qos_read,
+ .write = cpu_wakeup_latency_qos_write,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice cpu_wakeup_latency_qos_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "cpu_wakeup_latency",
+ .fops = &cpu_wakeup_latency_qos_fops,
+};
+
static int __init cpu_latency_qos_init(void)
{
int ret;
@@ -424,6 +521,11 @@ static int __init cpu_latency_qos_init(void)
pr_err("%s: %s setup failed\n", __func__,
cpu_latency_qos_miscdev.name);
+ ret = misc_register(&cpu_wakeup_latency_qos_miscdev);
+ if (ret < 0)
+ pr_err("%s: %s setup failed\n", __func__,
+ cpu_wakeup_latency_qos_miscdev.name);
+
return ret;
}
late_initcall(cpu_latency_qos_init);
--
2.43.0
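
For reference, below is a minimal user-space sketch of how the new interface
is meant to be used, assuming the miscdevice shows up as
/dev/cpu_wakeup_latency (the path follows the miscdevice name registered
above; the 500 us value is purely illustrative):

/*
 * Minimal sketch (not part of the patch): request a 500 us CPU system-wakeup
 * latency limit via the new misc device. The write handler accepts either a
 * raw binary s32 or an ASCII number parsed as hex, and the request stays in
 * effect only for as long as the file descriptor is held open.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t limit_us = 500;	/* illustrative value */
	int fd = open("/dev/cpu_wakeup_latency", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Binary form: exactly sizeof(s32) bytes. */
	if (write(fd, &limit_us, sizeof(limit_us)) != (ssize_t)sizeof(limit_us)) {
		perror("write");
		close(fd);
		return 1;
	}

	pause();	/* keep the fd open for as long as the limit is needed */
	close(fd);	/* closing it removes the request again */
	return 0;
}

From a shell, something like "echo 1f4 > /dev/cpu_wakeup_latency" would also
be accepted, since the string path is parsed as hex, but the request is
dropped as soon as the shell closes the file again, which is exactly the
behaviour discussed in the review below.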
Hi Ulf,
On Oct 16, 2025 at 17:19:21 +0200, Ulf Hansson wrote:
> Some platforms supports multiple low-power states for CPUs that can be used
> when entering system-wide suspend. Currently we are always selecting the
> deepest possible state for the CPUs, which can break the system-wakeup
> latency constraint that may be required for some use-cases.
>
> Let's take the first step towards addressing this problem, by introducing
> an interface for user-space, that allows us to specify the CPU
> system-wakeup QoS limit. Subsequent changes will start taking into account
> the new QoS limit.
>
> Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
> ---
>
> Changes in v2:
> - Renamings to reflect the QoS are limited to CPUs.
> - Move code inside "CONFIG_CPU_IDLE".
>
> ---
> include/linux/pm_qos.h | 5 ++
> kernel/power/qos.c | 102 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 107 insertions(+)
>
> diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
> index 4a69d4af3ff8..bf7524d38933 100644
> --- a/include/linux/pm_qos.h
> +++ b/include/linux/pm_qos.h
> @@ -149,6 +149,7 @@ bool cpu_latency_qos_request_active(struct pm_qos_request *req);
> void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value);
> void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value);
> void cpu_latency_qos_remove_request(struct pm_qos_request *req);
> +s32 cpu_wakeup_latency_qos_limit(void);
> #else
> static inline s32 cpu_latency_qos_limit(void) { return INT_MAX; }
> static inline bool cpu_latency_qos_request_active(struct pm_qos_request *req)
> @@ -160,6 +161,10 @@ static inline void cpu_latency_qos_add_request(struct pm_qos_request *req,
> static inline void cpu_latency_qos_update_request(struct pm_qos_request *req,
> s32 new_value) {}
> static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {}
> +static inline s32 cpu_wakeup_latency_qos_limit(void)
> +{
> + return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
> +}
> #endif
>
> #ifdef CONFIG_PM
> diff --git a/kernel/power/qos.c b/kernel/power/qos.c
> index 4244b069442e..8c024d7dc43e 100644
> --- a/kernel/power/qos.c
> +++ b/kernel/power/qos.c
> @@ -415,6 +415,103 @@ static struct miscdevice cpu_latency_qos_miscdev = {
> .fops = &cpu_latency_qos_fops,
> };
>
> +/* The CPU system wakeup latency QoS. */
> +static struct pm_qos_constraints cpu_wakeup_latency_constraints = {
> + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list),
> + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
> + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
> + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
> + .type = PM_QOS_MIN,
> +};
> +
> +/**
> + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit.
> + *
> + * Returns the current CPU system wakeup latency QoS limit that may have been
> + * requested by user-space.
> + */
> +s32 cpu_wakeup_latency_qos_limit(void)
> +{
> + return pm_qos_read_value(&cpu_wakeup_latency_constraints);
> +}
> +
> +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp)
> +{
> + struct pm_qos_request *req;
> +
> + req = kzalloc(sizeof(*req), GFP_KERNEL);
> + if (!req)
> + return -ENOMEM;
> +
> + req->qos = &cpu_wakeup_latency_constraints;
> + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ,
> + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
> + filp->private_data = req;
> +
> + return 0;
> +}
> +
> +static int cpu_wakeup_latency_qos_release(struct inode *inode,
> + struct file *filp)
> +{
> + struct pm_qos_request *req = filp->private_data;
> +
> + filp->private_data = NULL;
> + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ,
> + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);

Please excuse the delay in reviewing these patches.
I was wondering why we have decided here, in release, to reset the
constraints set by a user. For example, even when I was testing the previous
revision locally I'd just commented out this release hook, since I
wanted to be able to just echo 0xABCD into /dev/cpu_wakeup_latency...

It seems overkill to me that a userspace program should be required to hold
this file open just to make sure the constraints are honoured for the
lifetime of the device. We should definitely give users the freedom to just
echo a value, and also to cat and read back the latency constraint from the
same place.
One other thing on my mind - probably unrelated to this specific series -
is that I think we should have some sysfs entry, either in
/sys/.../cpu0/cpuidle or s2idle/, where we can show the next feasible s2idle
state that the governor has chosen to enter based on the value set in
cpu_wakeup_latency.

Thoughts?

> + kfree(req);
> +
> + return 0;
> +}
> +
> +static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf,
> + size_t count, loff_t *f_pos)
> +{
> + s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints);
> +
> + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
> +}
> +
> +static ssize_t cpu_wakeup_latency_qos_write(struct file *filp,
> + const char __user *buf,
> + size_t count, loff_t *f_pos)
> +{
> + struct pm_qos_request *req = filp->private_data;
> + s32 value;
> +
> + if (count == sizeof(s32)) {
> + if (copy_from_user(&value, buf, sizeof(s32)))
> + return -EFAULT;
> + } else {
> + int ret;
> +
> + ret = kstrtos32_from_user(buf, count, 16, &value);
> + if (ret)
> + return ret;
> + }
> +
> + if (value < 0)
> + return -EINVAL;
> +
> + pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value);
> +
> + return count;
> +}
> +
> +static const struct file_operations cpu_wakeup_latency_qos_fops = {
> + .open = cpu_wakeup_latency_qos_open,
> + .release = cpu_wakeup_latency_qos_release,
> + .read = cpu_wakeup_latency_qos_read,
> + .write = cpu_wakeup_latency_qos_write,
> + .llseek = noop_llseek,
> +};
> +
> +static struct miscdevice cpu_wakeup_latency_qos_miscdev = {
> + .minor = MISC_DYNAMIC_MINOR,
> + .name = "cpu_wakeup_latency",
> + .fops = &cpu_wakeup_latency_qos_fops,
> +};
> +
> static int __init cpu_latency_qos_init(void)
> {
> int ret;
> @@ -424,6 +521,11 @@ static int __init cpu_latency_qos_init(void)
> pr_err("%s: %s setup failed\n", __func__,
> cpu_latency_qos_miscdev.name);
>
> + ret = misc_register(&cpu_wakeup_latency_qos_miscdev);
> + if (ret < 0)
> + pr_err("%s: %s setup failed\n", __func__,
> + cpu_wakeup_latency_qos_miscdev.name);
> +
> return ret;
> }
> late_initcall(cpu_latency_qos_init);
> --
> 2.43.0
>
--
Best regards,
Dhruva Gole
Texas Instruments Incorporated
On Wed, Oct 29, 2025 at 9:10 AM Dhruva Gole <d-gole@ti.com> wrote:
>
> Hi Ulf,
>
> On Oct 16, 2025 at 17:19:21 +0200, Ulf Hansson wrote:
> > Some platforms supports multiple low-power states for CPUs that can be used
> > when entering system-wide suspend. Currently we are always selecting the
> > deepest possible state for the CPUs, which can break the system-wakeup
> > latency constraint that may be required for some use-cases.
> >
> > Let's take the first step towards addressing this problem, by introducing
> > an interface for user-space, that allows us to specify the CPU
> > system-wakeup QoS limit. Subsequent changes will start taking into account
> > the new QoS limit.
> >
> > Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
> > ---
> >
> > Changes in v2:
> > - Renamings to reflect the QoS are limited to CPUs.
> > - Move code inside "CONFIG_CPU_IDLE".
> >
> > ---
> > include/linux/pm_qos.h | 5 ++
> > kernel/power/qos.c | 102 +++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 107 insertions(+)
> >
> > diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
> > index 4a69d4af3ff8..bf7524d38933 100644
> > --- a/include/linux/pm_qos.h
> > +++ b/include/linux/pm_qos.h
> > @@ -149,6 +149,7 @@ bool cpu_latency_qos_request_active(struct pm_qos_request *req);
> > void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value);
> > void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value);
> > void cpu_latency_qos_remove_request(struct pm_qos_request *req);
> > +s32 cpu_wakeup_latency_qos_limit(void);
> > #else
> > static inline s32 cpu_latency_qos_limit(void) { return INT_MAX; }
> > static inline bool cpu_latency_qos_request_active(struct pm_qos_request *req)
> > @@ -160,6 +161,10 @@ static inline void cpu_latency_qos_add_request(struct pm_qos_request *req,
> > static inline void cpu_latency_qos_update_request(struct pm_qos_request *req,
> > s32 new_value) {}
> > static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {}
> > +static inline s32 cpu_wakeup_latency_qos_limit(void)
> > +{
> > + return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
> > +}
> > #endif
> >
> > #ifdef CONFIG_PM
> > diff --git a/kernel/power/qos.c b/kernel/power/qos.c
> > index 4244b069442e..8c024d7dc43e 100644
> > --- a/kernel/power/qos.c
> > +++ b/kernel/power/qos.c
> > @@ -415,6 +415,103 @@ static struct miscdevice cpu_latency_qos_miscdev = {
> > .fops = &cpu_latency_qos_fops,
> > };
> >
> > +/* The CPU system wakeup latency QoS. */
> > +static struct pm_qos_constraints cpu_wakeup_latency_constraints = {
> > + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list),
> > + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
> > + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
> > + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
> > + .type = PM_QOS_MIN,
> > +};
> > +
> > +/**
> > + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit.
> > + *
> > + * Returns the current CPU system wakeup latency QoS limit that may have been
> > + * requested by user-space.
> > + */
> > +s32 cpu_wakeup_latency_qos_limit(void)
> > +{
> > + return pm_qos_read_value(&cpu_wakeup_latency_constraints);
> > +}
> > +
> > +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp)
> > +{
> > + struct pm_qos_request *req;
> > +
> > + req = kzalloc(sizeof(*req), GFP_KERNEL);
> > + if (!req)
> > + return -ENOMEM;
> > +
> > + req->qos = &cpu_wakeup_latency_constraints;
> > + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ,
> > + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
> > + filp->private_data = req;
> > +
> > + return 0;
> > +}
> > +
> > +static int cpu_wakeup_latency_qos_release(struct inode *inode,
> > + struct file *filp)
> > +{
> > + struct pm_qos_request *req = filp->private_data;
> > +
> > + filp->private_data = NULL;
> > + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ,
> > + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
>
> Please excuse the delay in reviewing these patches,
> I was wondering why we have decided here in release to reset the
> constraints set by a user. For eg. even when I was testing the previous
> revision locally I'd just commented out this release hook, since I
> wanted to be able to just echo 0xABCD into /dev/cpu_wakeup_latency...
If you want "fire and forget", that would be a different interface.
Device special files are not for that.
Cleaning up after closing a file descriptor is a safety measure and
CPU wakeup latency constraints are a big deal. Leaving leftover ones
behind dead processes is not a good idea.
> It seems an overkill to me that a userspace program be required to hold
> open this file just to make sure the constraints are honoured for the
> lifetime of the device. We should definitely give the freedom to just be
> able to echo and also be able to cat and read back from the same place
> about the latency constraint being set.
So you'd want a sysfs attribute here, but that has its own issues (the
last writer "wins", so if there are multiple users of it with
different needs in user space, things get tricky).
> One other thing on my mind is - and probably unrelated to this specific
> series, but I think we must have some sysfs entry either appear in
> /sys/.../cpu0/cpuidle or s2idle/ where we can show next feesible s2idle
> state that the governor has chosen to enter based on the value set in
> cpu_wakeup_latency.
Exit latency values for all states are exposed via sysfs. Since
s2idle always uses the deepest state it can use, it is quite
straightforward to figure out which of them will be used going
forward, given a specific latency constraint.
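
To make that concrete, here is a rough user-space sketch (assuming the
standard cpuidle sysfs layout under /sys/devices/system/cpu/cpu0/cpuidle/,
which is not part of this series) that walks cpu0's idle states and reports
the deepest one whose exit latency fits a given constraint:

/*
 * Rough sketch of the reasoning above: scan cpu0's idle states and report
 * the deepest one whose exit latency fits within a given wakeup latency
 * constraint in microseconds. The 500 us constraint is illustrative.
 */
#include <stdio.h>

int main(void)
{
	const long constraint_us = 500;	/* illustrative constraint */
	char path[128];
	char name[64];
	long latency;
	int pick = -1;

	for (int i = 0; ; i++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu0/cpuidle/state%d/latency", i);
		f = fopen(path, "r");
		if (!f)
			break;		/* no more states */
		if (fscanf(f, "%ld", &latency) == 1 && latency <= constraint_us)
			pick = i;	/* deeper states are listed later */
		fclose(f);
	}

	if (pick < 0) {
		printf("no state fits a %ld us constraint\n", constraint_us);
		return 0;
	}

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu0/cpuidle/state%d/name", pick);
	FILE *f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%63s", name) == 1)
			printf("deepest usable state: state%d (%s)\n", pick, name);
		fclose(f);
	}
	return 0;
}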
On Oct 29, 2025 at 15:28:22 +0100, Rafael J. Wysocki wrote:
> On Wed, Oct 29, 2025 at 9:10 AM Dhruva Gole <d-gole@ti.com> wrote:
> >
> > Hi Ulf,
> >
> > On Oct 16, 2025 at 17:19:21 +0200, Ulf Hansson wrote:
> > > Some platforms supports multiple low-power states for CPUs that can be used
> > > when entering system-wide suspend. Currently we are always selecting the
> > > deepest possible state for the CPUs, which can break the system-wakeup
> > > latency constraint that may be required for some use-cases.
> > >
> > > Let's take the first step towards addressing this problem, by introducing
> > > an interface for user-space, that allows us to specify the CPU
> > > system-wakeup QoS limit. Subsequent changes will start taking into account
> > > the new QoS limit.
> > >
> > > Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
> > > ---
> > >
> > > Changes in v2:
> > > - Renamings to reflect the QoS are limited to CPUs.
> > > - Move code inside "CONFIG_CPU_IDLE".
> > >
> > > ---
> > > include/linux/pm_qos.h | 5 ++
> > > kernel/power/qos.c | 102 +++++++++++++++++++++++++++++++++++++++++
> > > 2 files changed, 107 insertions(+)
> > >
> > > diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h
> > > index 4a69d4af3ff8..bf7524d38933 100644
> > > --- a/include/linux/pm_qos.h
> > > +++ b/include/linux/pm_qos.h
> > > @@ -149,6 +149,7 @@ bool cpu_latency_qos_request_active(struct pm_qos_request *req);
> > > void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value);
> > > void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value);
> > > void cpu_latency_qos_remove_request(struct pm_qos_request *req);
> > > +s32 cpu_wakeup_latency_qos_limit(void);
> > > #else
> > > static inline s32 cpu_latency_qos_limit(void) { return INT_MAX; }
> > > static inline bool cpu_latency_qos_request_active(struct pm_qos_request *req)
> > > @@ -160,6 +161,10 @@ static inline void cpu_latency_qos_add_request(struct pm_qos_request *req,
> > > static inline void cpu_latency_qos_update_request(struct pm_qos_request *req,
> > > s32 new_value) {}
> > > static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {}
> > > +static inline s32 cpu_wakeup_latency_qos_limit(void)
> > > +{
> > > + return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
> > > +}
> > > #endif
> > >
> > > #ifdef CONFIG_PM
> > > diff --git a/kernel/power/qos.c b/kernel/power/qos.c
> > > index 4244b069442e..8c024d7dc43e 100644
> > > --- a/kernel/power/qos.c
> > > +++ b/kernel/power/qos.c
> > > @@ -415,6 +415,103 @@ static struct miscdevice cpu_latency_qos_miscdev = {
> > > .fops = &cpu_latency_qos_fops,
> > > };
> > >
> > > +/* The CPU system wakeup latency QoS. */
> > > +static struct pm_qos_constraints cpu_wakeup_latency_constraints = {
> > > + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list),
> > > + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
> > > + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
> > > + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
> > > + .type = PM_QOS_MIN,
> > > +};
> > > +
> > > +/**
> > > + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit.
> > > + *
> > > + * Returns the current CPU system wakeup latency QoS limit that may have been
> > > + * requested by user-space.
> > > + */
> > > +s32 cpu_wakeup_latency_qos_limit(void)
> > > +{
> > > + return pm_qos_read_value(&cpu_wakeup_latency_constraints);
> > > +}
> > > +
> > > +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp)
> > > +{
> > > + struct pm_qos_request *req;
> > > +
> > > + req = kzalloc(sizeof(*req), GFP_KERNEL);
> > > + if (!req)
> > > + return -ENOMEM;
> > > +
> > > + req->qos = &cpu_wakeup_latency_constraints;
> > > + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ,
> > > + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
> > > + filp->private_data = req;
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static int cpu_wakeup_latency_qos_release(struct inode *inode,
> > > + struct file *filp)
> > > +{
> > > + struct pm_qos_request *req = filp->private_data;
> > > +
> > > + filp->private_data = NULL;
> > > + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ,
> > > + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
> >
> > Please excuse the delay in reviewing these patches,
> > I was wondering why we have decided here in release to reset the
> > constraints set by a user. For eg. even when I was testing the previous
> > revision locally I'd just commented out this release hook, since I
> > wanted to be able to just echo 0xABCD into /dev/cpu_wakeup_latency...
>
> If you want "fire and forget", that would be a different interface.
> Device special files are not for that.
>
> Cleaning up after closing a file descriptor is a safety measure and
> CPU wakeup latency constraints are a big deal. Leaving leftover ones
> behind dead processes is not a good idea.
Hmm okay ..
>
> > It seems an overkill to me that a userspace program be required to hold
> > open this file just to make sure the constraints are honoured for the
> > lifetime of the device. We should definitely give the freedom to just be
> > able to echo and also be able to cat and read back from the same place
> > about the latency constraint being set.
>
> So you'd want a sysfs attribute here, but that has its own issues (the
> last writer "wins", so if there are multiple users of it with
> different needs in user space, things get tricky).
sysfs makes sense; would it then make sense to have something like a
/sys/devices/system/cpu/cpu0/power/cpu_wakeup_latency entry?

IMHO userspace should decide how to manage its users and whom to allow to
set the latency constraint.
We already have a CPU latency QoS entry, for example, which is sysfs too.
>
> > One other thing on my mind is - and probably unrelated to this specific
> > series, but I think we must have some sysfs entry either appear in
> > /sys/.../cpu0/cpuidle or s2idle/ where we can show next feesible s2idle
> > state that the governor has chosen to enter based on the value set in
> > cpu_wakeup_latency.
>
> Exit latency values for all states are exposed via sysfs. Since
> s2idle always uses the deepest state it can use, it is quite
> straightforward to figure out which of them will be used going
> forward, given a specific latency constraint.
I disagree regarding the straightforward part. There could be multiple
domain hierarchies in a system, for example, and all these domains would
have their own set of domain-idle-states, each with its own entry, exit,
and residency latencies. While testing this series I myself have at times
been thoroughly confused about which idle-state the kernel actually picked,
and had to add prints just to figure that out.

When implementing these things for the first time, especially when one has
many complex domain idle-states, it would indeed help a lot if the kernel
could just advertise somewhere what the governor is going to pick as the
next s2idle state.

Also, I am not quite sure if these latencies are exposed in the
domain-idle-states scenario ...
I tried checking in /sys/kernel/debug/pm_genpd/XXX/ but I only see
these:
active_time current_state devices idle_states sub_domains total_idle_time
Maybe an additional s2idle_state or something appearing here is what I
was inclined toward.
--
Best regards,
Dhruva Gole
Texas Instruments Incorporated
[...]

> > > It seems an overkill to me that a userspace program be required to hold
> > > open this file just to make sure the constraints are honoured for the
> > > lifetime of the device. We should definitely give the freedom to just be
> > > able to echo and also be able to cat and read back from the same place
> > > about the latency constraint being set.
> >
> > So you'd want a sysfs attribute here, but that has its own issues (the
> > last writer "wins", so if there are multiple users of it with
> > different needs in user space, things get tricky).
>
> sysfs makes sense, then would it make sense to have something like a
> /sys/devices/system/cpu/cpu0/power/cpu_wakeup_latency entry?
>
> IMHO userspace should decide accordingly to manage it's users and how/whom
> to allow to set the latency constraint.
> We already have CPU latency QoS entry for example which is sysfs too.
>
> > > One other thing on my mind is - and probably unrelated to this specific
> > > series, but I think we must have some sysfs entry either appear in
> > > /sys/.../cpu0/cpuidle or s2idle/ where we can show next feesible s2idle
> > > state that the governor has chosen to enter based on the value set in
> > > cpu_wakeup_latency.
> >
> > Exit latency values for all states are exposed via sysfs. Since
> > s2idle always uses the deepest state it can use, it is quite
> > straightforward to figure out which of them will be used going
> > forward, given a specific latency constraint.
>
> I disagree regarding the straightforward part. There could be
> multiple domain heirarchy in a system for eg. and all these multiple
> domains would have their own set of domain-idle-states. All of them having
> their own entry, exit, and residency latencies. I myself while testing this
> series have been thoroughly confused at times what idle-state did the kernel
> actually pick this time, and had to add prints just to figure that out.

If I understand correctly, most of that confusion is because of the
misunderstanding of including the residency in the state selection in
regards to QoS.

Residency should not be accounted for, but only enter+exit latencies.

> When implementing these things for the first
> time, especially when one has complex and many a domain idle-states it
> would indeed help alot if the kernel could just advertise somewhere what
> the governor is going to pick as the next s2idle state.

The problem with advertising upfront is that the state selection is
done dynamically. It simply can't work.

> Also, I am not quite sure if these latencies are exposed in the
> domain-idle-states scenario ...
> I tried checking in /sys/kernel/debug/pm_genpd/XXX/ but I only see
> these:
> active_time current_state devices idle_states sub_domains total_idle_time
>
> Maybe an additional s2idle_state or something appearing here is what I
> was inclined toward.

That sounds like an idea that is worth exploring, if what you are
suggesting is to extend the idle state statistics. In principle we
want a new counter per idle state that indicates the number of times
we entered this state in s2idle, right?

While I was testing this feature, I used trace_printk - and afterward
it's easy to digest the trace buffer to see what happened.

Kind regards
Uffe
On Oct 31, 2025 at 14:47:29 +0100, Ulf Hansson wrote:
> [...]
> > I disagree regarding the straightforward part. There could be
> > multiple domain heirarchy in a system for eg. and all these multiple
> > domains would have their own set of domain-idle-states. All of them having
> > their own entry, exit, and residency latencies. I myself while testing this
> > series have been thoroughly confused at times what idle-state did the kernel
> > actually pick this time, and had to add prints just to figure that out.
>
> If I understand correctly, most of that confusion is because of the
> misunderstanding of including the residency in the state selection in
> regards to QoS.
>
> Residency should not be accounted for, but only enter+exit latencies.

Understood your point on the latencies, however the point remains that
in a multi domain, multi idle-states case, do we really have an easy way to
determine what the next choice of idle-state the governor is going to
make? We don't even expose the entry exit latencies in sysfs btw...

> > When implementing these things for the first
> > time, especially when one has complex and many a domain idle-states it
> > would indeed help alot if the kernel could just advertise somewhere what
> > the governor is going to pick as the next s2idle state.
>
> The problem with advertising upfront is that the state selection is
> done dynamically. It simply can't work.

I understand it might be done dynamically, but as IIUC the only
constraint being taken into account is really coming from userspace. I
don't think this series is taking into account or even exposing any API
to kernel world to modify the cpu wakeup latency (which I think you
should, but that's an entirely orthogonal discussion, don't want to mix
it here).

So as far as "dynamic" is concerned I feel if the userspace is having
control of which processes are setting the cpu wakeup constraints then
it's entirely okay for kernel to tell userspace that at any given moment
"this" is the next s2idle state I am going to pick if you do a system
s2idle right now.

> > Also, I am not quite sure if these latencies are exposed in the
> > domain-idle-states scenario ...
> > I tried checking in /sys/kernel/debug/pm_genpd/XXX/ but I only see
> > these:
> > active_time current_state devices idle_states sub_domains total_idle_time
> >
> > Maybe an additional s2idle_state or something appearing here is what I
> > was inclined toward.
>
> That sounds like an idea that is worth exploring, if what you are
> suggesting is to extend the idle state statistics. In principle we
> want a new counter per idle state that indicates the number of times
> we entered this state in s2idle, right?

Absolutely, having a "global" kind of a place to find out the s2idle
stats would really be useful.

> While I was testing this feature, I used trace_printk - and afterward
> it's easy to digest the trace buffer to see what happened.
>
> Kind regards
> Uffe

--
Best regards,
Dhruva Gole
Texas Instruments Incorporated
On Fri, 31 Oct 2025 at 19:37, Dhruva Gole <d-gole@ti.com> wrote:
>
> On Oct 31, 2025 at 14:47:29 +0100, Ulf Hansson wrote:
> > [...]
> > Residency should not be accounted for, but only enter+exit latencies.
>
> Understood your point on the latencies, however the point remains that
> in a multi domain, multi idle-states case, do we really have an easy way to
> determine what the next choice of idle-state the governor is going to
> make? We don't even expose the entry exit latencies in sysfs btw...

I agree, we should extend the sysfs/debugfs information about the
domain-idle-states with this too. Especially since we already have it
for the regular idle states that are managed by cpuidle.

> > > When implementing these things for the first
> > > time, especially when one has complex and many a domain idle-states it
> > > would indeed help alot if the kernel could just advertise somewhere what
> > > the governor is going to pick as the next s2idle state.
> >
> > The problem with advertising upfront is that the state selection is
> > done dynamically. It simply can't work.
>
> I understand it might be done dynamically, but as IIUC the only
> constraint being taken into account is really coming from userspace. I
> don't think this series is taking into account or even exposing any API
> to kernel world to modify the cpu wakeup latency (which I think you
> should, but that's an entirely orthogonal discussion, don't want to mix
> it here).
>
> So as far as "dynamic" is concerned I feel if the userspace is having
> control of which processes are setting the cpu wakeup constraints then
> it's entirely okay for kernel to tell userspace that at any given moment
> "this" is the next s2idle state I am going to pick if you do a system
> s2idle right now.
>
> > > Also, I am not quite sure if these latencies are exposed in the
> > > domain-idle-states scenario ...
> > > I tried checking in /sys/kernel/debug/pm_genpd/XXX/ but I only see
> > > these:
> > > active_time current_state devices idle_states sub_domains total_idle_time
> > >
> > > Maybe an additional s2idle_state or something appearing here is what I
> > > was inclined toward.
> >
> > That sounds like an idea that is worth exploring, if what you are
> > suggesting is to extend the idle state statistics. In principle we
> > want a new counter per idle state that indicates the number of times
> > we entered this state in s2idle, right?
>
> Absolutely, having a "global" kind of a place to find out the s2idle
> stats would really be useful.

For regular idle states that are managed by cpuidle, those have a
per-state directory called s2idle (if the state is supported for
s2idle), with usage/time counters.

That said, I agree, it's a good idea to add something similar for the
domain-idle-states that are managed by genpd. Let me think about it and
I will post a couple of patches that add this information about the
domain-idle-states.

Kind regards
Uffe
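
For the cpuidle-managed states mentioned above, those per-state s2idle
counters can be read with a small sketch like the following (assuming the
standard stateN/s2idle/{usage,time} sysfs attributes on cpu0; paths and the
loop bound are illustrative):

/*
 * Illustration of the per-state s2idle statistics referred to above: print
 * how many times each of cpu0's idle states was entered during
 * suspend-to-idle, and the accumulated time reported by sysfs.
 */
#include <stdio.h>

static int read_stat(int state, const char *attr, long *val)
{
	char path[160];
	FILE *f;
	int ok;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu0/cpuidle/state%d/s2idle/%s",
		 state, attr);
	f = fopen(path, "r");
	if (!f)
		return 0;
	ok = (fscanf(f, "%ld", val) == 1);
	fclose(f);
	return ok;
}

int main(void)
{
	for (int i = 0; ; i++) {
		long usage = 0, time = 0;

		if (!read_stat(i, "usage", &usage))
			break;	/* no more states, or s2idle stats not exposed */
		read_stat(i, "time", &time);
		printf("state%d: s2idle usage=%ld time=%ld\n", i, usage, time);
	}
	return 0;
}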