Task can set its latency priority with sched_setattr(), which is then used
to set the latency offset of its sched_enity, but sched group entities
still have the default latency offset value.
Add a latency.nice field in cpu cgroup controller to set the latency
priority of the group similarly to sched_setattr(). The latency priority
is then used to set the offset of the sched_entities of the group.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
Documentation/admin-guide/cgroup-v2.rst | 10 ++++++++
kernel/sched/core.c | 30 +++++++++++++++++++++++
kernel/sched/fair.c | 32 +++++++++++++++++++++++++
kernel/sched/sched.h | 4 ++++
4 files changed, 76 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 1b3ed1c3b3f1..c08424593e4a 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1121,6 +1121,16 @@ All time durations are in microseconds.
values similar to the sched_setattr(2). This maximum utilization
value is used to clamp the task specific maximum utilization clamp.
+ cpu.latency.nice
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ The nice value is in the range [-20, 19].
+
+ This interface file allows reading and setting latency using the
+ same values used by sched_setattr(2). The latency_nice of a group is
+ used to limit the impact of the latency_nice of a task outside the
+ group.
Memory
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5b7e237d79b..093cc1af73dc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11059,6 +11059,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
{
return sched_group_set_idle(css_tg(css), idle);
}
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return LATENCY_TO_NICE(css_tg(css)->latency_prio);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ int prio;
+
+ if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
+ return -ERANGE;
+
+ prio = NICE_TO_LATENCY(nice);
+
+ return sched_group_set_latency(css_tg(css), prio);
+}
#endif
static struct cftype cpu_legacy_files[] = {
@@ -11073,6 +11092,11 @@ static struct cftype cpu_legacy_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -11290,6 +11314,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_idle_read_s64,
.write_s64 = cpu_idle_write_s64,
},
+ {
+ .name = "latency.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_latency_nice_read_s64,
+ .write_s64 = cpu_latency_nice_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 414b6243208b..dc7570f43ebe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12274,6 +12274,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
goto err;
tg->shares = NICE_0_LOAD;
+ tg->latency_prio = DEFAULT_LATENCY_PRIO;
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
@@ -12372,6 +12373,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
}
se->my_q = cfs_rq;
+
+ se->latency_offset = calc_latency_offset(tg->latency_prio);
+
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
@@ -12502,6 +12506,34 @@ int sched_group_set_idle(struct task_group *tg, long idle)
return 0;
}
+int sched_group_set_latency(struct task_group *tg, int prio)
+{
+ long latency_offset;
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->latency_prio == prio) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->latency_prio = prio;
+ latency_offset = calc_latency_offset(prio);
+
+ for_each_possible_cpu(i) {
+ struct sched_entity *se = tg->se[i];
+
+ WRITE_ONCE(se->latency_offset, latency_offset);
+ }
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3f42f86105d4..9a2e71231083 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -378,6 +378,8 @@ struct task_group {
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
+ /* latency priority of the group. */
+ int latency_prio;
#ifdef CONFIG_SMP
/*
@@ -488,6 +490,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
+extern int sched_group_set_latency(struct task_group *tg, int prio);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
--
2.34.1
Hello Vincent. On Fri, Feb 24, 2023 at 10:34:52AM +0100, Vincent Guittot <vincent.guittot@linaro.org> wrote: > + cpu.latency.nice > + A read-write single value file which exists on non-root > + cgroups. The default is "0". > + > + The nice value is in the range [-20, 19]. > + > + This interface file allows reading and setting latency using the > + same values used by sched_setattr(2). The latency_nice of a group is > + used to limit the impact of the latency_nice of a task outside the > + group. IIUC, the latency priority is taken into account when deciding between entitites at the same level (as in pick_next_entity() or check_preempt_wake()/find_matchig_se()). So this group attribute is relevant in context of siblings (i.e. like cpu.weight ~ bandwidth priority)? I'm thus confused when it's referred to as a limit (in vertical sense). You somewhat imply that in [1]: > Regarding the behavior, the rule remains the same that a sched_entity > attached to a cgroup will not get more (latency in this case) than > what has been set for the group entity. But I don't see where such a constraint would be implemented in the code. (My cursory understanding above tends to horizontal comparisons.) Could you please hint me which is right? Thanks, Michal [1] https://lore.kernel.org/r/CAKfTPtDu=c-psGnHkoWSPRWoh1Z0VBBfsN++g+krv4B1SJmFjg@mail.gmail.com/
On Fri, 24 Feb 2023 at 20:29, Michal Koutný <mkoutny@suse.com> wrote: > > Hello Vincent. > > On Fri, Feb 24, 2023 at 10:34:52AM +0100, Vincent Guittot <vincent.guittot@linaro.org> wrote: > > + cpu.latency.nice > > + A read-write single value file which exists on non-root > > + cgroups. The default is "0". > > + > > + The nice value is in the range [-20, 19]. > > + > > + This interface file allows reading and setting latency using the > > + same values used by sched_setattr(2). The latency_nice of a group is > > + used to limit the impact of the latency_nice of a task outside the > > + group. > > IIUC, the latency priority is taken into account when deciding between > entitites at the same level (as in pick_next_entity() or > check_preempt_wake()/find_matchig_se()). > > So this group attribute is relevant in context of siblings (i.e. like > cpu.weight ~ bandwidth priority)? Yes > > I'm thus confused when it's referred to as a limit (in vertical sense). > You somewhat imply that in [1]: There were discussions about adding more features that could make use of the latency nice. This comment mainly wants to describe how this would behave in case that we need to compre entities/tasks not at the same level. Regarding the current use of latency nice to set a latency offset, the problem doesn't appear because latency offset applies between entities at the same level as you mentioned above > > > Regarding the behavior, the rule remains the same that a sched_entity > > attached to a cgroup will not get more (latency in this case) than > > what has been set for the group entity. > > But I don't see where such a constraint would be implemented in the > code. (My cursory understanding above tends to horizontal comparisons.) > > Could you please hint me which is right? Does my explanation above make sense to you ? > > Thanks, > Michal > > [1] https://lore.kernel.org/r/CAKfTPtDu=c-psGnHkoWSPRWoh1Z0VBBfsN++g+krv4B1SJmFjg@mail.gmail.com/ >
On Mon, Feb 27, 2023 at 02:44:22PM +0100, Vincent Guittot <vincent.guittot@linaro.org> wrote: > Regarding the current use of latency nice to set a latency offset, the > problem doesn't appear because latency offset applies between entities > at the same level as you mentioned above Splendid, it turned out that way (latency nice analogous to bandwidth nice). > Does my explanation above make sense to you ? Yes, thank you. Thus, I'd like to propose avoiding the use of "limit" in this context and stress the horizontal scope. For example: > + This interface file allows reading and setting latency using the > + same values used by sched_setattr(2). The latency_nice of a group is > + used to limit the impact of the latency_nice of a task outside the > + group. + This interface file allows reading and setting latency using the + same values used by sched_setattr(2). The latency_nice of a group is + used to modify group members' latency with respect to sibling groups. Regards, Michal
On Mon, 27 Feb 2023 at 15:42, Michal Koutný <mkoutny@suse.com> wrote: > > On Mon, Feb 27, 2023 at 02:44:22PM +0100, Vincent Guittot <vincent.guittot@linaro.org> wrote: > > Regarding the current use of latency nice to set a latency offset, the > > problem doesn't appear because latency offset applies between entities > > at the same level as you mentioned above > > Splendid, it turned out that way (latency nice analogous to bandwidth > nice). > > > Does my explanation above make sense to you ? > > Yes, thank you. > > Thus, I'd like to propose avoiding the use of "limit" in this context and > stress the horizontal scope. For example: > > > + This interface file allows reading and setting latency using the > > + same values used by sched_setattr(2). The latency_nice of a group is > > + used to limit the impact of the latency_nice of a task outside the > > + group. > > + This interface file allows reading and setting latency using the > + same values used by sched_setattr(2). The latency_nice of a group is > + used to modify group members' latency with respect to sibling groups. That sounds reasonable to me. > > Regards, > Michal
© 2016 - 2025 Red Hat, Inc.