From: Valentin Schneider <vschneid@redhat.com>
Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
add a task work to them so that when those tasks return to user, the
actual throttle/dequeue can happen.
Note that since the throttle/dequeue always happens on a task basis when
it returns to user, it's no longer necessary for check_cfs_rq_runtime()
to return a value and pick_task_fair() acts differently according to that
return value, so check_cfs_rq_runtime() is changed to not return a
value.
[aaronlu: extracted from Valentin's original patches.
Fixed a problem where curr is not in the timeline tree and has to be
dealt with explicitly;
Made check_cfs_rq_runtime() void.]
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 201 ++++++++++++++++++++++++-------------------
kernel/sched/sched.h | 1 +
2 files changed, 112 insertions(+), 90 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 60eb5329bf526..ab403ff7d53c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5607,7 +5607,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
return se;
}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -5832,8 +5832,49 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static void throttle_cfs_rq_work(struct callback_head *work)
{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ struct rq_flags rf;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ rq = task_rq_lock(p, &rf);
+
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ goto out_unlock;
+
+ /*
+ * If not in limbo, then either replenish has happened or this task got
+ * migrated out of the throttled cfs_rq, move along
+ */
+ if (!cfs_rq->throttle_count)
+ goto out_unlock;
+
+ update_rq_clock(rq);
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+ resched_curr(rq);
+
+out_unlock:
+ task_rq_unlock(rq, p, &rf);
}
void init_cfs_throttle_work(struct task_struct *p)
@@ -5873,32 +5914,81 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ if (task_has_throttle_work(p))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p;
+ struct rb_node *node;
+
+ cfs_rq->throttle_count++;
+ if (cfs_rq->throttle_count > 1)
+ return 0;
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
- SCHED_WARN_ON(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
+ SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_queued)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
+ /*
+ * rq_lock is held, current is (obviously) executing this in kernelspace.
+ *
+ * All other tasks enqueued on this rq have their saved PC at the
+ * context switch, so they will go through the kernel before returning
+ * to userspace. Thus, there are no tasks-in-userspace to handle, just
+ * install the task_work on all of them.
+ */
+ node = rb_first(&cfs_rq->tasks_timeline.rb_root);
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ if (!entity_is_task(se))
+ goto next;
+
+ p = task_of(se);
+ task_throttle_setup_work(p);
+next:
+ node = rb_next(node);
+ }
+
+ /* curr is not in the timeline tree */
+ if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
+ p = task_of(cfs_rq->curr);
+ task_throttle_setup_work(p);
}
- cfs_rq->throttle_count++;
return 0;
}
-static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta, dequeue = 1;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5919,74 +6009,13 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_b->lock);
if (!dequeue)
- return false; /* Throttle no longer required. */
-
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ return; /* Throttle no longer required. */
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, queued_delta);
-
- /* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
- dl_server_stop(&rq->fair_server);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5995,7 +6024,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
SCHED_WARN_ON(cfs_rq->throttled_clock);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
- return true;
+ return;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -6471,22 +6500,22 @@ static void sync_throttle(struct task_group *tg, int cpu)
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return false;
+ return;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return false;
+ return;
/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return true;
+ return;
- return throttle_cfs_rq(cfs_rq);
+ throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -6582,6 +6611,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -7117,10 +7147,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
-
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
slice = cfs_rq_min_slice(cfs_rq);
@@ -7157,10 +7183,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
}
sub_nr_running(rq, h_nr_queued);
@@ -8869,8 +8891,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8bfa3d708081..5c2af5a70163c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -742,6 +742,7 @@ struct cfs_rq {
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
+ struct list_head throttled_limbo_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
--
2.39.5
Hi Aaron,
> static int tg_throttle_down(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> + struct task_struct *p;
> + struct rb_node *node;
> +
> + cfs_rq->throttle_count++;
> + if (cfs_rq->throttle_count > 1)
> + return 0;
>
> /* group is entering throttled state, stop time */
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> + list_del_leaf_cfs_rq(cfs_rq);
>
> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> - if (cfs_rq->nr_queued)
> - cfs_rq->throttled_clock_self = rq_clock(rq);
> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> + if (cfs_rq->nr_queued)
> + cfs_rq->throttled_clock_self = rq_clock(rq);
> +
> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> + /*
> + * rq_lock is held, current is (obviously) executing this in kernelspace.
> + *
> + * All other tasks enqueued on this rq have their saved PC at the
> + * context switch, so they will go through the kernel before returning
> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> + * install the task_work on all of them.
> + */
> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> + while (node) {
> + struct sched_entity *se = __node_2_se(node);
> +
> + if (!entity_is_task(se))
> + goto next;
> +
> + p = task_of(se);
> + task_throttle_setup_work(p);
> +next:
> + node = rb_next(node);
> + }
I'd like to strongly push back on this approach. This adds quite a lot
of extra computation to an already expensive path
(throttle/unthrottle). e.g. this function is part of the cgroup walk
and so it is already O(cgroups) for the number of cgroups in the
hierarchy being throttled. This gets even worse when you consider that
we repeat this separately across all the cpus that the
bandwidth-constrained group is running on. Keep in mind that
throttle/unthrottle is done with rq lock held and IRQ disabled.
In K Prateek's last RFC, there was discussion of using context
tracking; did you consider that approach any further? We could keep
track of the number of threads within a cgroup hierarchy currently in
kernel mode (similar to h_nr_runnable), and thus simplify down the
throttling code here.
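To sketch what I mean (purely illustrative, every name below is made up,
nothing like this exists in the kernel or in this series): the context
tracking entry/exit hooks would bump a per-hierarchy counter on kernel
entry and drop it on exit, so the throttle path only has to look at that
counter instead of visiting every queued task:

struct cfs_rq_kernel_stats {
	atomic_t	nr_in_kernel;	/* tasks of this hierarchy now in kernel mode */
};

/* would be called from the user->kernel entry hook, for p's hierarchy */
static inline void cfs_rq_note_kernel_entry(struct cfs_rq_kernel_stats *ks)
{
	atomic_inc(&ks->nr_in_kernel);
}

/* would be called from the kernel->user exit hook */
static inline void cfs_rq_note_kernel_exit(struct cfs_rq_kernel_stats *ks)
{
	atomic_dec(&ks->nr_in_kernel);
}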
Best,
Josh
Hello Josh,
On 3/16/2025 8:55 AM, Josh Don wrote:
> Hi Aaron,
>
>> static int tg_throttle_down(struct task_group *tg, void *data)
>> {
>> struct rq *rq = data;
>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>> + struct task_struct *p;
>> + struct rb_node *node;
>> +
>> + cfs_rq->throttle_count++;
>> + if (cfs_rq->throttle_count > 1)
>> + return 0;
>>
>> /* group is entering throttled state, stop time */
>> - if (!cfs_rq->throttle_count) {
>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>> - list_del_leaf_cfs_rq(cfs_rq);
>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>> + list_del_leaf_cfs_rq(cfs_rq);
>>
>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>> - if (cfs_rq->nr_queued)
>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>> + if (cfs_rq->nr_queued)
>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>> +
>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>> + /*
>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>> + *
>> + * All other tasks enqueued on this rq have their saved PC at the
>> + * context switch, so they will go through the kernel before returning
>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>> + * install the task_work on all of them.
>> + */
>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>> + while (node) {
>> + struct sched_entity *se = __node_2_se(node);
>> +
>> + if (!entity_is_task(se))
>> + goto next;
>> +
>> + p = task_of(se);
>> + task_throttle_setup_work(p);
>> +next:
>> + node = rb_next(node);
>> + }
>
> I'd like to strongly push back on this approach. This adds quite a lot
> of extra computation to an already expensive path
> (throttle/unthrottle). e.g. this function is part of the cgroup walk
> and so it is already O(cgroups) for the number of cgroups in the
> hierarchy being throttled. This gets even worse when you consider that
> we repeat this separately across all the cpus that the
> bandwidth-constrained group is running on. Keep in mind that
> throttle/unthrottle is done with rq lock held and IRQ disabled.
On this note, do you have any statistics for how many tasks are
throttled per-CPU on your system? The info from:
sudo ./bpftrace -e "kprobe:throttle_cfs_rq { \
@nr_queued[((struct cfs_rq *)$1)->h_nr_queued] = count(); \
@nr_runnable[((struct cfs_rq *)$1)->h_nr_runnable] = count(); \
}"
could help estimate the worst-case times we can expect with per-task
throttling.
>
> In K Prateek's last RFC, there was discussion of using context
> tracking; did you consider that approach any further? We could keep
> track of the number of threads within a cgroup hierarchy currently in
> kernel mode (similar to h_nr_runnable), and thus simplify down the
> throttling code here.
Based on Chengming's latest suggestion, we can keep tg_throttle_down()
as is and tag the task at pick using throttled_hierarchy() which will
work too.
Since it'll most likely end up doing:
if (throttled_hierarchy(cfs_rq_of(&p->se)))
task_throttle_setup_work(p);
The only overhead for users not using CFS bandwidth is just the
cfs_rq->throttle_count check. If it was set, you are simply moving the
overhead of setting up the throttle work from the throttle path to the
pick path, and only for the throttled tasks; it also avoids adding
unnecessary work to tasks that may never get picked before unthrottle.
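To make it concrete, a rough sketch of where the check could live (my
assumption of the shape, not code from this series;
task_throttle_setup_work() is the helper this patch introduces):

static struct task_struct *pick_task_fair(struct rq *rq)
{
	struct sched_entity *se;
	struct task_struct *p;

	/* ... existing loop that walks the cfs_rq hierarchy and picks se ... */

	p = task_of(se);

	/*
	 * Tag the picked task here instead of walking every queued task in
	 * tg_throttle_down(): users without CFS bandwidth only pay the
	 * cfs_rq->throttle_count check inside throttled_hierarchy().
	 */
	if (throttled_hierarchy(cfs_rq_of(se)))
		task_throttle_setup_work(p);

	return p;
}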
>
> Best,
> Josh
--
Thanks and Regards,
Prateek
Hi Josh,
On Sat, Mar 15, 2025 at 08:25:53PM -0700, Josh Don wrote:
> Hi Aaron,
>
> > static int tg_throttle_down(struct task_group *tg, void *data)
> > {
> > struct rq *rq = data;
> > struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> > + struct task_struct *p;
> > + struct rb_node *node;
> > +
> > + cfs_rq->throttle_count++;
> > + if (cfs_rq->throttle_count > 1)
> > + return 0;
> >
> > /* group is entering throttled state, stop time */
> > - if (!cfs_rq->throttle_count) {
> > - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > - list_del_leaf_cfs_rq(cfs_rq);
> > + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > + list_del_leaf_cfs_rq(cfs_rq);
> >
> > - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > - if (cfs_rq->nr_queued)
> > - cfs_rq->throttled_clock_self = rq_clock(rq);
> > + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > + if (cfs_rq->nr_queued)
> > + cfs_rq->throttled_clock_self = rq_clock(rq);
> > +
> > + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> > + /*
> > + * rq_lock is held, current is (obviously) executing this in kernelspace.
> > + *
> > + * All other tasks enqueued on this rq have their saved PC at the
> > + * context switch, so they will go through the kernel before returning
> > + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> > + * install the task_work on all of them.
> > + */
> > + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> > + while (node) {
> > + struct sched_entity *se = __node_2_se(node);
> > +
> > + if (!entity_is_task(se))
> > + goto next;
> > +
> > + p = task_of(se);
> > + task_throttle_setup_work(p);
> > +next:
> > + node = rb_next(node);
> > + }
>
> I'd like to strongly push back on this approach. This adds quite a lot
> of extra computation to an already expensive path
> (throttle/unthrottle). e.g. this function is part of the cgroup walk
> and so it is already O(cgroups) for the number of cgroups in the
> hierarchy being throttled. This gets even worse when you consider that
> we repeat this separately across all the cpus that the
> bandwidth-constrained group is running on. Keep in mind that
> throttle/unthrottle is done with rq lock held and IRQ disabled.
Agree that it's not good to do this O(nr_task) thing in
throttle/unthrottle path. As Chengming mentioned, throttle path can
avoid this but unthrottle path does not have an easy way to avoid this.
> In K Prateek's last RFC, there was discussion of using context
> tracking; did you consider that approach any further? We could keep
I haven't tried that approach yet.
> track of the number of threads within a cgroup hierarchy currently in
> kernel mode (similar to h_nr_runnable), and thus simplify down the
> throttling code here.
My initial feeling is the implementation looks pretty complex. If it can
be simplified somehow, that would be great.
Best regards,
Aaron
On Wed, Mar 19, 2025 at 6:43 AM Aaron Lu <ziqianlu@bytedance.com> wrote:
[snip]
> > track of the number of threads within a cgroup hierarchy currently in
> > kernel mode (similar to h_nr_runnable), and thus simplify down the
> > throttling code here.
>
> My initial feeling is the implementation looks pretty complex. If it can
> be simplified somehow, that would be great.

Xi (attached to this message) spoke about this at LPC last year; Xi can
you describe your approach in a little bit more concrete detail?

Best,
Josh
On 2025/3/16 11:25, Josh Don wrote:
> Hi Aaron,
>
>> static int tg_throttle_down(struct task_group *tg, void *data)
>> {
>> struct rq *rq = data;
>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>> + struct task_struct *p;
>> + struct rb_node *node;
>> +
>> + cfs_rq->throttle_count++;
>> + if (cfs_rq->throttle_count > 1)
>> + return 0;
>>
>> /* group is entering throttled state, stop time */
>> - if (!cfs_rq->throttle_count) {
>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>> - list_del_leaf_cfs_rq(cfs_rq);
>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>> + list_del_leaf_cfs_rq(cfs_rq);
>>
>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>> - if (cfs_rq->nr_queued)
>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>> + if (cfs_rq->nr_queued)
>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>> +
>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>> + /*
>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>> + *
>> + * All other tasks enqueued on this rq have their saved PC at the
>> + * context switch, so they will go through the kernel before returning
>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>> + * install the task_work on all of them.
>> + */
>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>> + while (node) {
>> + struct sched_entity *se = __node_2_se(node);
>> +
>> + if (!entity_is_task(se))
>> + goto next;
>> +
>> + p = task_of(se);
>> + task_throttle_setup_work(p);
>> +next:
>> + node = rb_next(node);
>> + }
>
> I'd like to strongly push back on this approach. This adds quite a lot
> of extra computation to an already expensive path
> (throttle/unthrottle). e.g. this function is part of the cgroup walk
Actually, with my suggestion in another email that we only need to mark
these cfs_rqs throttled when the quota is used up, and set up the
throttle task work when tasks under those cfs_rqs get picked, the cost
of the throttle path is much like the dual tree approach.
As for unthrottle path, you're right, it has to iterate each task on
a list to get it queued into the cfs_rq tree, so it needs more thinking.
> and so it is already O(cgroups) for the number of cgroups in the
> hierarchy being throttled. This gets even worse when you consider that
> we repeat this separately across all the cpus that the
> bandwidth-constrained group is running on. Keep in mind that
> throttle/unthrottle is done with rq lock held and IRQ disabled.
Maybe we can avoid holding rq lock when unthrottle? As for per-task
unthrottle, it's much like just waking up lots of tasks, so maybe we
can reuse ttwu path to wakeup those tasks, which could utilise remote
IPI to avoid holding remote rq locks. I'm not sure, just some random thinking..
>
> In K Prateek's last RFC, there was discussion of using context
> tracking; did you consider that approach any further? We could keep
> track of the number of threads within a cgroup hierarchy currently in
> kernel mode (similar to h_nr_runnable), and thus simplify down the
Yeah, I think Prateek's approach is very creative! The only downside of
it is that the code becomes much more complex... on an already complex
codebase. Anyway, it would be great if that approach or this one could
be merged into the mainline kernel.
Lastly, I wonder if it is possible to implement cgroup-level
bandwidth control, instead of doing it in each sched_class? Then SCX
tasks could be controlled too, without implementing it in BPF code...
Thanks!
> throttling code here.
>
> Best,
> Josh
Hello Chengming,
On 3/17/2025 8:24 AM, Chengming Zhou wrote:
> On 2025/3/16 11:25, Josh Don wrote:
>> Hi Aaron,
>>
>>> static int tg_throttle_down(struct task_group *tg, void *data)
>>> {
>>> struct rq *rq = data;
>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>>> + struct task_struct *p;
>>> + struct rb_node *node;
>>> +
>>> + cfs_rq->throttle_count++;
>>> + if (cfs_rq->throttle_count > 1)
>>> + return 0;
>>>
>>> /* group is entering throttled state, stop time */
>>> - if (!cfs_rq->throttle_count) {
>>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>> - list_del_leaf_cfs_rq(cfs_rq);
>>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>> + list_del_leaf_cfs_rq(cfs_rq);
>>>
>>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>> - if (cfs_rq->nr_queued)
>>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>> + if (cfs_rq->nr_queued)
>>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>>> +
>>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>>> + /*
>>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>>> + *
>>> + * All other tasks enqueued on this rq have their saved PC at the
>>> + * context switch, so they will go through the kernel before returning
>>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>>> + * install the task_work on all of them.
>>> + */
>>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>>> + while (node) {
>>> + struct sched_entity *se = __node_2_se(node);
>>> +
>>> + if (!entity_is_task(se))
>>> + goto next;
>>> +
>>> + p = task_of(se);
>>> + task_throttle_setup_work(p);
>>> +next:
>>> + node = rb_next(node);
>>> + }
>>
>> I'd like to strongly push back on this approach. This adds quite a lot
>> of extra computation to an already expensive path
>> (throttle/unthrottle). e.g. this function is part of the cgroup walk
>
> Actually, with my suggestion in another email that we only need to mark
> these cfs_rqs throttled when quote used up, and setup throttle task work
> when the tasks under those cfs_rqs get picked, the cost of throttle path
> is much like the dual tree approach.
>
> As for unthrottle path, you're right, it has to iterate each task on
> a list to get it queued into the cfs_rq tree, so it needs more thinking.
>
>> and so it is already O(cgroups) for the number of cgroups in the
>> hierarchy being throttled. This gets even worse when you consider that
>> we repeat this separately across all the cpus that the
>> bandwidth-constrained group is running on. Keep in mind that
>> throttle/unthrottle is done with rq lock held and IRQ disabled.
>
> Maybe we can avoid holding rq lock when unthrottle? As for per-task
> unthrottle, it's much like just waking up lots of tasks, so maybe we
> can reuse ttwu path to wakeup those tasks, which could utilise remote
> IPI to avoid holding remote rq locks. I'm not sure, just some random thinking..
>
>>
>> In K Prateek's last RFC, there was discussion of using context
>> tracking; did you consider that approach any further? We could keep
>> track of the number of threads within a cgroup hierarchy currently in
>> kernel mode (similar to h_nr_runnable), and thus simplify down the
>
> Yeah, I think Prateek's approach is very creative! The only downside of
> it is that the code becomes much complex.. on already complex codebase.
> Anyway, it would be great that or this could be merged in mainline kernel.
I think the consensus is to move to per-task throttling since it'll
simplify the efforts to move to a flat hierarchy sometime in the near
future.
My original approach was complex since I wanted to seamlessly resume the
pick part on unthrottle. In Ben's patch [1] there is a TODO in
pick_next_entity() and that probably worked with the older vruntime based
CFS algorithm but can break EEVDF guarantees.
[1] https://lore.kernel.org/all/xm26edfxpock.fsf@bsegall-linux.svl.corp.google.com/
Unfortunately keeping a single rbtree meant replicating the stats and
that indeed adds to complexity.
>
> At last, I wonder is it possible that we can implement a cgroup-level
> bandwidth control, instead of doing it in each sched_class? Then SCX
> tasks could be controlled too, without implementing it in BPF code...
>
> Thanks!
>
>> throttling code here.
>>
>> Best,
>> Josh
--
Thanks and Regards,
Prateek
K Prateek Nayak <kprateek.nayak@amd.com> writes:
> Hello Chengming,
>
> On 3/17/2025 8:24 AM, Chengming Zhou wrote:
>> On 2025/3/16 11:25, Josh Don wrote:
>>> Hi Aaron,
>>>
>>>> static int tg_throttle_down(struct task_group *tg, void *data)
>>>> {
>>>> struct rq *rq = data;
>>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>>>> + struct task_struct *p;
>>>> + struct rb_node *node;
>>>> +
>>>> + cfs_rq->throttle_count++;
>>>> + if (cfs_rq->throttle_count > 1)
>>>> + return 0;
>>>>
>>>> /* group is entering throttled state, stop time */
>>>> - if (!cfs_rq->throttle_count) {
>>>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>>> - list_del_leaf_cfs_rq(cfs_rq);
>>>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>>> + list_del_leaf_cfs_rq(cfs_rq);
>>>>
>>>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>>> - if (cfs_rq->nr_queued)
>>>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>>>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>>> + if (cfs_rq->nr_queued)
>>>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>>>> +
>>>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>>>> + /*
>>>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>>>> + *
>>>> + * All other tasks enqueued on this rq have their saved PC at the
>>>> + * context switch, so they will go through the kernel before returning
>>>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>>>> + * install the task_work on all of them.
>>>> + */
>>>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>>>> + while (node) {
>>>> + struct sched_entity *se = __node_2_se(node);
>>>> +
>>>> + if (!entity_is_task(se))
>>>> + goto next;
>>>> +
>>>> + p = task_of(se);
>>>> + task_throttle_setup_work(p);
>>>> +next:
>>>> + node = rb_next(node);
>>>> + }
>>>
>>> I'd like to strongly push back on this approach. This adds quite a lot
>>> of extra computation to an already expensive path
>>> (throttle/unthrottle). e.g. this function is part of the cgroup walk
>> Actually, with my suggestion in another email that we only need to mark
>> these cfs_rqs throttled when quote used up, and setup throttle task work
>> when the tasks under those cfs_rqs get picked, the cost of throttle path
>> is much like the dual tree approach.
>> As for unthrottle path, you're right, it has to iterate each task on
>> a list to get it queued into the cfs_rq tree, so it needs more thinking.
>>
>>> and so it is already O(cgroups) for the number of cgroups in the
>>> hierarchy being throttled. This gets even worse when you consider that
>>> we repeat this separately across all the cpus that the
>>> bandwidth-constrained group is running on. Keep in mind that
>>> throttle/unthrottle is done with rq lock held and IRQ disabled.
>> Maybe we can avoid holding rq lock when unthrottle? As for per-task
>> unthrottle, it's much like just waking up lots of tasks, so maybe we
>> can reuse ttwu path to wakeup those tasks, which could utilise remote
>> IPI to avoid holding remote rq locks. I'm not sure, just some random thinking..
>>
Yeah, looping over all the fully throttled tasks in unthrottle still
isn't great (and needing to do a full enqueue operation for each). It's
probably much better than looping over all the runnable tasks here
though.
Remote IPI shouldn't be very helpful, because we already have that in
async unthrottle. Whether or not it's useful to occasionally release the
lock while doing all the per-task unthrottles (like loop_break), who
knows, but it's certainly easy enough to do while just staying under rcu.
>>>
>>> In K Prateek's last RFC, there was discussion of using context
>>> tracking; did you consider that approach any further? We could keep
>>> track of the number of threads within a cgroup hierarchy currently in
>>> kernel mode (similar to h_nr_runnable), and thus simplify down the
>> Yeah, I think Prateek's approach is very creative! The only downside of
>> it is that the code becomes much complex.. on already complex codebase.
>> Anyway, it would be great that or this could be merged in mainline kernel.
>
> I think the consensus is to move to per-task throttling since it'll
> simplify the efforts to move to a flat hierarchy sometime in the near
> future.
>
> My original approach was complex since i wanted to seamlessly resume the
> pick part on unthrottle. In Ben;s patch [1] there is a TODO in
> pick_next_entity() and that probably worked with the older vruntime based
> CFS algorithm but can break EEVDF guarantees.
>
> [1] https://lore.kernel.org/all/xm26edfxpock.fsf@bsegall-linux.svl.corp.google.com/
>
> Unfortunately keeping a single rbtree meant replicating the stats and
> that indeed adds to complexity.
>
Does anything actually rely on those guarantees for correctness in the
scheduler? Would anything actually break if something overrode
pick_next_task_fair to just pick a random runnable task from the rq, or
similar? I'd only expect us to lose out on fairness, and only to the
extent that we're overriding the pick (and not as an ongoing
repercussion from a single unfair pick).
There's still plenty of potential reasons to want to provide better
fairness even between "throttled tasks still running in the kernel" but
I don't think that halfassing it would break EEVDF more than CFS. It
would however be significantly more annoying to duplicate the tree
nowadays with the more data required by entity_eligible.
Hello Prateek,
On 2025/3/20 14:59, K Prateek Nayak wrote:
> Hello Chengming,
>
> On 3/17/2025 8:24 AM, Chengming Zhou wrote:
>> On 2025/3/16 11:25, Josh Don wrote:
>>> Hi Aaron,
>>>
>>>> static int tg_throttle_down(struct task_group *tg, void *data)
>>>> {
>>>> struct rq *rq = data;
>>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>>>> + struct task_struct *p;
>>>> + struct rb_node *node;
>>>> +
>>>> + cfs_rq->throttle_count++;
>>>> + if (cfs_rq->throttle_count > 1)
>>>> + return 0;
>>>>
>>>> /* group is entering throttled state, stop time */
>>>> - if (!cfs_rq->throttle_count) {
>>>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>>> - list_del_leaf_cfs_rq(cfs_rq);
>>>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>>> + list_del_leaf_cfs_rq(cfs_rq);
>>>>
>>>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>>> - if (cfs_rq->nr_queued)
>>>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>>>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>>> + if (cfs_rq->nr_queued)
>>>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>>>> +
>>>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>>>> + /*
>>>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>>>> + *
>>>> + * All other tasks enqueued on this rq have their saved PC at the
>>>> + * context switch, so they will go through the kernel before returning
>>>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>>>> + * install the task_work on all of them.
>>>> + */
>>>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>>>> + while (node) {
>>>> + struct sched_entity *se = __node_2_se(node);
>>>> +
>>>> + if (!entity_is_task(se))
>>>> + goto next;
>>>> +
>>>> + p = task_of(se);
>>>> + task_throttle_setup_work(p);
>>>> +next:
>>>> + node = rb_next(node);
>>>> + }
>>>
>>> I'd like to strongly push back on this approach. This adds quite a lot
>>> of extra computation to an already expensive path
>>> (throttle/unthrottle). e.g. this function is part of the cgroup walk
>>
>> Actually, with my suggestion in another email that we only need to mark
>> these cfs_rqs throttled when quote used up, and setup throttle task work
>> when the tasks under those cfs_rqs get picked, the cost of throttle path
>> is much like the dual tree approach.
>>
>> As for unthrottle path, you're right, it has to iterate each task on
>> a list to get it queued into the cfs_rq tree, so it needs more thinking.
>>
>>> and so it is already O(cgroups) for the number of cgroups in the
>>> hierarchy being throttled. This gets even worse when you consider that
>>> we repeat this separately across all the cpus that the
>>> bandwidth-constrained group is running on. Keep in mind that
>>> throttle/unthrottle is done with rq lock held and IRQ disabled.
>>
>> Maybe we can avoid holding rq lock when unthrottle? As for per-task
>> unthrottle, it's much like just waking up lots of tasks, so maybe we
>> can reuse ttwu path to wakeup those tasks, which could utilise remote
>> IPI to avoid holding remote rq locks. I'm not sure, just some random thinking..
>>
>>>
>>> In K Prateek's last RFC, there was discussion of using context
>>> tracking; did you consider that approach any further? We could keep
>>> track of the number of threads within a cgroup hierarchy currently in
>>> kernel mode (similar to h_nr_runnable), and thus simplify down the
>>
>> Yeah, I think Prateek's approach is very creative! The only downside of
>> it is that the code becomes much complex.. on already complex codebase.
>> Anyway, it would be great that or this could be merged in mainline kernel.
>
> I think the consensus is to move to per-task throttling since it'll
> simplify the efforts to move to a flat hierarchy sometime in the near
> future.
Ah, agree! And I'm very much looking forward to seeing a flat hierarchy!
At our clusters, there are often too many levels (3-6) of cgroups, which
caused too much cost in scheduler activities.
>
> My original approach was complex since i wanted to seamlessly resume the
> pick part on unthrottle. In Ben;s patch [1] there is a TODO in
> pick_next_entity() and that probably worked with the older vruntime based
> CFS algorithm but can break EEVDF guarantees.
>
> [1] https://lore.kernel.org/all/xm26edfxpock.fsf@bsegall-linux.svl.corp.google.com/
>
> Unfortunately keeping a single rbtree meant replicating the stats and
> that indeed adds to complexity.
Ok, got it.
Thanks!
>
>>
>> At last, I wonder is it possible that we can implement a cgroup-level
>> bandwidth control, instead of doing it in each sched_class? Then SCX
>> tasks could be controlled too, without implementing it in BPF code...
>>
>> Thanks!
>>
>>> throttling code here.
>>>
>>> Best,
>>> Josh
>
On Thu, Mar 20, 2025 at 1:39 AM Chengming Zhou <chengming.zhou@linux.dev> wrote:
>
> Hello Prateek,
>
> On 2025/3/20 14:59, K Prateek Nayak wrote:
> > Hello Chengming,
> >
> > On 3/17/2025 8:24 AM, Chengming Zhou wrote:
> >> On 2025/3/16 11:25, Josh Don wrote:
> >>> Hi Aaron,
> >>>
> >>>> static int tg_throttle_down(struct task_group *tg, void *data)
> >>>> {
> >>>> struct rq *rq = data;
> >>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> >>>> + struct task_struct *p;
> >>>> + struct rb_node *node;
> >>>> +
> >>>> + cfs_rq->throttle_count++;
> >>>> + if (cfs_rq->throttle_count > 1)
> >>>> + return 0;
> >>>>
> >>>> /* group is entering throttled state, stop time */
> >>>> - if (!cfs_rq->throttle_count) {
> >>>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> >>>> - list_del_leaf_cfs_rq(cfs_rq);
> >>>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> >>>> + list_del_leaf_cfs_rq(cfs_rq);
> >>>>
> >>>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> >>>> - if (cfs_rq->nr_queued)
> >>>> - cfs_rq->throttled_clock_self = rq_clock(rq);
> >>>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> >>>> + if (cfs_rq->nr_queued)
> >>>> + cfs_rq->throttled_clock_self = rq_clock(rq);
> >>>> +
> >>>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> >>>> + /*
> >>>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
> >>>> + *
> >>>> + * All other tasks enqueued on this rq have their saved PC at the
> >>>> + * context switch, so they will go through the kernel before returning
> >>>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> >>>> + * install the task_work on all of them.
> >>>> + */
> >>>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> >>>> + while (node) {
> >>>> + struct sched_entity *se = __node_2_se(node);
> >>>> +
> >>>> + if (!entity_is_task(se))
> >>>> + goto next;
> >>>> +
> >>>> + p = task_of(se);
> >>>> + task_throttle_setup_work(p);
> >>>> +next:
> >>>> + node = rb_next(node);
> >>>> + }
> >>>
> >>> I'd like to strongly push back on this approach. This adds quite a lot
> >>> of extra computation to an already expensive path
> >>> (throttle/unthrottle). e.g. this function is part of the cgroup walk
> >>
> >> Actually, with my suggestion in another email that we only need to mark
> >> these cfs_rqs throttled when quote used up, and setup throttle task work
> >> when the tasks under those cfs_rqs get picked, the cost of throttle path
> >> is much like the dual tree approach.
> >>
> >> As for unthrottle path, you're right, it has to iterate each task on
> >> a list to get it queued into the cfs_rq tree, so it needs more thinking.
> >>
> >>> and so it is already O(cgroups) for the number of cgroups in the
> >>> hierarchy being throttled. This gets even worse when you consider that
> >>> we repeat this separately across all the cpus that the
> >>> bandwidth-constrained group is running on. Keep in mind that
> >>> throttle/unthrottle is done with rq lock held and IRQ disabled.
> >>
> >> Maybe we can avoid holding rq lock when unthrottle? As for per-task
> >> unthrottle, it's much like just waking up lots of tasks, so maybe we
> >> can reuse ttwu path to wakeup those tasks, which could utilise remote
> >> IPI to avoid holding remote rq locks. I'm not sure, just some random thinking..
> >>
> >>>
> >>> In K Prateek's last RFC, there was discussion of using context
> >>> tracking; did you consider that approach any further? We could keep
> >>> track of the number of threads within a cgroup hierarchy currently in
> >>> kernel mode (similar to h_nr_runnable), and thus simplify down the
> >>
> >> Yeah, I think Prateek's approach is very creative! The only downside of
> >> it is that the code becomes much complex.. on already complex codebase.
> >> Anyway, it would be great that or this could be merged in mainline kernel.
> >
> > I think the consensus is to move to per-task throttling since it'll
> > simplify the efforts to move to a flat hierarchy sometime in the near
> > future.
>
> Ah, agree! And I'm very much looking forward to seeing a flat hierarchy!
>
> At our clusters, there are often too many levels (3-6) of cgroups, which
> caused too much cost in scheduler activities.
>
> >
> > My original approach was complex since i wanted to seamlessly resume the
> > pick part on unthrottle. In Ben;s patch [1] there is a TODO in
> > pick_next_entity() and that probably worked with the older vruntime based
> > CFS algorithm but can break EEVDF guarantees.
> >
> > [1] https://lore.kernel.org/all/xm26edfxpock.fsf@bsegall-linux.svl.corp.google.com/
> >
> > Unfortunately keeping a single rbtree meant replicating the stats and
> > that indeed adds to complexity.
>
> Ok, got it.
>
> Thanks!
>
> >
> >>
> >> At last, I wonder is it possible that we can implement a cgroup-level
> >> bandwidth control, instead of doing it in each sched_class? Then SCX
> >> tasks could be controlled too, without implementing it in BPF code...
> >>
> >> Thanks!
> >>
> >>> throttling code here.
> >>>
> >>> Best,
> >>> Josh
> >
I am a bit unsure about the overhead experiment results. Maybe we can add some
counters to check how many cgroups per cpu are actually touched and how many
threads are actually dequeued / enqueued for throttling / unthrottling?
Looks like busy loop workloads were used for the experiment. With throttling
deferred to exit_to_user_mode, it would only be triggered by ticks. A large
runtime debt can accumulate before the on cpu threads are actually dequeued.
(Also noted in https://lore.kernel.org/lkml/20240711130004.2157737-11-vschneid@redhat.com/)
distribute_cfs_runtime would finish early if the quotas are used up by the first
few cpus, which would also result in throttling/unthrottling for only a few
runqueues per period. An intermittent workload like hackbench may give us more
information.
See slide 10 of my presentation for more info:
https://lpc.events/event/18/contributions/1883/attachments/1420/3040/Priority%20Inheritance%20for%20CFS%20Bandwidth%20Control.pdf
Indeed we are seeing more cfsb scalability problems with larger servers. Both
the irq off time from the cgroup walk and the overheads from per task actions
can be problematic.
-Xi
On Thu, Mar 20, 2025 at 11:40:11AM -0700, Xi Wang wrote:
...
> I am a bit unsure about the overhead experiment results. Maybe we can add some
> counters to check how many cgroups per cpu are actually touched and how many
> threads are actually dequeued / enqueued for throttling / unthrottling?
Sure thing.
> Looks like busy loop workloads were used for the experiment. With throttling
> deferred to exit_to_user_mode, it would only be triggered by ticks. A large
> runtime debt can accumulate before the on cpu threads are actually dequeued.
> (Also noted in https://lore.kernel.org/lkml/20240711130004.2157737-11-vschneid@redhat.com/)
>
> distribute_cfs_runtime would finish early if the quotas are used up by the first
> few cpus, which would also result in throttling/unthrottling for only a few
> runqueues per period. An intermittent workload like hackbench may give us more
> information.
I've added some trace prints and noticed it already involved almost all
cpu rqs on that 2sockets/384cpus test system, so I suppose it's OK to
continue using that setup as described before:
https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/
Below is one print sample:
<idle>-0       [214] d.h1.  1879.281972: distribute_cfs_runtime: cpu214: begins
<idle>-0       [214] dNh2.  1879.283564: distribute_cfs_runtime: cpu214: finishes. unthrottled_rqs=383, unthrottled_cfs_rq=1101, unthrottled_task=69
With async unthrottle, it's not possible to account exactly how many
cfs_rqs are unthrottled and how many tasks are enqueued back, just
how many rqs are involved and how many cfs_rqs/tasks the local cpu has
unthrottled. So this sample means in distribute_cfs_runtime(), 383 rqs
are involved and the local cpu has unthrottled 1101 cfs_rqs and a total
of 69 tasks are enqueued back.
The corresponding bpftrace histogram (duration of distribute_cfs_runtime(),
in nanoseconds) is:
@durations:
[4K, 8K) 9 | |
[8K, 16K) 227 |@@@@@@@@@@@@@@ |
[16K, 32K) 120 |@@@@@@@ |
[32K, 64K) 70 |@@@@ |
[64K, 128K) 0 | |
[128K, 256K) 0 | |
[256K, 512K) 0 | |
[512K, 1M) 158 |@@@@@@@@@ |
[1M, 2M) 832 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[2M, 4M) 177 |@@@@@@@@@@@ |
Thanks,
Aaron
> See slide 10 of my presentation for more info:
> https://lpc.events/event/18/contributions/1883/attachments/1420/3040/Priority%20Inheritance%20for%20CFS%20Bandwidth%20Control.pdf
>
> Indeed we are seeing more cfsb scalability problems with larger servers. Both
> the irq off time from the cgroup walk and the overheads from per task actions
> can be problematic.
>
> -Xi
On Mon, Mar 24, 2025 at 04:58:22PM +0800, Aaron Lu wrote:
> On Thu, Mar 20, 2025 at 11:40:11AM -0700, Xi Wang wrote:
> ...
> > I am a bit unsure about the overhead experiment results. Maybe we can add some
> > counters to check how many cgroups per cpu are actually touched and how many
> > threads are actually dequeued / enqueued for throttling / unthrottling?
>
> Sure thing.
>
> > Looks like busy loop workloads were used for the experiment. With throttling
> > deferred to exit_to_user_mode, it would only be triggered by ticks. A large
> > runtime debt can accumulate before the on cpu threads are actually dequeued.
> > (Also noted in https://lore.kernel.org/lkml/20240711130004.2157737-11-vschneid@redhat.com/)
> >
> > distribute_cfs_runtime would finish early if the quotas are used up by the first
> > few cpus, which would also result in throttling/unthrottling for only a few
> > runqueues per period. An intermittent workload like hackbench may give us more
> > information.
>
> I've added some trace prints and noticed it already invovled almost all
> cpu rqs on that 2sockets/384cpus test system, so I suppose it's OK to
> continue use that setup as described before:
> https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/
One more data point that might be interesting. I've tested this on a
v5.15 based kernel where async unthrottle is not available yet so things
should be worse.
As Xi mentioned, since the test program is cpu hog, I tweaked the quota
setting to make throttle happen more likely.
The bpftrace duration of distribute_cfs_runtime() is:
@durations:
[4K, 8K) 1 | |
[8K, 16K) 8 | |
[16K, 32K) 1 | |
[32K, 64K) 0 | |
[64K, 128K) 0 | |
[128K, 256K) 0 | |
[256K, 512K) 0 | |
[512K, 1M) 0 | |
[1M, 2M) 0 | |
[2M, 4M) 0 | |
[4M, 8M) 0 | |
[8M, 16M) 0 | |
[16M, 32M) 0 | |
[32M, 64M) 376 |@@@@@@@@@@@@@@@@@@@@@@@ |
[64M, 128M) 824 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
One random trace point from the trace prints is:
<idle>-0 [117] d.h1. 83206.734588: distribute_cfs_runtime: cpu117: begins
<idle>-0 [117] dnh1. 83206.801902: distribute_cfs_runtime: cpu117: finishes: unthrottled_rqs=384, unthrottled_cfs_rq=422784, unthrottled_task=10000
So for the above trace point, distribute_cfs_runtime() unthrottled 384
rqs with a total of 422784 cfs_rqs and enqueued back 10000 tasks, this
took about 70ms.
Note that other things like rq lock contention might make things worse -
I did not notice any lock contention in this setup.
I've attached the corresponding debug diff in case it's not clear what
this trace print means.
On Tue, Mar 25, 2025 at 3:02 AM Aaron Lu <ziqianlu@bytedance.com> wrote:
>
> On Mon, Mar 24, 2025 at 04:58:22PM +0800, Aaron Lu wrote:
> > On Thu, Mar 20, 2025 at 11:40:11AM -0700, Xi Wang wrote:
> > ...
> > > I am a bit unsure about the overhead experiment results. Maybe we can add some
> > > counters to check how many cgroups per cpu are actually touched and how many
> > > threads are actually dequeued / enqueued for throttling / unthrottling?
> >
> > Sure thing.
> >
> > > Looks like busy loop workloads were used for the experiment. With throttling
> > > deferred to exit_to_user_mode, it would only be triggered by ticks. A large
> > > runtime debt can accumulate before the on cpu threads are actually dequeued.
> > > (Also noted in https://lore.kernel.org/lkml/20240711130004.2157737-11-vschneid@redhat.com/)
> > >
> > > distribute_cfs_runtime would finish early if the quotas are used up by the first
> > > few cpus, which would also result in throttling/unthrottling for only a few
> > > runqueues per period. An intermittent workload like hackbench may give us more
> > > information.
> >
> > I've added some trace prints and noticed it already invovled almost all
> > cpu rqs on that 2sockets/384cpus test system, so I suppose it's OK to
> > continue use that setup as described before:
> > https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/
>
> One more data point that might be interesting. I've tested this on a
> v5.15 based kernel where async unthrottle is not available yet so things
> should be worse.
>
> As Xi mentioned, since the test program is cpu hog, I tweaked the quota
> setting to make throttle happen more likely.
>
> The bpftrace duration of distribute_cfs_runtime() is:
>
> @durations:
> [4K, 8K)       1 |                                                    |
> [8K, 16K)      8 |                                                    |
> [16K, 32K)     1 |                                                    |
> [32K, 64K)     0 |                                                    |
> [64K, 128K)    0 |                                                    |
> [128K, 256K)   0 |                                                    |
> [256K, 512K)   0 |                                                    |
> [512K, 1M)     0 |                                                    |
> [1M, 2M)       0 |                                                    |
> [2M, 4M)       0 |                                                    |
> [4M, 8M)       0 |                                                    |
> [8M, 16M)      0 |                                                    |
> [16M, 32M)     0 |                                                    |
> [32M, 64M)   376 |@@@@@@@@@@@@@@@@@@@@@@@                             |
> [64M, 128M)  824 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>
> One random trace point from the trace prints is:
>
> <idle>-0 [117] d.h1. 83206.734588: distribute_cfs_runtime: cpu117: begins
> <idle>-0 [117] dnh1. 83206.801902: distribute_cfs_runtime: cpu117: finishes: unthrottled_rqs=384, unthrottled_cfs_rq=422784, unthrottled_task=10000
>
> So for the above trace point, distribute_cfs_runtime() unthrottled 384
> rqs with a total of 422784 cfs_rqs and enqueued back 10000 tasks, this
> took about 70ms.
>
> Note that other things like rq lock contention might make things worse -
> I did not notice any lock contention in this setup.
>
> I've attached the corresponding debug diff in case it's not clear what
> this trace print means.

Thanks for getting the test results!

My understanding is that you now have 2 test configurations and new vs
old throttling mechanisms. I think the two groups of results were
test1 + new method and test2 + old method. Is that the case?

For test1 + new method, we have "..in distribute_cfs_runtime(), 383
rqs are involved and the local cpu has unthrottled 1101 cfs_rqs and a
total of 69 tasks are enqueued back". I think if the workload is in a
steady and persistently over limit state we'd have 1000+ tasks
periodically being throttled and unthrottled, at least with the old
method. So "1101 cfs_rqs and a total of 69 tasks are enqueued back"
might be a special case?
I'll also try to create a stress test that mimics our production problems but it would take some time.
On Thu, Mar 27, 2025 at 05:11:42PM -0700, Xi Wang wrote:
> On Tue, Mar 25, 2025 at 3:02 AM Aaron Lu <ziqianlu@bytedance.com> wrote:
> >
> > On Mon, Mar 24, 2025 at 04:58:22PM +0800, Aaron Lu wrote:
> > > On Thu, Mar 20, 2025 at 11:40:11AM -0700, Xi Wang wrote:
> > > ...
> > > > I am a bit unsure about the overhead experiment results. Maybe we can add some
> > > > counters to check how many cgroups per cpu are actually touched and how many
> > > > threads are actually dequeued / enqueued for throttling / unthrottling?
> > >
> > > Sure thing.
> > >
> > > > Looks like busy loop workloads were used for the experiment. With throttling
> > > > deferred to exit_to_user_mode, it would only be triggered by ticks. A large
> > > > runtime debt can accumulate before the on cpu threads are actually dequeued.
> > > > (Also noted in https://lore.kernel.org/lkml/20240711130004.2157737-11-vschneid@redhat.com/)
> > > >
> > > > distribute_cfs_runtime would finish early if the quotas are used up by the first
> > > > few cpus, which would also result in throttling/unthrottling for only a few
> > > > runqueues per period. An intermittent workload like hackbench may give us more
> > > > information.
> > >
> > > I've added some trace prints and noticed it already invovled almost all
> > > cpu rqs on that 2sockets/384cpus test system, so I suppose it's OK to
> > > continue use that setup as described before:
> > > https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/
> >
> > One more data point that might be interesting. I've tested this on a
> > v5.15 based kernel where async unthrottle is not available yet so things
> > should be worse.
> >
> > As Xi mentioned, since the test program is cpu hog, I tweaked the quota
> > setting to make throttle happen more likely.
> >
> > The bpftrace duration of distribute_cfs_runtime() is:
> >
> > @durations:
> > [4K, 8K)       1 |                                                    |
> > [8K, 16K)      8 |                                                    |
> > [16K, 32K)     1 |                                                    |
> > [32K, 64K)     0 |                                                    |
> > [64K, 128K)    0 |                                                    |
> > [128K, 256K)   0 |                                                    |
> > [256K, 512K)   0 |                                                    |
> > [512K, 1M)     0 |                                                    |
> > [1M, 2M)       0 |                                                    |
> > [2M, 4M)       0 |                                                    |
> > [4M, 8M)       0 |                                                    |
> > [8M, 16M)      0 |                                                    |
> > [16M, 32M)     0 |                                                    |
> > [32M, 64M)   376 |@@@@@@@@@@@@@@@@@@@@@@@                             |
> > [64M, 128M)  824 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> >
> > One random trace point from the trace prints is:
> >
> > <idle>-0 [117] d.h1. 83206.734588: distribute_cfs_runtime: cpu117: begins
> > <idle>-0 [117] dnh1. 83206.801902: distribute_cfs_runtime: cpu117: finishes: unthrottled_rqs=384, unthrottled_cfs_rq=422784, unthrottled_task=10000
> >
> > So for the above trace point, distribute_cfs_runtime() unthrottled 384
> > rqs with a total of 422784 cfs_rqs and enqueued back 10000 tasks, this
> > took about 70ms.
> >
> > Note that other things like rq lock contention might make things worse -
> > I did not notice any lock contention in this setup.
> >
> > I've attached the corresponding debug diff in case it's not clear what
> > this trace print means.
>
> Thanks for getting the test results!
>
> My understanding is that you now have 2 test configurations and new vs
> old throttling mechanisms. I think the two groups of results were
> test1 + new method and test2 + old method. Is that the case?

Sorry for the confusion.

First result is done using this patch series on top of the latest
tip/sched/core branch, which has the async unthrottle feature.

Second result is done using this patch series (adjusted to run on an old
kernel, of course) on top of a v5.15 based kernel where async unthrottle
is not available yet.
>
> For test1 + new method, we have "..in distribute_cfs_runtime(), 383
> rqs are involved and the local cpu has unthrottled 1101 cfs_rqs and a
> total of 69 tasks are enqueued back". I think if the workload is in a
> steady and persistently over limit state we'd have 1000+ tasks
> periodically being throttled and unthrottled, at least with the old
> method. So "1101 cfs_rqs and a total of 69 tasks are enqueued back"
> might be a special case?

With async unthrottle, distribute_cfs_runtime() will only deal with the
local cpu's unthrottle, and other cpus' unthrottle is done by sending an
IPI to let those cpus deal with their own unthrottle.

Since there are a total of 2000 leaf cfs_rqs and 20K tasks, on this 384
cpus machine each cpu should have roughly about 52 tasks, and one top
level hierarchy has about 1000 leaf cfs_rqs. That's why there are 1101
cfs_rqs unthrottled and 69 tasks enqueued: these numbers mean that on
the specific cpu alone where the hrtimer fired, it iterated 1101 cfs_rqs
and enqueued back 69 tasks. All other tasks are enqueued in the other
CPUs' IPI handler __cfsb_csd_unthrottle() (a rough sketch of this flow
is at the end of this mail). With this said, for this setup, "1101
cfs_rqs and 69 tasks" is not a special case but a worse than normal
case, if not the worst.

When the async unthrottle feature is not available (like in the 2nd
result), distribute_cfs_runtime() will have to iterate all cpus'
throttled cfs_rqs and enqueue back all those throttled tasks. Since
these numbers would be much larger, I showed them for the sole purpose
of demonstrating how bad the duration can be when 10K tasks have to be
enqueued back in one go.

> I'll also try to create a stress test that mimics our production
> problems but it would take some time.

That would be good to have, thanks.
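P.S. The async unthrottle pattern mentioned above looks roughly like the
sketch below. This is simplified and from memory, not the exact upstream
code (the real __unthrottle_cfs_rq_async() has a few more checks):

static void unthrottle_cfs_rq_async_sketch(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);

	if (rq == this_rq()) {
		/* local rq: unthrottle and enqueue tasks right away */
		unthrottle_cfs_rq(cfs_rq);
		return;
	}

	/*
	 * Remote rq: park the cfs_rq on that rq's list and kick the remote
	 * CPU; __cfsb_csd_unthrottle() then runs there and does the enqueues
	 * under its own rq lock.
	 */
	list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
	smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
}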
On 2025/3/13 15:21, Aaron Lu wrote:
> From: Valentin Schneider <vschneid@redhat.com>
>
> Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
> add a task work to them so that when those tasks return to user, the
> actual throttle/dequeue can happen.
>
> Note that since the throttle/dequeue always happens on a task basis when
> it returns to user, it's no longer necessary for check_cfs_rq_runtime()
> to return a value and pick_task_fair() acts differently according to that
> return value, so check_cfs_rq_runtime() is changed to not return a
> value.

Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
to throttle the cfs_rq and dequeue it from the cfs_rq tree.

Now with your per-task throttling, maybe things can become simpler. That we
can just throttle_cfs_rq() (cfs_rq subtree) when curr accouting to mark these
throttled.

Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.

WDYT?

Thanks.
On Fri, Mar 14, 2025 at 04:39:41PM +0800, Chengming Zhou wrote:
> On 2025/3/13 15:21, Aaron Lu wrote:
> > From: Valentin Schneider <vschneid@redhat.com>
> >
> > Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
> > add a task work to them so that when those tasks return to user, the
> > actual throttle/dequeue can happen.
> >
> > Note that since the throttle/dequeue always happens on a task basis when
> > it returns to user, it's no longer necessary for check_cfs_rq_runtime()
> > to return a value and pick_task_fair() acts differently according to that
> > return value, so check_cfs_rq_runtime() is changed to not return a
> > value.
>
> Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
> to throttle the cfs_rq and dequeue it from the cfs_rq tree.
>
> Now with your per-task throttling, maybe things can become simpler. That we
> can just throttle_cfs_rq() (cfs_rq subtree) when curr accouting to mark these
> throttled.

Do I understand correctly that now in throttle_cfs_rq(), we just mark
this hierarchy as throttled, but do not add any throttle work to these
tasks in this hierarchy and leave the throttle work add job to pick
time?

> Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
> for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.

If we add a check point in pick time, maybe we can also avoid the check
in enqueue time. One thing I'm thinking is, for a task, it may be picked
multiple times with only a single enqueue so if we do the check in pick,
the overhead can be larger?

> WDYT?

Thanks for your suggestion. I'll try this approach and see how it turned
out.
On 2025/3/14 17:42, Aaron Lu wrote:
> On Fri, Mar 14, 2025 at 04:39:41PM +0800, Chengming Zhou wrote:
>> On 2025/3/13 15:21, Aaron Lu wrote:
>>> From: Valentin Schneider <vschneid@redhat.com>
>>>
>>> Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
>>> add a task work to them so that when those tasks return to user, the
>>> actual throttle/dequeue can happen.
>>>
>>> Note that since the throttle/dequeue always happens on a task basis when
>>> it returns to user, it's no longer necessary for check_cfs_rq_runtime()
>>> to return a value and pick_task_fair() acts differently according to that
>>> return value, so check_cfs_rq_runtime() is changed to not return a
>>> value.
>>
>> Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
>> to throttle the cfs_rq and dequeue it from the cfs_rq tree.
>>
>> Now with your per-task throttling, maybe things can become simpler. That we
>> can just throttle_cfs_rq() (cfs_rq subtree) when curr accouting to mark these
>> throttled.
>
> Do I understand correctly that now in throttle_cfs_rq(), we just mark
> this hierarchy as throttled, but do not add any throttle work to these
> tasks in this hierarchy and leave the throttle work add job to pick
> time?

Right, we can move throttle_cfs_rq() forward to the curr accouting time, which
just mark these throttled.

And move setup_task_work() afterward to the pick task time, which make that task
dequeue when ret2user.

>
>> Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
>> for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.
>
> If we add a check point in pick time, maybe we can also avoid the check
> in enqueue time. One thing I'm thinking is, for a task, it may be picked
> multiple times with only a single enqueue so if we do the check in pick,
> the overhead can be larger?

As Prateek already mentioned, this check cost is negligeable.

>
>> WDYT?
>
> Thanks for your suggestion. I'll try this approach and see how it turned
> out.
Hi Chengming,

On Fri, Mar 14, 2025 at 07:07:10PM +0800, Chengming Zhou wrote:
> On 2025/3/14 17:42, Aaron Lu wrote:
> > On Fri, Mar 14, 2025 at 04:39:41PM +0800, Chengming Zhou wrote:
> > > On 2025/3/13 15:21, Aaron Lu wrote:
> > > > From: Valentin Schneider <vschneid@redhat.com>
> > > >
> > > > Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
> > > > add a task work to them so that when those tasks return to user, the
> > > > actual throttle/dequeue can happen.
> > > >
> > > > Note that since the throttle/dequeue always happens on a task basis when
> > > > it returns to user, it's no longer necessary for check_cfs_rq_runtime()
> > > > to return a value and pick_task_fair() acts differently according to that
> > > > return value, so check_cfs_rq_runtime() is changed to not return a
> > > > value.
> > >
> > > Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
> > > to throttle the cfs_rq and dequeue it from the cfs_rq tree.
> > >
> > > Now with your per-task throttling, maybe things can become simpler. That we
> > > can just throttle_cfs_rq() (cfs_rq subtree) when curr accouting to mark these
> > > throttled.
> >
> > Do I understand correctly that now in throttle_cfs_rq(), we just mark
> > this hierarchy as throttled, but do not add any throttle work to these
> > tasks in this hierarchy and leave the throttle work add job to pick
> > time?
>
> Right, we can move throttle_cfs_rq() forward to the curr accouting time, which
> just mark these throttled.

While preparing the next version, I found that if I move
throttle_cfs_rq() to accounting time, like in __account_cfs_rq_runtime(),
then it is possible on unthrottle path, the following can happen:
unthrottle_cfs_rq() -> enqueue_task_fair() -> update_curr() ->
account_cfs_rq_runtime() -> throttle_cfs_rq()...

Initially I was confused why update_curr() can notice a non-null curr
when this cfs_rq is being unthrottled but then I realized in this task
based throttling model, it is possible some task woke up in that
throttled cfs_rq and have cfs_rq->curr set and then cfs_rq gets
unthrottled.

So I suppose I'll keep the existing way of marking a cfs_rq as
throttled by calling check_cfs_rq_runtime() in the following two places:
- in pick_task_fair(), so that the to-be-picked cfs_rq can be marked for
throttle;
- in put_prev_entity() for prev runnable task's cfs_rq.

> And move setup_task_work() afterward to the pick task time, which make that task
> dequeue when ret2user.

No problem for this part as far as my test goes :-)

Thanks,
Aaron

> >
> > > Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
> > > for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.
> >
> > If we add a check point in pick time, maybe we can also avoid the check
> > in enqueue time. One thing I'm thinking is, for a task, it may be picked
> > multiple times with only a single enqueue so if we do the check in pick,
> > the overhead can be larger?
>
> As Prateek already mentioned, this check cost is negligeable.
>
> >
> > > WDYT?
> >
> > Thanks for your suggestion. I'll try this approach and see how it turned
> > out.
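For reference, the two call sites mentioned above look roughly like the
excerpts below in current mainline fair.c (with this series making
check_cfs_rq_runtime() void, the "goto again" on the pick path simply
goes away):

/* 1) pick_task_fair(), inside the pick loop before descending a level: */
		if (cfs_rq->curr && cfs_rq->curr->on_rq)
			update_curr(cfs_rq);

		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
			goto again;

/* 2) put_prev_entity(), for the previous task's cfs_rq: */
	if (prev->on_rq)
		update_curr(cfs_rq);

	/* throttle cfs_rqs exceeding runtime */
	check_cfs_rq_runtime(cfs_rq);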
On 2025/3/31 14:42, Aaron Lu wrote:
> Hi Chengming,
>
> On Fri, Mar 14, 2025 at 07:07:10PM +0800, Chengming Zhou wrote:
>> On 2025/3/14 17:42, Aaron Lu wrote:
>>> On Fri, Mar 14, 2025 at 04:39:41PM +0800, Chengming Zhou wrote:
>>>> On 2025/3/13 15:21, Aaron Lu wrote:
>>>>> From: Valentin Schneider <vschneid@redhat.com>
>>>>>
>>>>> Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
>>>>> add a task work to them so that when those tasks return to user, the
>>>>> actual throttle/dequeue can happen.
>>>>>
>>>>> Note that since the throttle/dequeue always happens on a task basis when
>>>>> it returns to user, it's no longer necessary for check_cfs_rq_runtime()
>>>>> to return a value and pick_task_fair() acts differently according to that
>>>>> return value, so check_cfs_rq_runtime() is changed to not return a
>>>>> value.
>>>>
>>>> Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
>>>> to throttle the cfs_rq and dequeue it from the cfs_rq tree.
>>>>
>>>> Now with your per-task throttling, maybe things can become simpler. That we
>>>> can just throttle_cfs_rq() (cfs_rq subtree) when curr accouting to mark these
>>>> throttled.
>>>
>>> Do I understand correctly that now in throttle_cfs_rq(), we just mark
>>> this hierarchy as throttled, but do not add any throttle work to these
>>> tasks in this hierarchy and leave the throttle work add job to pick
>>> time?
>>
>> Right, we can move throttle_cfs_rq() forward to the curr accouting time, which
>> just mark these throttled.
>
> While preparing the next version, I found that if I move
> throttle_cfs_rq() to accounting time, like in __account_cfs_rq_runtime(),
> then it is possible on unthrottle path, the following can happen:
> unthrottle_cfs_rq() -> enqueue_task_fair() -> update_curr() ->
> account_cfs_rq_runtime() -> throttle_cfs_rq()...

Ah, right, then it's best to leave throttle_cfs_rq() where it is.

>
> Initially I was confused why update_curr() can notice a non-null curr
> when this cfs_rq is being unthrottled but then I realized in this task
> based throttling model, it is possible some task woke up in that
> throttled cfs_rq and have cfs_rq->curr set and then cfs_rq gets
> unthrottled.
>
> So I suppose I'll keep the existing way of marking a cfs_rq as
> throttled by calling check_cfs_rq_runtime() in the following two places:
> - in pick_task_fair(), so that the to-be-picked cfs_rq can be marked for
> throttle;
> - in put_prev_entity() for prev runnable task's cfs_rq.
>
>> And move setup_task_work() afterward to the pick task time, which make that task
>> dequeue when ret2user.
>
> No problem for this part as far as my test goes :-)

Good to hear. Thanks!

>
> Thanks,
> Aaron
>
>>>
>>>> Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
>>>> for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.
>>>
>>> If we add a check point in pick time, maybe we can also avoid the check
>>> in enqueue time. One thing I'm thinking is, for a task, it may be picked
>>> multiple times with only a single enqueue so if we do the check in pick,
>>> the overhead can be larger?
>>
>> As Prateek already mentioned, this check cost is negligeable.
>>
>>>
>>>> WDYT?
>>>
>>> Thanks for your suggestion. I'll try this approach and see how it turned
>>> out.
Hello Aaron,
On 3/14/2025 3:12 PM, Aaron Lu wrote:
>> Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
>> for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.
> If we add a check point in pick time, maybe we can also avoid the check
> in enqueue time. One thing I'm thinking is, for a task, it may be picked
> multiple times with only a single enqueue so if we do the check in pick,
> the overhead can be larger?
I think it can be minimized to a good extent. Something like:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d646451d617c..ba6571368840 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5942,6 +5942,9 @@ static inline bool task_has_throttle_work(struct task_struct *p)
static inline void task_throttle_setup_work(struct task_struct *p)
{
+ if (task_has_throttle_work(p))
+ return;
+
/*
* Kthreads and exiting tasks don't return to userspace, so adding the
* work is pointless
@@ -5949,9 +5952,6 @@ static inline void task_throttle_setup_work(struct task_struct *p)
if ((p->flags & (PF_EXITING | PF_KTHREAD)))
return;
- if (task_has_throttle_work(p))
- return;
-
task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
}
@@ -6000,12 +6000,6 @@ static int tg_throttle_down(struct task_group *tg, void *data)
node = rb_next(node);
}
- /* curr is not in the timeline tree */
- if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
- p = task_of(cfs_rq->curr);
- task_throttle_setup_work(p);
- }
-
return 0;
}
@@ -6049,6 +6043,18 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
SCHED_WARN_ON(cfs_rq->throttled_clock);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
+
+ /*
+ * If cfs_rq->curr is set, check if current task is queued
+ * and set up the throttle work proactively.
+ */
+ if (cfs_rq->curr) {
+ struct task_struct *p = rq->donor; /* scheduling context with proxy */
+
+ if (task_on_rq_queued(p))
+ task_throttle_setup_work(p);
+ }
+
return;
}
@@ -8938,6 +8944,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
struct sched_entity *pse = &prev->se;
struct cfs_rq *cfs_rq;
+ /*
+ * Check if throttle work needs to be setup when
+ * switching to a different task.
+ */
+ if (throttled_hierarchy(cfs_rq_of(se)))
+ task_throttle_setup_work(p);
+
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
int pse_depth = pse->depth;
@@ -13340,6 +13353,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
account_cfs_rq_runtime(cfs_rq, 0);
}
+ if (throttled_hierarchy(cfs_rq_of(se)))
+ task_throttle_setup_work(p);
+
__set_next_task_fair(rq, p, first);
}
--
.. with the additional plumbing in place of course.
--
Thanks and Regards,
Prateek
Hi Prateek,
On Fri, Mar 14, 2025 at 03:56:26PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/14/2025 3:12 PM, Aaron Lu wrote:
> > > Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
> > > for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.
> > If we add a check point in pick time, maybe we can also avoid the check
> > in enqueue time. One thing I'm thinking is, for a task, it may be picked
> > multiple times with only a single enqueue so if we do the check in pick,
> > the overhead can be larger?
>
> I think it can be minimized to a good extent. Something like:
I see, thanks for the illustration.
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d646451d617c..ba6571368840 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5942,6 +5942,9 @@ static inline bool task_has_throttle_work(struct task_struct *p)
> static inline void task_throttle_setup_work(struct task_struct *p)
> {
> + if (task_has_throttle_work(p))
> + return;
> +
> /*
> * Kthreads and exiting tasks don't return to userspace, so adding the
> * work is pointless
> @@ -5949,9 +5952,6 @@ static inline void task_throttle_setup_work(struct task_struct *p)
> if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> return;
> - if (task_has_throttle_work(p))
> - return;
> -
> task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> }
> @@ -6000,12 +6000,6 @@ static int tg_throttle_down(struct task_group *tg, void *data)
> node = rb_next(node);
> }
> - /* curr is not in the timeline tree */
> - if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
> - p = task_of(cfs_rq->curr);
> - task_throttle_setup_work(p);
> - }
> -
Should we also remove adding throttle work for those tasks in
cfs_rq->tasks_timeline?
> return 0;
> }
> @@ -6049,6 +6043,18 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> SCHED_WARN_ON(cfs_rq->throttled_clock);
> if (cfs_rq->nr_queued)
> cfs_rq->throttled_clock = rq_clock(rq);
> +
> + /*
> + * If cfs_rq->curr is set, check if current task is queued
> + * and set up the throttle work proactively.
> + */
> + if (cfs_rq->curr) {
> + struct task_struct *p = rq->donor; /* scheduling context with proxy */
I'll have to check what rq->donor means.
I think the point is to proactively add throttle work for rq->curr if
rq->curr is in this throttled hierarchy? Because the only check point to
add throttle work will be at pick time and curr will probably not be
picked anytime soon.
Thanks,
Aaron
> +
> + if (task_on_rq_queued(p))
> + task_throttle_setup_work(p);
> + }
> +
> return;
> }
> @@ -8938,6 +8944,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
> struct sched_entity *pse = &prev->se;
> struct cfs_rq *cfs_rq;
> + /*
> + * Check if throttle work needs to be setup when
> + * switching to a different task.
> + */
> + if (throttled_hierarchy(cfs_rq_of(se)))
> + task_throttle_setup_work(p);
> +
> while (!(cfs_rq = is_same_group(se, pse))) {
> int se_depth = se->depth;
> int pse_depth = pse->depth;
> @@ -13340,6 +13353,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
> account_cfs_rq_runtime(cfs_rq, 0);
> }
> + if (throttled_hierarchy(cfs_rq_of(se)))
> + task_throttle_setup_work(p);
> +
> __set_next_task_fair(rq, p, first);
> }
> --
>
> .. with the additional plumbing in place of course.
>
> --
> Thanks and Regards,
> Prateek
>
(+jstultz for some clarification around rq->donor)
Hello Aaron,
On 3/14/2025 5:17 PM, Aaron Lu wrote:
> Hi Prateek,
>
> On Fri, Mar 14, 2025 at 03:56:26PM +0530, K Prateek Nayak wrote:
>> Hello Aaron,
>>
>> On 3/14/2025 3:12 PM, Aaron Lu wrote:
>>>> Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
>>>> for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.
>>> If we add a check point in pick time, maybe we can also avoid the check
>>> in enqueue time. One thing I'm thinking is, for a task, it may be picked
>>> multiple times with only a single enqueue so if we do the check in pick,
>>> the overhead can be larger?
>>
>> I think it can be minimized to a good extent. Something like:
>
> I see, thanks for the illustration.
>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index d646451d617c..ba6571368840 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5942,6 +5942,9 @@ static inline bool task_has_throttle_work(struct task_struct *p)
>> static inline void task_throttle_setup_work(struct task_struct *p)
>> {
>> + if (task_has_throttle_work(p))
>> + return;
>> +
>> /*
>> * Kthreads and exiting tasks don't return to userspace, so adding the
>> * work is pointless
>> @@ -5949,9 +5952,6 @@ static inline void task_throttle_setup_work(struct task_struct *p)
>> if ((p->flags & (PF_EXITING | PF_KTHREAD)))
>> return;
>> - if (task_has_throttle_work(p))
>> - return;
>> -
>> task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
>> }
>> @@ -6000,12 +6000,6 @@ static int tg_throttle_down(struct task_group *tg, void *data)
>> node = rb_next(node);
>> }
>> - /* curr is not in the timeline tree */
>> - if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
>> - p = task_of(cfs_rq->curr);
>> - task_throttle_setup_work(p);
>> - }
>> -
>
> Should we also remove adding throttle work for those tasks in
> cfs_rq->tasks_timeline?
Yes. I don't think that is required with this approach suggested by
Chengming.
>
>> return 0;
>> }
>> @@ -6049,6 +6043,18 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>> SCHED_WARN_ON(cfs_rq->throttled_clock);
>> if (cfs_rq->nr_queued)
>> cfs_rq->throttled_clock = rq_clock(rq);
>> +
>> + /*
>> + * If cfs_rq->curr is set, check if current task is queued
>> + * and set up the throttle work proactively.
>> + */
>> + if (cfs_rq->curr) {
>> + struct task_struct *p = rq->donor; /* scheduling context with proxy */
>
> I'll have to check what rq->donor means.
Essentially "rq->donor" is the scheduling context (where the time
accounting happens) whereas "rq->curr" may point to a mutex-holder that
is running with the context of "rq->donor" to unblock itself as quickly
as possible.
If we experience a throttle, it is because the time accounting to
"rq->donor" hierarchy couldn't fetch anymore bandwidth.
TBH, adding the task work to either of them doesn't make sense since
the "rq->donor" will be picked to run again once mutex-holder does a
handoff and the throttle work can be added then too.
> I think the point is to proactively add throttle work for rq->curr if
> rq->curr is in this throttled hierarchy? Because the only check point to
> add throttle work will be at pick time and curr will probably not be
> picked anytime soon.
John knows these bits best and maybe he can correct me if I'm wrong
here. For the time being, rq->curr and rq->donor both point to the
same task but that will change when Part-2 of proxy execution series
lands. We can just clarify these bits now to avoid any conflicts later.
--
Thanks and Regards,
Prateek
>
> Thanks,
> Aaron
>
>> +
>> + if (task_on_rq_queued(p))
>> + task_throttle_setup_work(p);
>> + }
>> +
>> return;
>> }
>> @@ -8938,6 +8944,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
>> struct sched_entity *pse = &prev->se;
>> struct cfs_rq *cfs_rq;
>> + /*
>> + * Check if throttle work needs to be setup when
>> + * switching to a different task.
>> + */
>> + if (throttled_hierarchy(cfs_rq_of(se)))
>> + task_throttle_setup_work(p);
>> +
>> while (!(cfs_rq = is_same_group(se, pse))) {
>> int se_depth = se->depth;
>> int pse_depth = pse->depth;
>> @@ -13340,6 +13353,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
>> account_cfs_rq_runtime(cfs_rq, 0);
>> }
>> + if (throttled_hierarchy(cfs_rq_of(se)))
>> + task_throttle_setup_work(p);
>> +
>> __set_next_task_fair(rq, p, first);
>> }
>> --
>>
>> .. with the additional plumbing in place of course.
>>
>> --
>> Thanks and Regards,
>> Prateek
>>
On 2025/3/14 19:47, Aaron Lu wrote:
> Hi Prateek,
>
> On Fri, Mar 14, 2025 at 03:56:26PM +0530, K Prateek Nayak wrote:
>> Hello Aaron,
>>
>> On 3/14/2025 3:12 PM, Aaron Lu wrote:
>>>> Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
>>>> for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.
>>> If we add a check point in pick time, maybe we can also avoid the check
>>> in enqueue time. One thing I'm thinking is, for a task, it may be picked
>>> multiple times with only a single enqueue so if we do the check in pick,
>>> the overhead can be larger?
>>
>> I think it can be minimized to a good extent. Something like:
>
> I see, thanks for the illustration.
>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index d646451d617c..ba6571368840 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5942,6 +5942,9 @@ static inline bool task_has_throttle_work(struct task_struct *p)
>> static inline void task_throttle_setup_work(struct task_struct *p)
>> {
>> + if (task_has_throttle_work(p))
>> + return;
>> +
>> /*
>> * Kthreads and exiting tasks don't return to userspace, so adding the
>> * work is pointless
>> @@ -5949,9 +5952,6 @@ static inline void task_throttle_setup_work(struct task_struct *p)
>> if ((p->flags & (PF_EXITING | PF_KTHREAD)))
>> return;
>> - if (task_has_throttle_work(p))
>> - return;
>> -
>> task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
>> }
>> @@ -6000,12 +6000,6 @@ static int tg_throttle_down(struct task_group *tg, void *data)
>> node = rb_next(node);
>> }
>> - /* curr is not in the timeline tree */
>> - if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
>> - p = task_of(cfs_rq->curr);
>> - task_throttle_setup_work(p);
>> - }
>> -
>
> Should we also remove adding throttle work for those tasks in
> cfs_rq->tasks_timeline?
>
>> return 0;
>> }
>> @@ -6049,6 +6043,18 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>> SCHED_WARN_ON(cfs_rq->throttled_clock);
>> if (cfs_rq->nr_queued)
>> cfs_rq->throttled_clock = rq_clock(rq);
>> +
>> + /*
>> + * If cfs_rq->curr is set, check if current task is queued
>> + * and set up the throttle work proactively.
>> + */
>> + if (cfs_rq->curr) {
>> + struct task_struct *p = rq->donor; /* scheduling context with proxy */
>
> I'll have to check what rq->donor means.
> I think the point is to proactively add throttle work for rq->curr if
> rq->curr is in this throttled hierarchy? Because the only check point to
> add throttle work will be at pick time and curr will probably not be
> picked anytime soon.
The cfs_rq based throttle uses the update_curr() -> put_prev_task() path, so it
just calls resched_curr() in update_curr().
With per-task throttle, we don't need to call resched_curr() there; we should
set up the throttle work for this curr, which will dequeue & resched it on return to user.
Thanks.
>
> Thanks,
> Aaron
>
>> +
>> + if (task_on_rq_queued(p))
>> + task_throttle_setup_work(p);
>> + }
>> +
>> return;
>> }
>> @@ -8938,6 +8944,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
>> struct sched_entity *pse = &prev->se;
>> struct cfs_rq *cfs_rq;
>> + /*
>> + * Check if throttle work needs to be setup when
>> + * switching to a different task.
>> + */
>> + if (throttled_hierarchy(cfs_rq_of(se)))
>> + task_throttle_setup_work(p);
>> +
>> while (!(cfs_rq = is_same_group(se, pse))) {
>> int se_depth = se->depth;
>> int pse_depth = pse->depth;
>> @@ -13340,6 +13353,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
>> account_cfs_rq_runtime(cfs_rq, 0);
>> }
>> + if (throttled_hierarchy(cfs_rq_of(se)))
>> + task_throttle_setup_work(p);
>> +
>> __set_next_task_fair(rq, p, first);
>> }
>> --
>>
>> .. with the additional plumbing in place of course.
>>
>> --
>> Thanks and Regards,
>> Prateek
>>
On 3/14/2025 2:09 PM, Chengming Zhou wrote:
> On 2025/3/13 15:21, Aaron Lu wrote:
>> From: Valentin Schneider <vschneid@redhat.com>
>>
>> Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
>> add a task work to them so that when those tasks return to user, the
>> actual throttle/dequeue can happen.
>>
>> Note that since the throttle/dequeue always happens on a task basis when
>> it returns to user, it's no longer necessary for check_cfs_rq_runtime()
>> to return a value and pick_task_fair() acts differently according to that
>> return value, so check_cfs_rq_runtime() is changed to not return a
>> value.
>
> Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
> to throttle the cfs_rq and dequeue it from the cfs_rq tree.
>
> Now with your per-task throttling, maybe things can become simpler. That we
> can just throttle_cfs_rq() (cfs_rq subtree) when curr accouting to mark these
> throttled.
>
> Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
> for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.

+1 That seems like a good idea with per-task approach.

>
> WDYT?
>
> Thanks.

--
Thanks and Regards,
Prateek
Hello Aaron,
On 3/13/2025 12:51 PM, Aaron Lu wrote:
[..snip..]
>
> +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static void throttle_cfs_rq_work(struct callback_head *work)
> {
> + struct task_struct *p = container_of(work, struct task_struct,
> sched_throttle_work);
> + struct sched_entity *se;
> + struct cfs_rq *cfs_rq;
> + struct rq *rq;
> + struct rq_flags rf;
> +
> + WARN_ON_ONCE(p != current);
> + p->sched_throttle_work.next = &p->sched_throttle_work;
> +
> + /*
> + * If task is exiting, then there won't be a return to userspace, so we
> + * don't have to bother with any of this.
> + */
> + if ((p->flags & PF_EXITING))
> + return;
> +
> + rq = task_rq_lock(p, &rf);
nit. With CLASS(task_rq_lock, rq_guard)(p), you can fetch the rq with
"rq_gurad.rq" and the "goto out_unlock" can be replaced with simple
return.
> +
> + se = &p->se;
> + cfs_rq = cfs_rq_of(se);
> +
> + /* Raced, forget */
> + if (p->sched_class != &fair_sched_class)
> + goto out_unlock;
> +
> + /*
> + * If not in limbo, then either replenish has happened or this task got
> + * migrated out of the throttled cfs_rq, move along
> + */
> + if (!cfs_rq->throttle_count)
> + goto out_unlock;
> +
> + update_rq_clock(rq);
> + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> + resched_curr(rq);
> +
> +out_unlock:
> + task_rq_unlock(rq, p, &rf);
> }
>
> void init_cfs_throttle_work(struct task_struct *p)
> @@ -5873,32 +5914,81 @@ static int tg_unthrottle_up(struct task_group
> *tg, void *data)
> return 0;
> }
>
> +static inline bool task_has_throttle_work(struct task_struct *p)
> +{
> + return p->sched_throttle_work.next != &p->sched_throttle_work;
> +}
> +
> +static inline void task_throttle_setup_work(struct task_struct *p)
> +{
> + /*
> + * Kthreads and exiting tasks don't return to userspace, so adding the
> + * work is pointless
> + */
> + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> + return;
> +
> + if (task_has_throttle_work(p))
> + return;
> +
> + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> +}
> +
> static int tg_throttle_down(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> + struct task_struct *p;
> + struct rb_node *node;
> +
> + cfs_rq->throttle_count++;
> + if (cfs_rq->throttle_count > 1)
> + return 0;
General question: Do we need the throttled_lb_pair() check in
can_migrate_task() with the per-task throttle? Moving a throttled task
to another CPU can ensures that the task can run quicker and exit to
user space as quickly as possible and once the task dequeues, it will
remove itself from the list of fair tasks making it unreachable for
the load balancer. Thoughts?
>
> /* group is entering throttled state, stop time */
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> + list_del_leaf_cfs_rq(cfs_rq);
>
> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> - if (cfs_rq->nr_queued)
> - cfs_rq->throttled_clock_self = rq_clock(rq);
> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> + if (cfs_rq->nr_queued)
> + cfs_rq->throttled_clock_self = rq_clock(rq);
> +
> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> + /*
> + * rq_lock is held, current is (obviously) executing this in kernelspace.
> + *
> + * All other tasks enqueued on this rq have their saved PC at the
> + * context switch, so they will go through the kernel before returning
> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> + * install the task_work on all of them.
> + */
> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> + while (node) {
> + struct sched_entity *se = __node_2_se(node);
> +
> + if (!entity_is_task(se))
> + goto next;
> +
> + p = task_of(se);
> + task_throttle_setup_work(p);
> +next:
> + node = rb_next(node);
> + }
> +
> + /* curr is not in the timeline tree */
> + if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
> + p = task_of(cfs_rq->curr);
> + task_throttle_setup_work(p);
> }
> - cfs_rq->throttle_count++;
>
> return 0;
> }
>
[..snip..]
--
Thanks and Regards,
Prateek
On Fri, Mar 14, 2025 at 08:58:14AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/13/2025 12:51 PM, Aaron Lu wrote:
>
> [..snip..]
>
> >
> > +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> > static void throttle_cfs_rq_work(struct callback_head *work)
> > {
> > + struct task_struct *p = container_of(work, struct task_struct,
> > sched_throttle_work);
> > + struct sched_entity *se;
> > + struct cfs_rq *cfs_rq;
> > + struct rq *rq;
> > + struct rq_flags rf;
> > +
> > + WARN_ON_ONCE(p != current);
> > + p->sched_throttle_work.next = &p->sched_throttle_work;
> > +
> > + /*
> > + * If task is exiting, then there won't be a return to userspace, so we
> > + * don't have to bother with any of this.
> > + */
> > + if ((p->flags & PF_EXITING))
> > + return;
> > +
> > + rq = task_rq_lock(p, &rf);
>
> nit. With CLASS(task_rq_lock, rq_guard)(p), you can fetch the rq with
> "rq_gurad.rq" and the "goto out_unlock" can be replaced with simple
> return.
Got it, thanks for the suggestion.
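As an illustration only, the guard-based variant could look roughly like
the sketch below; it reuses the task_rq_lock lock guard from
kernel/sched/sched.h and otherwise mirrors the quoted function, so it is
not necessarily what the next version will carry:

static void throttle_cfs_rq_work(struct callback_head *work)
{
	struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
	struct cfs_rq *cfs_rq;
	struct rq *rq;

	WARN_ON_ONCE(p != current);
	p->sched_throttle_work.next = &p->sched_throttle_work;

	/* Exiting tasks never return to userspace, nothing to do. */
	if (p->flags & PF_EXITING)
		return;

	/* The guard drops the rq lock on every return path below. */
	CLASS(task_rq_lock, rq_guard)(p);
	rq = rq_guard.rq;
	cfs_rq = cfs_rq_of(&p->se);

	/* Raced with a class change, forget it. */
	if (p->sched_class != &fair_sched_class)
		return;

	/* Replenish happened or the task migrated off the throttled cfs_rq. */
	if (!cfs_rq->throttle_count)
		return;

	update_rq_clock(rq);
	WARN_ON_ONCE(!list_empty(&p->throttle_node));
	list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
	dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
	resched_curr(rq);
}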
> > +
> > + se = &p->se;
> > + cfs_rq = cfs_rq_of(se);
> > +
> > + /* Raced, forget */
> > + if (p->sched_class != &fair_sched_class)
> > + goto out_unlock;
> > +
> > + /*
> > + * If not in limbo, then either replenish has happened or this task got
> > + * migrated out of the throttled cfs_rq, move along
> > + */
> > + if (!cfs_rq->throttle_count)
> > + goto out_unlock;
> > +
> > + update_rq_clock(rq);
> > + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > + resched_curr(rq);
> > +
> > +out_unlock:
> > + task_rq_unlock(rq, p, &rf);
> > }
> >
> > void init_cfs_throttle_work(struct task_struct *p)
> > @@ -5873,32 +5914,81 @@ static int tg_unthrottle_up(struct task_group
> > *tg, void *data)
> > return 0;
> > }
> >
> > +static inline bool task_has_throttle_work(struct task_struct *p)
> > +{
> > + return p->sched_throttle_work.next != &p->sched_throttle_work;
> > +}
> > +
> > +static inline void task_throttle_setup_work(struct task_struct *p)
> > +{
> > + /*
> > + * Kthreads and exiting tasks don't return to userspace, so adding the
> > + * work is pointless
> > + */
> > + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> > + return;
> > +
> > + if (task_has_throttle_work(p))
> > + return;
> > +
> > + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> > +}
> > +
> > static int tg_throttle_down(struct task_group *tg, void *data)
> > {
> > struct rq *rq = data;
> > struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> > + struct task_struct *p;
> > + struct rb_node *node;
> > +
> > + cfs_rq->throttle_count++;
> > + if (cfs_rq->throttle_count > 1)
> > + return 0;
>
> General question: Do we need the throttled_lb_pair() check in
> can_migrate_task() with the per-task throttle? Moving a throttled task
> to another CPU can ensures that the task can run quicker and exit to
> user space as quickly as possible and once the task dequeues, it will
> remove itself from the list of fair tasks making it unreachable for
> the load balancer. Thoughts?
That's a good point.
The current approach dequeued the task and removed it from rq's
cfs_tasks list, causing it lose the load balance opportunity. This is
pretty sad.
I'll need to think about this. I hope we can somehow keep the throttled
tasks in cfs_tasks list, I'll see how to make this happen.
Thanks,
Aaron
> >
> > /* group is entering throttled state, stop time */
> > - if (!cfs_rq->throttle_count) {
> > - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > - list_del_leaf_cfs_rq(cfs_rq);
> > + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > + list_del_leaf_cfs_rq(cfs_rq);
> >
> > - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > - if (cfs_rq->nr_queued)
> > - cfs_rq->throttled_clock_self = rq_clock(rq);
> > + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > + if (cfs_rq->nr_queued)
> > + cfs_rq->throttled_clock_self = rq_clock(rq);
> > +
> > + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> > + /*
> > + * rq_lock is held, current is (obviously) executing this in kernelspace.
> > + *
> > + * All other tasks enqueued on this rq have their saved PC at the
> > + * context switch, so they will go through the kernel before returning
> > + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> > + * install the task_work on all of them.
> > + */
> > + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> > + while (node) {
> > + struct sched_entity *se = __node_2_se(node);
> > +
> > + if (!entity_is_task(se))
> > + goto next;
> > +
> > + p = task_of(se);
> > + task_throttle_setup_work(p);
> > +next:
> > + node = rb_next(node);
> > + }
> > +
> > + /* curr is not in the timeline tree */
> > + if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
> > + p = task_of(cfs_rq->curr);
> > + task_throttle_setup_work(p);
> > }
> > - cfs_rq->throttle_count++;
> >
> > return 0;
> > }
> >
>
> [..snip..]
>
> --
> Thanks and Regards,
> Prateek
>
Hello Aaron,
On 3/14/2025 2:27 PM, Aaron Lu wrote:
>>>
>>> +static inline bool task_has_throttle_work(struct task_struct *p)
>>> +{
>>> + return p->sched_throttle_work.next != &p->sched_throttle_work;
>>> +}
>>> +
>>> +static inline void task_throttle_setup_work(struct task_struct *p)
>>> +{
>>> + /*
>>> + * Kthreads and exiting tasks don't return to userspace, so adding the
>>> + * work is pointless
>>> + */
>>> + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
>>> + return;
>>> +
>>> + if (task_has_throttle_work(p))
>>> + return;
>>> +
>>> + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
>>> +}
>>> +
>>> static int tg_throttle_down(struct task_group *tg, void *data)
>>> {
>>> struct rq *rq = data;
>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>>> + struct task_struct *p;
>>> + struct rb_node *node;
>>> +
>>> + cfs_rq->throttle_count++;
>>> + if (cfs_rq->throttle_count > 1)
>>> + return 0;
>>
>> General question: Do we need the throttled_lb_pair() check in
>> can_migrate_task() with the per-task throttle? Moving a throttled task
>> to another CPU can ensures that the task can run quicker and exit to
>> user space as quickly as possible and once the task dequeues, it will
>> remove itself from the list of fair tasks making it unreachable for
>> the load balancer. Thoughts?
>
> That's a good point.
>
> The current approach dequeued the task and removed it from rq's
> cfs_tasks list, causing it lose the load balance opportunity. This is
> pretty sad.
That is fine. Today we have the throttled_lb_pair() check since tasks
on throttled hierarchy remain on the fair tasks list and the load
balancer should not move them around since they don't contribute to
current load in throttled state and moving them around will not change
anything since they'll still be waiting on another CPU for unthrottle.
With per-task throttle, we know that all the tasks on the fair task
list are runnable (except for the delayed ones but they contribute to
the load) - tasks on throttled hierarchy that exit to userspace will
dequeue themselves, removing them from the fair task list too.
Since a task that hasn't dequeued itself on a throttled hierarchy is
runnable, I'm suggesting we get rid of the throttled_lb_pair() check
in can_migrate_task() entirely.
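For reference, the check in question is the early bail-out at the top of
can_migrate_task(), roughly as it exists in mainline (the remaining
checks are elided in this excerpt):

static int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
	lockdep_assert_rq_held(env->src_rq);

	/*
	 * We do not migrate tasks that are:
	 * 1) throttled_lb_pair, or
	 * 2) cannot be migrated to this CPU due to cpus_ptr, or
	 * 3) running (obviously), or
	 * 4) are cache-hot on their current CPU.
	 */
	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
		return 0;

	/* cpus_ptr, task_on_cpu() and cache hotness checks elided */
	return 1;
}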
>
> I'll need to think about this. I hope we can somehow keep the throttled
> tasks in cfs_tasks list, I'll see how to make this happen.
>
> Thanks,
> Aaron
>
[..snip..]
--
Thanks and Regards,
Prateek
On Fri, Mar 14, 2025 at 02:42:46PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/14/2025 2:27 PM, Aaron Lu wrote:
> > > >
> > > > +static inline bool task_has_throttle_work(struct task_struct *p)
> > > > +{
> > > > + return p->sched_throttle_work.next != &p->sched_throttle_work;
> > > > +}
> > > > +
> > > > +static inline void task_throttle_setup_work(struct task_struct *p)
> > > > +{
> > > > + /*
> > > > + * Kthreads and exiting tasks don't return to userspace, so adding the
> > > > + * work is pointless
> > > > + */
> > > > + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> > > > + return;
> > > > +
> > > > + if (task_has_throttle_work(p))
> > > > + return;
> > > > +
> > > > + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> > > > +}
> > > > +
> > > > static int tg_throttle_down(struct task_group *tg, void *data)
> > > > {
> > > > struct rq *rq = data;
> > > > struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> > > > + struct task_struct *p;
> > > > + struct rb_node *node;
> > > > +
> > > > + cfs_rq->throttle_count++;
> > > > + if (cfs_rq->throttle_count > 1)
> > > > + return 0;
> > >
> > > General question: Do we need the throttled_lb_pair() check in
> > > can_migrate_task() with the per-task throttle? Moving a throttled task
> > > to another CPU can ensures that the task can run quicker and exit to
> > > user space as quickly as possible and once the task dequeues, it will
> > > remove itself from the list of fair tasks making it unreachable for
> > > the load balancer. Thoughts?
> >
> > That's a good point.
> >
> > The current approach dequeued the task and removed it from rq's
> > cfs_tasks list, causing it lose the load balance opportunity. This is
> > pretty sad.
>
> That is fine. Today we have the throttled_lb_pair() check since tasks
> on throttled hierarchy remain on the fair tasks list and the load
> balancer should not move them around since they don't contribute to
> current load in throttled state and moving them around will not change
> anything since they'll still be waiting on another CPU for unthrottle.
OK, I misunderstood. I thought tasks in a throttled hierarchy still had a
chance to participate in load balancing.
> With per-task throttle, we know that all the tasks on the fair task
> list are runnable (except for the delayed ones but they contribute to
> the load) - tasks on throttled hierarchy that exit to userspace will
> dequeue themselves, removing them from the fair task list too.
>
> Since a task that hasn't dequeued itself on a throttled hierarchy is
> runnable, I'm suggesting we get rid of the throttled_lb_pair() check
> in can_migrate_task() entirely.
Agree, will fix this in next version, thanks.
Best regards,
Aaron
> >
> > I'll need to think about this. I hope we can somehow keep the throttled
> > tasks in cfs_tasks list, I'll see how to make this happen.
> >
> > Thanks,
> > Aaron
> >
>
> [..snip..]
>
> --
> Thanks and Regards,
> Prateek
>
Hello Aaron,
P.S. I've fixed the wrapped lines and have been testing the series. So
far I haven't run into any issues yet on my machine. Will report back if
anything surface.
I've few comments inlined below.
On 3/13/2025 12:51 PM, Aaron Lu wrote:
[..snip..]
> +static inline void task_throttle_setup_work(struct task_struct *p)
> +{
> + /*
> + * Kthreads and exiting tasks don't return to userspace, so adding the
> + * work is pointless
> + */
> + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> + return;
> +
> + if (task_has_throttle_work(p))
> + return;
> +
> + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
Does it make sense to add a throttle work to a delayed task? It may be
dequeued soon and when it is queued back, the throttle situation might
have changed but the work is unnecessarily run. Could the throttle work
be instead added at the point of enqueue for delayed tasks?
> +}
> +
> static int tg_throttle_down(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> + struct task_struct *p;
> + struct rb_node *node;
> +
> + cfs_rq->throttle_count++;
> + if (cfs_rq->throttle_count > 1)
> + return 0;
>
> /* group is entering throttled state, stop time */
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
One concern here is that the PELT is seemingly frozen despite the
hierarchy being runnable. I've still not tracked down whether it'll
cause any problems once unthrottled and all throttled time is negated
from the pelt clock but is there any concerns here?
Maybe this can be done at dequeue when cfs_rq->nr_queued on a
throttled_hierarchy() reached 0.
> + list_del_leaf_cfs_rq(cfs_rq);
>
> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> - if (cfs_rq->nr_queued)
> - cfs_rq->throttled_clock_self = rq_clock(rq);
> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> + if (cfs_rq->nr_queued)
> + cfs_rq->throttled_clock_self = rq_clock(rq);
> +
> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> + /*
> + * rq_lock is held, current is (obviously) executing this in kernelspace.
> + *
> + * All other tasks enqueued on this rq have their saved PC at the
> + * context switch, so they will go through the kernel before returning
> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> + * install the task_work on all of them.
> + */
> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> + while (node) {
> + struct sched_entity *se = __node_2_se(node);
> +
> + if (!entity_is_task(se))
> + goto next;
> +
> + p = task_of(se);
> + task_throttle_setup_work(p);
> +next:
> + node = rb_next(node);
> + }
> +
> + /* curr is not in the timeline tree */
> + if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
I believe we can reach here from pick_next_task_fair() ->
check_cfs_rq_runtime() -> throttle_cfs_rq() in which case cfs_rq->curr
will still be set despite the task being blocked since put_prev_entity()
has not been called yet.
I believe there should be a check for task_on_rq_queued() here for the
current task.
> + p = task_of(cfs_rq->curr);
> + task_throttle_setup_work(p);
> }
> - cfs_rq->throttle_count++;
>
> return 0;
> }
>
[..snip..]
--
Thanks and Regards,
Prateek
On Thu, Mar 13, 2025 at 11:44:49PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> P.S. I've fixed the wrapped lines and have been testing the series. So
> far I haven't run into any issues yet on my machine. Will report back if
> anything surface.
Thanks a lot for taking the time to review and test.
>
> I've few comments inlined below.
>
> On 3/13/2025 12:51 PM, Aaron Lu wrote:
>
> [..snip..]
>
> > +static inline void task_throttle_setup_work(struct task_struct *p)
> > +{
> > + /*
> > + * Kthreads and exiting tasks don't return to userspace, so adding the
> > + * work is pointless
> > + */
> > + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> > + return;
> > +
> > + if (task_has_throttle_work(p))
> > + return;
> > +
> > + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
>
> Does it make sense to add a throttle work to a delayed task? It may be
I missed the case that a delayed task can still be on cfs_rq and I agree
there is no need to add throttle work to a delayed task.
> dequeued soon and when it is queued back, the throttle situation might
> have changed but the work is unnecessarily run. Could the throttle work
> be instead added at the point of enqueue for delayed tasks?
Yes. If a delayed task gets re-queued and its cfs_rq is in a throttled
hierarchy, the throttle work should be added to it then.
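A sketch of that idea, assuming the requeue of a delayed task goes
through the ENQUEUE_DELAYED path of enqueue_task_fair() as in mainline;
the helper name below is made up for illustration:

/*
 * Sketch only: install the throttle work when a delayed task is requeued
 * into a (still) throttled hierarchy.
 */
static void requeue_delayed_throttle_work(struct task_struct *p, int flags)
{
	if ((flags & ENQUEUE_DELAYED) && throttled_hierarchy(cfs_rq_of(&p->se)))
		task_throttle_setup_work(p);
}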
>
> > +}
> > +
> > static int tg_throttle_down(struct task_group *tg, void *data)
> > {
> > struct rq *rq = data;
> > struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> > + struct task_struct *p;
> > + struct rb_node *node;
> > +
> > + cfs_rq->throttle_count++;
> > + if (cfs_rq->throttle_count > 1)
> > + return 0;
> >
> > /* group is entering throttled state, stop time */
> > - if (!cfs_rq->throttle_count) {
> > - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > - list_del_leaf_cfs_rq(cfs_rq);
> > + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>
> One concern here is that the PELT is seemingly frozen despite the
> hierarchy being runnable. I've still not tracked down whether it'll
> cause any problems once unthrottled and all throttled time is negated
> from the pelt clock but is there any concerns here?
I chose to do it this way because:
1 I expect most of the time, if a task has to continue to run after its
cfs_rq gets throttled, the time is relatively small so should not cause
much impact. But I agree there can be times a task runs relatively long;
2 I think the original intent to freeze cfs_rq's pelt clock on throttle
is so that on unthrottle, it can restore its load (without its load being
decayed etc.). If I chose to not freeze its pelt clock on throttle
because some task is still running in kernel mode, since some of this
cfs_rq's tasks are throttled, its load can become smaller and this can
impact its load on unthrottle.
I think both approach is not perfect, so I chose the simple one for now
:) Not sure if my thinking is correct though.
> Maybe this can be done at dequeue when cfs_rq->nr_queued on a
> throttled_hierarchy() reached 0.
Yes, this looks more consistent, maybe I should change to this approach.
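For background, the PELT clock freeze under discussion works roughly like
this in mainline (kernel/sched/pelt.h): while throttle_count is set the
cfs_rq's PELT clock stays pinned at the value captured when the throttle
started, and the accumulated throttled time is subtracted once it
resumes:

/* Roughly as in mainline kernel/sched/pelt.h: */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
	/* While throttled, time appears not to advance for this cfs_rq... */
	if (unlikely(cfs_rq->throttle_count))
		return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;

	/* ...and the time spent throttled is subtracted afterwards. */
	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}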
> > + list_del_leaf_cfs_rq(cfs_rq);
> >
> > - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > - if (cfs_rq->nr_queued)
> > - cfs_rq->throttled_clock_self = rq_clock(rq);
> > + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > + if (cfs_rq->nr_queued)
> > + cfs_rq->throttled_clock_self = rq_clock(rq);
> > +
> > + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> > + /*
> > + * rq_lock is held, current is (obviously) executing this in kernelspace.
> > + *
> > + * All other tasks enqueued on this rq have their saved PC at the
> > + * context switch, so they will go through the kernel before returning
> > + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> > + * install the task_work on all of them.
> > + */
> > + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> > + while (node) {
> > + struct sched_entity *se = __node_2_se(node);
> > +
> > + if (!entity_is_task(se))
> > + goto next;
> > +
> > + p = task_of(se);
> > + task_throttle_setup_work(p);
> > +next:
> > + node = rb_next(node);
> > + }
> > +
> > + /* curr is not in the timeline tree */
> > + if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
>
> I believe we can reach here from pick_next_task_fair() ->
> check_cfs_rq_runtime() -> throttle_cfs_rq() in which case cfs_rq->curr
> will still be set despite the task being blocked since put_prev_entity()
> has not been called yet.
>
> I believe there should be a check for task_on_rq_queued() here for the
> current task.
Ah right, I'll see how to fix this.
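One possible shape for such a fix, sketched on top of the snippet quoted
above (task_on_rq_queued() filters out a curr that has already blocked
but has not been put yet):

	/* curr is not in the timeline tree */
	if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
		p = task_of(cfs_rq->curr);
		/* Skip a curr that already blocked but was not put yet. */
		if (task_on_rq_queued(p))
			task_throttle_setup_work(p);
	}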
Thanks,
Aaron
> > + p = task_of(cfs_rq->curr);
> > + task_throttle_setup_work(p);
> > }
> > - cfs_rq->throttle_count++;
> >
> > return 0;
> > }
> >
>
> [..snip..]
>
> --
> Thanks and Regards,
> Prateek
>
Hello Aaron,
On 3/14/2025 2:18 PM, Aaron Lu wrote:
>>> static int tg_throttle_down(struct task_group *tg, void *data)
>>> {
>>> struct rq *rq = data;
>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>>> + struct task_struct *p;
>>> + struct rb_node *node;
>>> +
>>> + cfs_rq->throttle_count++;
>>> + if (cfs_rq->throttle_count > 1)
>>> + return 0;
>>>
>>> /* group is entering throttled state, stop time */
>>> - if (!cfs_rq->throttle_count) {
>>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>> - list_del_leaf_cfs_rq(cfs_rq);
>>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>
>> One concern here is that the PELT is seemingly frozen despite the
>> hierarchy being runnable. I've still not tracked down whether it'll
>> cause any problems once unthrottled and all throttled time is negated
>> from the pelt clock but is there any concerns here?
>
> I chose to do it this way because:
> 1 I expect most of the time, if a task has to continue to run after its
> cfs_rq gets throttled, the time is relatively small so should not cause
> much impact. But I agree there can be times a task runs relatively long;
> 2 I think the original intent to freeze cfs_rq's pelt clock on throttle
> is so that on unthrottle, it can restore its load (without its load being
> decayed etc.). If I chose to not freeze its pelt clock on throttle
> because some task is still running in kernel mode, since some of this
> cfs_rq's tasks are throttled, its load can become smaller and this can
> impact its load on unthrottle.
>
> I think both approach is not perfect, so I chose the simple one for now
> :) Not sure if my thinking is correct though.
>
>> Maybe this can be done at dequeue when cfs_rq->nr_queued on a
>> throttled_hierarchy() reached 0.
>
> Yes, this looks more consistent, maybe I should change to this approach.
I agree the time might be small in most cases but some syscalls with
enough contention in the system can take a while to exit to user mode.
Even I'm not sure what the correct approach is here - should a
subtree's PELT be frozen when the last task dequeues or should we
freeze it for the whole hierarchy once the throttled cfs_rq dequeues?
I'll wait for other folks to chime in since they know these bits
better than me.
>
>>> + list_del_leaf_cfs_rq(cfs_rq);
>>>
>>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>> - if (cfs_rq->nr_queued)
>>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>> + if (cfs_rq->nr_queued)
>>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>>> +
>>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>>> + /*
>>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>>> + *
>>> + * All other tasks enqueued on this rq have their saved PC at the
>>> + * context switch, so they will go through the kernel before returning
>>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>>> + * install the task_work on all of them.
>>> + */
>>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>>> + while (node) {
>>> + struct sched_entity *se = __node_2_se(node);
>>> +
>>> + if (!entity_is_task(se))
>>> + goto next;
>>> +
>>> + p = task_of(se);
>>> + task_throttle_setup_work(p);
>>> +next:
>>> + node = rb_next(node);
>>> + }
>>> +
>>> + /* curr is not in the timeline tree */
>>> + if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
>>
>> I believe we can reach here from pick_next_task_fair() ->
>> check_cfs_rq_runtime() -> throttle_cfs_rq() in which case cfs_rq->curr
>> will still be set despite the task being blocked since put_prev_entity()
>> has not been called yet.
>>
>> I believe there should be a check for task_on_rq_queued() here for the
>> current task.
>
> Ah right, I'll see how to fix this.
It may not be necessary with the recent suggestion from Chengming where
you can just add the task work if the task was picked on a throttled
hierarchy.
--
Thanks and Regards,
Prateek
>
> Thanks,
> Aaron
>
>>> + p = task_of(cfs_rq->curr);
>>> + task_throttle_setup_work(p);
>>> }
>>> - cfs_rq->throttle_count++;
>>>
>>> return 0;
>>> }
>>>
>>
>> [..snip..]
>>
>> --
>> Thanks and Regards,
>> Prateek
>>