From: Valentin Schneider <vschneid@redhat.com>
In the current throttle model, when a cfs_rq is throttled, its entity will
be dequeued from the cpu's rq, making tasks attached to it unable to run
and thus achieving the throttle target.

This has a drawback though: assume a task is a reader of percpu_rwsem
and is waiting. When it gets woken up, it cannot run until its task
group's next period comes, which can be a relatively long time. The
waiting writer will have to wait longer because of this, and it also
makes further readers build up and eventually trigger task hung warnings.

To improve this situation, change the throttle model to a task-based one,
i.e. when a cfs_rq is throttled, record its throttled status but do not
remove it from the cpu's rq. Instead, for tasks that belong to this
cfs_rq, add a task work to them when they get picked, so that when they
return to userspace they can be dequeued there. In this way, throttled
tasks will not hold any kernel resources.

To avoid breaking bisect, preserve the current throttle behavior by
still dequeuing the throttled hierarchy from the rq; because of this, no
task can have that throttle task work added yet. The throttle model will
switch to task based in a later patch.
Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 88 +++++++++++++++++++++++++++++++++++++++-----
kernel/sched/sched.h | 1 +
2 files changed, 80 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 75bf6186a5137..e87ceb0a2d37f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5825,8 +5825,47 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static void throttle_cfs_rq_work(struct callback_head *work)
{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ scoped_guard(task_rq_lock, p) {
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ /*
+ * If not in limbo, then either replenish has happened or this
+ * task got migrated out of the throttled cfs_rq, move along.
+ */
+ if (!cfs_rq->throttle_count)
+ return;
+ rq = scope.rq;
+ update_rq_clock(rq);
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ resched_curr(rq);
+ }
+
+ cond_resched_tasks_rcu_qs();
}
void init_cfs_throttle_work(struct task_struct *p)
@@ -5866,21 +5905,42 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ if (task_has_throttle_work(p))
+ return;
+
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ cfs_rq->throttle_count++;
+ if (cfs_rq->throttle_count > 1)
+ return 0;
+
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
- WARN_ON_ONCE(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
- }
- cfs_rq->throttle_count++;
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_queued)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
return 0;
}
@@ -6575,6 +6635,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -6744,6 +6805,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void task_throttle_setup_work(struct task_struct *p) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
@@ -8851,6 +8913,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
+ struct task_struct *p;
again:
cfs_rq = &rq->cfs;
@@ -8871,7 +8934,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
- return task_of(se);
+ p = task_of(se);
+ if (throttled_hierarchy(cfs_rq_of(se))) {
+ /* Shuold not happen for now */
+ WARN_ON_ONCE(1);
+ task_throttle_setup_work(p);
+ }
+
+ return p;
}
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 921527327f107..83f16fc44884f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -736,6 +736,7 @@ struct cfs_rq {
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
+ struct list_head throttled_limbo_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
--
2.39.5
On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
> static int tg_throttle_down(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>
> + cfs_rq->throttle_count++;
> + if (cfs_rq->throttle_count > 1)
> + return 0;
	if (cfs_rq->throttle_count++)
		return 0;

vs

	if (--cfs_rq->throttle_count)
		return;
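For readers skimming the thread, here is a tiny standalone illustration
(plain userspace C written for this write-up, not kernel code) of the idiom
being suggested: folding the increment/decrement into the test means only
the 0 -> 1 and 1 -> 0 transitions do the real work, which matches the
"if (cfs_rq->throttle_count > 1) return 0;" form in the patch.

	#include <stdio.h>

	/* Stand-in for cfs_rq->throttle_count on one cpu. */
	static int throttle_count;

	static void tg_throttle_down_demo(void)
	{
		/* Post-increment: non-zero means an ancestor already throttled us. */
		if (throttle_count++)
			return;

		/* Only reached on the 0 -> 1 transition. */
		printf("entering throttled state, stop the clocks\n");
	}

	static void tg_unthrottle_up_demo(void)
	{
		/* Pre-decrement: non-zero means some ancestor is still throttled. */
		if (--throttle_count)
			return;

		/* Only reached on the 1 -> 0 transition. */
		printf("fully unthrottled, restart the clocks\n");
	}

	int main(void)
	{
		tg_throttle_down_demo();	/* prints the "entering" message */
		tg_throttle_down_demo();	/* nested throttle: just bumps the count */
		tg_unthrottle_up_demo();	/* still throttled by the outer level */
		tg_unthrottle_up_demo();	/* prints the "fully unthrottled" message */
		return 0;
	}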
On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
> @@ -8851,6 +8913,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> {
> struct sched_entity *se;
> struct cfs_rq *cfs_rq;
> + struct task_struct *p;
>
> again:
> cfs_rq = &rq->cfs;
> @@ -8871,7 +8934,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> cfs_rq = group_cfs_rq(se);
> } while (cfs_rq);
>
> - return task_of(se);
> + p = task_of(se);
> + if (throttled_hierarchy(cfs_rq_of(se))) {
> + /* Shuold not happen for now */
> + WARN_ON_ONCE(1);
> + task_throttle_setup_work(p);
> + }
> +
> + return p;
> }
So the final code is a little different, because you're removing the
return value from check_cfs_rq_runtime().
But would not that exact return value be the thing you're now checking
for again?
That is; at the end of the series, would not something like:
static struct task_struct *pick_task_fair(struct rq *rq)
{
	struct sched_entity *se;
	struct cfs_rq *cfs_rq;
	struct task_struct *p;
	bool throttled;

again:
	cfs_rq = &rq->cfs;
	if (!cfs_rq->nr_queued)
		return NULL;

	throttled = false;

	do {
		if (cfs_rq->curr && cfs_rq->curr->on_rq)
			update_curr(cfs_rq);

		throttled |= check_cfs_rq_runtime(cfs_rq);

		se = pick_next_entity(rq, cfs_rq);
		if (!se)
			goto again;

		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	if (unlikely(throttled))
		task_throttle_setup_work(p);
	return p;
}
make it more obvious / be simpler?
On Thu, May 22, 2025 at 01:07:28PM +0200, Peter Zijlstra wrote:
> On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
> > @@ -8851,6 +8913,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > {
> > struct sched_entity *se;
> > struct cfs_rq *cfs_rq;
> > + struct task_struct *p;
> >
> > again:
> > cfs_rq = &rq->cfs;
> > @@ -8871,7 +8934,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > cfs_rq = group_cfs_rq(se);
> > } while (cfs_rq);
> >
> > - return task_of(se);
> > + p = task_of(se);
> > + if (throttled_hierarchy(cfs_rq_of(se))) {
> > + /* Shuold not happen for now */
> > + WARN_ON_ONCE(1);
> > + task_throttle_setup_work(p);
> > + }
> > +
> > + return p;
> > }
>
> So the final code is a little different, because you're removing the
> return value from check_cfs_rq_runtime().
>
> But would not that exact return value be the thing you're now checking
> for again?
>
Ah yes.
> That is; at the end of the series, would not something like:
>
> static struct task_struct *pick_task_fair(struct rq *rq)
> {
> struct sched_entity *se;
> struct cfs_rq *cfs_rq;
> struct task_struct *p;
> bool throttled;
>
> again:
> cfs_rq = &rq->cfs;
> if (!cfs_rq->nr_queued)
> return NULL;
>
> throttled = false;
>
> do {
> if (cfs_rq->curr && cfs_rq->curr->on_rq)
> update_curr(cfs_rq);
>
> throttled |= check_cfs_rq_runtime(cfs_rq);
>
> se = pick_next_entity(rq, cfs_rq);
> if (!se)
> goto again;
>
> cfs_rq = group_cfs_rq(se);
> } while (cfs_rq);
>
> p = task_of(se);
> if (unlikely(throttled))
> task_throttle_setup_work(p);
> return p;
> }
>
> make it more obvious / be simpler?
Thanks for the suggestion, will follow it in next version.
On Fri, May 23, 2025 at 03:40:14PM +0800, Aaron Lu wrote:
> On Thu, May 22, 2025 at 01:07:28PM +0200, Peter Zijlstra wrote:
> > On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
> > > @@ -8851,6 +8913,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > > {
> > > struct sched_entity *se;
> > > struct cfs_rq *cfs_rq;
> > > + struct task_struct *p;
> > >
> > > again:
> > > cfs_rq = &rq->cfs;
> > > @@ -8871,7 +8934,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > > cfs_rq = group_cfs_rq(se);
> > > } while (cfs_rq);
> > >
> > > - return task_of(se);
> > > + p = task_of(se);
> > > + if (throttled_hierarchy(cfs_rq_of(se))) {
> > > + /* Shuold not happen for now */
> > > + WARN_ON_ONCE(1);
> > > + task_throttle_setup_work(p);
> > > + }
> > > +
> > > + return p;
> > > }
> >
> > So the final code is a little different, because you're removing the
> > return value from check_cfs_rq_runtime().
> >
> > But would not that exact return value be the thing you're now checking
> > for again?
> >
>
> Ah yes.
>
> > That is; at the end of the series, would not something like:
> >
> > static struct task_struct *pick_task_fair(struct rq *rq)
> > {
> > struct sched_entity *se;
> > struct cfs_rq *cfs_rq;
> > struct task_struct *p;
> > bool throttled;
> >
> > again:
> > cfs_rq = &rq->cfs;
> > if (!cfs_rq->nr_queued)
> > return NULL;
> >
> > throttled = false;
> >
> > do {
> > if (cfs_rq->curr && cfs_rq->curr->on_rq)
> > update_curr(cfs_rq);
> >
> > throttled |= check_cfs_rq_runtime(cfs_rq);
> >
> > se = pick_next_entity(rq, cfs_rq);
> > if (!se)
> > goto again;
> >
> > cfs_rq = group_cfs_rq(se);
> > } while (cfs_rq);
> >
> > p = task_of(se);
> > if (unlikely(throttled))
> > task_throttle_setup_work(p);
> > return p;
> > }
> >
> > make it more obvious / be simpler?
>
> Thanks for the suggestion, will follow it in next version.
Found a tiny difference while testing: check_cfs_rq_runtime() could
return false for a cfs_rq whose throttled_hierarchy() is true. The
reason is that a still-throttled cfs_rq may be assigned runtime by
another cpu doing distribute_cfs_runtime() and have an async unthrottle
queued for it that hasn't been processed yet. The end result is that it
has a positive runtime_remaining but isn't unthrottled yet. I think this
doesn't make much difference but thought it might be worth mentioning.
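To make that window easier to picture, here is an illustrative timeline
based purely on the description above (not code from the series):

	/*
	 * CPU1 refills the throttled cfs_rq's runtime and queues an async
	 * unthrottle for CPU0, but until CPU0 processes it,
	 * throttled_hierarchy() is still true even though
	 * check_cfs_rq_runtime() already returns false.
	 *
	 *   CPU0 (owns the throttled cfs_rq)     CPU1 (distribute_cfs_runtime())
	 *   --------------------------------     -------------------------------
	 *   throttle_count > 0                   gives the cfs_rq some runtime
	 *                                        queues async unthrottle (CSD)
	 *   pick_task_fair():
	 *     check_cfs_rq_runtime() -> false    (runtime_remaining is positive)
	 *     throttled_hierarchy()  -> true     (unthrottle not processed yet)
	 */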
A side note: now that check_cfs_rq_runtime() only marks the cfs_rq's
throttle status and returns a signal, and no longer does any dequeuing,
I suppose there is no need to call it in put_prev_entity()? That signal
is now only useful at pick time and we always run check_cfs_rq_runtime()
on every cfs_rq encountered during pick.

Also, check_enqueue_throttle() doesn't look useful either, because an
enqueued task will go through pick and we will add a throttle work to it
if needed. I removed these and ran some tests, and didn't notice
anything wrong yet, but perhaps I missed something. Comments?
Best regards,
Aaron
Hello Aaron,

On 5/29/2025 5:21 PM, Aaron Lu wrote:
> A side note, now that check_cfs_rq_runtime() only marks cfs_rq's
> throttle status and returns a signal, it no longer does dequeuing
> stuffs, I suppose there is no need to call it in put_prev_entity()?

But perhaps it is needed to queue the bandwidth timer via
__assign_cfs_rq_runtime() if the cfs_rq had built up slack when the last
task was dequeued?

Otherwise, the throttle will only be noted at the time of pick and the
timer will only start then, but check_cfs_rq_runtime() in
put_prev_entity() and check_enqueue_throttle() could have already
spotted a slack and queued the timer, which can ensure bandwidth is
available sooner and reduce latency.

Let me know if I'm terribly mistaken :)

> Because that signal is now only useful in pick time and we always run
> check_cfs_rq_runtime() on every cfs_rq encountered during pick.
>
> Also, check_enqueue_throttle() doesn't look useful either because
> enqueued task will go through pick and we will add a throttle work to it
> if needed. I removed these stuffs and run some tests, didn't notice
> anything wrong yet but perhaps I missed something, comments?
>
> Best regards,
> Aaron

--
Thanks and Regards,
Prateek
Hi Prateek,

On Fri, May 30, 2025 at 11:06:52AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 5/29/2025 5:21 PM, Aaron Lu wrote:
> > A side note, now that check_cfs_rq_runtime() only marks cfs_rq's
> > throttle status and returns a signal, it no longer does dequeuing
> > stuffs, I suppose there is no need to call it in put_prev_entity()?
>
> But perhaps it is needed to queue the bandwidth timer via
> __assign_cfs_rq_runtime() if the cfs_rq had built up slack when the last
> task was dequeued?
>
> Otherwise, the throttle will only be noted at the time of pick and the
> timer will only start then but check_cfs_rq_runtime() in
> put_prev_entity() and check_enqueue_throttle() could have already
> spotted a slack and queued the timer which can ensure bandwidth is
> available sooner and reduce latency.
>
> Let me know if I'm terribly mistaken :)

Sounds reasonable to me, thanks for the insight!

Best wishes,
Aaron

> > Because that signal is now only useful in pick time and we always run
> > check_cfs_rq_runtime() on every cfs_rq encountered during pick.
> > Also, check_enqueue_throttle() doesn't look useful either because
> > enqueued task will go through pick and we will add a throttle work to it
> > if needed. I removed these stuffs and run some tests, didn't notice
> > anything wrong yet but perhaps I missed something, comments?
> >
> > Best regards,
> > Aaron
>
> --
> Thanks and Regards,
> Prateek
On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
> static void throttle_cfs_rq_work(struct callback_head *work)
> {
> + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> + struct sched_entity *se;
> + struct cfs_rq *cfs_rq;
> + struct rq *rq;
> +
> + WARN_ON_ONCE(p != current);
> + p->sched_throttle_work.next = &p->sched_throttle_work;
> +
> + /*
> + * If task is exiting, then there won't be a return to userspace, so we
> + * don't have to bother with any of this.
> + */
> + if ((p->flags & PF_EXITING))
> + return;
> +
> + scoped_guard(task_rq_lock, p) {
> + se = &p->se;
> + cfs_rq = cfs_rq_of(se);
> +
> + /* Raced, forget */
> + if (p->sched_class != &fair_sched_class)
> + return;
> +
> + /*
> + * If not in limbo, then either replenish has happened or this
> + * task got migrated out of the throttled cfs_rq, move along.
> + */
> + if (!cfs_rq->throttle_count)
> + return;
> + rq = scope.rq;
> + update_rq_clock(rq);
> + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> + resched_curr(rq);
> + }
> +
> + cond_resched_tasks_rcu_qs();
> }
What's that cond_resched thing about? The general plan is to make
cond_resched go away.
On Thu, May 22, 2025 at 12:48:43PM +0200, Peter Zijlstra wrote:
> On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
>
> > static void throttle_cfs_rq_work(struct callback_head *work)
> > {
> > + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> > + struct sched_entity *se;
> > + struct cfs_rq *cfs_rq;
> > + struct rq *rq;
> > +
> > + WARN_ON_ONCE(p != current);
> > + p->sched_throttle_work.next = &p->sched_throttle_work;
> > +
> > + /*
> > + * If task is exiting, then there won't be a return to userspace, so we
> > + * don't have to bother with any of this.
> > + */
> > + if ((p->flags & PF_EXITING))
> > + return;
> > +
> > + scoped_guard(task_rq_lock, p) {
> > + se = &p->se;
> > + cfs_rq = cfs_rq_of(se);
> > +
> > + /* Raced, forget */
> > + if (p->sched_class != &fair_sched_class)
> > + return;
> > +
> > + /*
> > + * If not in limbo, then either replenish has happened or this
> > + * task got migrated out of the throttled cfs_rq, move along.
> > + */
> > + if (!cfs_rq->throttle_count)
> > + return;
> > + rq = scope.rq;
> > + update_rq_clock(rq);
> > + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > + resched_curr(rq);
> > + }
> > +
> > + cond_resched_tasks_rcu_qs();
> > }
>
> What's that cond_resched thing about? The general plan is to make
> cond_resched go away.
Got it.
The purpose is to let the throttled task schedule and also mark a task
rcu quiescent state. Without this cond_resched_tasks_rcu_qs(), this task
will be scheduled by the cond_resched() in task_work_run(), and since
that is a preempt schedule, it doesn't mark a task rcu quiescent state.

Any suggestion here? Perhaps a plain schedule()? Thanks.
On Thu, May 22, 2025 at 07:44:55PM +0800, Aaron Lu wrote:
> On Thu, May 22, 2025 at 12:48:43PM +0200, Peter Zijlstra wrote:
> > On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
> >
> > > static void throttle_cfs_rq_work(struct callback_head *work)
> > > {
> > > + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> > > + struct sched_entity *se;
> > > + struct cfs_rq *cfs_rq;
> > > + struct rq *rq;
> > > +
> > > + WARN_ON_ONCE(p != current);
> > > + p->sched_throttle_work.next = &p->sched_throttle_work;
> > > +
> > > + /*
> > > + * If task is exiting, then there won't be a return to userspace, so we
> > > + * don't have to bother with any of this.
> > > + */
> > > + if ((p->flags & PF_EXITING))
> > > + return;
> > > +
> > > + scoped_guard(task_rq_lock, p) {
> > > + se = &p->se;
> > > + cfs_rq = cfs_rq_of(se);
> > > +
> > > + /* Raced, forget */
> > > + if (p->sched_class != &fair_sched_class)
> > > + return;
> > > +
> > > + /*
> > > + * If not in limbo, then either replenish has happened or this
> > > + * task got migrated out of the throttled cfs_rq, move along.
> > > + */
> > > + if (!cfs_rq->throttle_count)
> > > + return;
> > > + rq = scope.rq;
> > > + update_rq_clock(rq);
> > > + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > > + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > > + resched_curr(rq);
> > > + }
> > > +
> > > + cond_resched_tasks_rcu_qs();
> > > }
> >
> > What's that cond_resched thing about? The general plan is to make
> > cond_resched go away.
>
> Got it.
>
> The purpose is to let throttled task schedule and also mark a task rcu
> quiescent state. Without this cond_resched_tasks_rcu_qs(), this task
> will be scheduled by cond_resched() in task_work_run() and since that is
> a preempt schedule, it didn't mark a task rcu quiescent state.
>
> Any suggestion here? Perhaps a plain schedule()? Thanks.
I am confused, this is task_work_run(), that is ran from
exit_to_user_mode_loop(), which contains a schedule().
On Thu, May 22, 2025 at 01:54:18PM +0200, Peter Zijlstra wrote:
> On Thu, May 22, 2025 at 07:44:55PM +0800, Aaron Lu wrote:
> > On Thu, May 22, 2025 at 12:48:43PM +0200, Peter Zijlstra wrote:
> > > On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
> > >
> > > > static void throttle_cfs_rq_work(struct callback_head *work)
> > > > {
> > > > + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> > > > + struct sched_entity *se;
> > > > + struct cfs_rq *cfs_rq;
> > > > + struct rq *rq;
> > > > +
> > > > + WARN_ON_ONCE(p != current);
> > > > + p->sched_throttle_work.next = &p->sched_throttle_work;
> > > > +
> > > > + /*
> > > > + * If task is exiting, then there won't be a return to userspace, so we
> > > > + * don't have to bother with any of this.
> > > > + */
> > > > + if ((p->flags & PF_EXITING))
> > > > + return;
> > > > +
> > > > + scoped_guard(task_rq_lock, p) {
> > > > + se = &p->se;
> > > > + cfs_rq = cfs_rq_of(se);
> > > > +
> > > > + /* Raced, forget */
> > > > + if (p->sched_class != &fair_sched_class)
> > > > + return;
> > > > +
> > > > + /*
> > > > + * If not in limbo, then either replenish has happened or this
> > > > + * task got migrated out of the throttled cfs_rq, move along.
> > > > + */
> > > > + if (!cfs_rq->throttle_count)
> > > > + return;
> > > > + rq = scope.rq;
> > > > + update_rq_clock(rq);
> > > > + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > > > + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > > > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > > > + resched_curr(rq);
> > > > + }
> > > > +
> > > > + cond_resched_tasks_rcu_qs();
> > > > }
> > >
> > > What's that cond_resched thing about? The general plan is to make
> > > cond_resched go away.
> >
> > Got it.
> >
> > The purpose is to let throttled task schedule and also mark a task rcu
> > quiescent state. Without this cond_resched_tasks_rcu_qs(), this task
> > will be scheduled by cond_resched() in task_work_run() and since that is
> > a preempt schedule, it didn't mark a task rcu quiescent state.
> >
> > Any suggestion here? Perhaps a plain schedule()? Thanks.
>
> I am confused, this is task_work_run(), that is ran from
> exit_to_user_mode_loop(), which contains a schedule().
There is a cond_resched() in the task_work_run() loop:

	do {
		next = work->next;
		work->func(work);
		work = next;
		cond_resched();
	} while (work);

And when this throttle work returns with the need_resched bit set,
cond_resched() will cause a schedule, but that doesn't mark a task
quiescent state...
On Thu, May 22, 2025 at 08:40:02PM +0800, Aaron Lu wrote:
> On Thu, May 22, 2025 at 01:54:18PM +0200, Peter Zijlstra wrote:
> > On Thu, May 22, 2025 at 07:44:55PM +0800, Aaron Lu wrote:
> > > On Thu, May 22, 2025 at 12:48:43PM +0200, Peter Zijlstra wrote:
> > > > On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
> > > >
> > > > > static void throttle_cfs_rq_work(struct callback_head *work)
> > > > > {
> > > > > + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> > > > > + struct sched_entity *se;
> > > > > + struct cfs_rq *cfs_rq;
> > > > > + struct rq *rq;
> > > > > +
> > > > > + WARN_ON_ONCE(p != current);
> > > > > + p->sched_throttle_work.next = &p->sched_throttle_work;
> > > > > +
> > > > > + /*
> > > > > + * If task is exiting, then there won't be a return to userspace, so we
> > > > > + * don't have to bother with any of this.
> > > > > + */
> > > > > + if ((p->flags & PF_EXITING))
> > > > > + return;
> > > > > +
> > > > > + scoped_guard(task_rq_lock, p) {
> > > > > + se = &p->se;
> > > > > + cfs_rq = cfs_rq_of(se);
> > > > > +
> > > > > + /* Raced, forget */
> > > > > + if (p->sched_class != &fair_sched_class)
> > > > > + return;
> > > > > +
> > > > > + /*
> > > > > + * If not in limbo, then either replenish has happened or this
> > > > > + * task got migrated out of the throttled cfs_rq, move along.
> > > > > + */
> > > > > + if (!cfs_rq->throttle_count)
> > > > > + return;
> > > > > + rq = scope.rq;
> > > > > + update_rq_clock(rq);
> > > > > + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > > > > + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > > > > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > > > > + resched_curr(rq);
> > > > > + }
> > > > > +
> > > > > + cond_resched_tasks_rcu_qs();
> > > > > }
> > > >
> > > > What's that cond_resched thing about? The general plan is to make
> > > > cond_resched go away.
> > >
> > > Got it.
> > >
> > > The purpose is to let throttled task schedule and also mark a task rcu
> > > quiescent state. Without this cond_resched_tasks_rcu_qs(), this task
> > > will be scheduled by cond_resched() in task_work_run() and since that is
> > > a preempt schedule, it didn't mark a task rcu quiescent state.
> > >
> > > Any suggestion here? Perhaps a plain schedule()? Thanks.
> >
> > I am confused, this is task_work_run(), that is ran from
> > exit_to_user_mode_loop(), which contains a schedule().
>
I should probably have added that the schedule() call contained in
exit_to_user_mode_loop() is early in that loop, where the to-be-throttled
task doesn't have need_resched bit set yet.
> There is a cond_resched() in task_work_run() loop:
>
> do {
> next = work->next;
> work->func(work);
> work = next;
> cond_resched();
> } while (work);
>
> And when this throttle work returns with need_resched bit set,
> cond_resched() will cause a schedule but that didn't mark a task
> quiescent state...
Another approach I can think of is to add a test of task_is_throttled()
in rcu_tasks_is_holdout(). I remember that when I tried this before, I
could hit the following path:

exit_to_user_mode_loop() -> get_signal() -> throttle_task_work() ->
do_exit() -> exit_signals() -> percpu_rwsem_wait() -> schedule() ->
try_to_block_task() -> dequeue_task_fair().

I would like to avoid this path, because it doesn't feel right for a
throttled task to go through another dequeue (cases like task group
change, affinity change etc. are special cases that have to be dealt
with though).

It looks to me like a schedule() call (or some other form of it) that
makes sure this throttled task gets scheduled in its task work is the
safest thing to do.
Thoughts?
Thanks,
Aaron
On Fri, May 23, 2025 at 05:53:50PM +0800, Aaron Lu wrote:
> On Thu, May 22, 2025 at 08:40:02PM +0800, Aaron Lu wrote:
> > On Thu, May 22, 2025 at 01:54:18PM +0200, Peter Zijlstra wrote:
> > > On Thu, May 22, 2025 at 07:44:55PM +0800, Aaron Lu wrote:
> > > > On Thu, May 22, 2025 at 12:48:43PM +0200, Peter Zijlstra wrote:
> > > > > On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
> > > > >
> > > > > > static void throttle_cfs_rq_work(struct callback_head *work)
> > > > > > {
> > > > > > + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> > > > > > + struct sched_entity *se;
> > > > > > + struct cfs_rq *cfs_rq;
> > > > > > + struct rq *rq;
> > > > > > +
> > > > > > + WARN_ON_ONCE(p != current);
> > > > > > + p->sched_throttle_work.next = &p->sched_throttle_work;
> > > > > > +
> > > > > > + /*
> > > > > > + * If task is exiting, then there won't be a return to userspace, so we
> > > > > > + * don't have to bother with any of this.
> > > > > > + */
> > > > > > + if ((p->flags & PF_EXITING))
> > > > > > + return;
> > > > > > +
> > > > > > + scoped_guard(task_rq_lock, p) {
> > > > > > + se = &p->se;
> > > > > > + cfs_rq = cfs_rq_of(se);
> > > > > > +
> > > > > > + /* Raced, forget */
> > > > > > + if (p->sched_class != &fair_sched_class)
> > > > > > + return;
> > > > > > +
> > > > > > + /*
> > > > > > + * If not in limbo, then either replenish has happened or this
> > > > > > + * task got migrated out of the throttled cfs_rq, move along.
> > > > > > + */
> > > > > > + if (!cfs_rq->throttle_count)
> > > > > > + return;
> > > > > > + rq = scope.rq;
> > > > > > + update_rq_clock(rq);
> > > > > > + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > > > > > + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > > > > > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > > > > > + resched_curr(rq);
> > > > > > + }
> > > > > > +
> > > > > > + cond_resched_tasks_rcu_qs();
> > > > > > }
> > > > >
> > > > > What's that cond_resched thing about? The general plan is to make
> > > > > cond_resched go away.
> > > >
> > > > Got it.
> > > >
> > > > The purpose is to let throttled task schedule and also mark a task rcu
> > > > quiescent state. Without this cond_resched_tasks_rcu_qs(), this task
> > > > will be scheduled by cond_resched() in task_work_run() and since that is
> > > > a preempt schedule, it didn't mark a task rcu quiescent state.
> > > >
> > > > Any suggestion here? Perhaps a plain schedule()? Thanks.
> > >
> > > I am confused, this is task_work_run(), that is ran from
> > > exit_to_user_mode_loop(), which contains a schedule().
> >
>
> I should probably have added that the schedule() call contained in
> exit_to_user_mode_loop() is early in that loop, where the to-be-throttled
> task doesn't have need_resched bit set yet.
No, but if it does get set, it will get picked up at:

	ti_work = read_thread_flags();

and since TIF_NEED_RESCHED is part of EXIT_TO_USER_MODE_WORK, we'll get
another cycle, and do the schedule() thing.
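For context, here is a heavily simplified sketch of the loop being
described, paraphrased from kernel/entry/common.c; flag names and helpers
vary between kernel versions, so treat it as an outline rather than the
exact code:

	static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
						    unsigned long ti_work)
	{
		/* Keep looping until no work bits are left. */
		while (ti_work & EXIT_TO_USER_MODE_WORK) {

			if (ti_work & _TIF_NEED_RESCHED)
				schedule();

			if (ti_work & _TIF_NOTIFY_RESUME)
				resume_user_mode_work(regs);	/* runs task_work, incl. throttle_cfs_rq_work() */

			/* ... other work bits elided ... */

			/*
			 * Re-read the flags: if the throttle work set
			 * need_resched, the next iteration hits the
			 * schedule() above.
			 */
			ti_work = read_thread_flags();
		}

		return ti_work;
	}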
> > There is a cond_resched() in task_work_run() loop:
> >
> > do {
> > next = work->next;
> > work->func(work);
> > work = next;
> > cond_resched();
> > } while (work);
That cond_resched() is equally going away.
> > And when this throttle work returns with need_resched bit set,
> > cond_resched() will cause a schedule but that didn't mark a task
> > quiescent state...
>
> Another approach I can think of is to add a test of task_is_throttled()
> in rcu_tasks_is_holdout(). I remembered when I tried this before, I can
> hit the following path:
So this really is about task_rcu needing something? Let me go look at
task-rcu.
So AFAICT, exit_to_user_mode_loop() will do schedule(), which will call
__schedule(SM_NONE), which then will have preempt = false and call:
rcu_note_context_switch(false) which in turn will do:
rcu_task_rq(current, false).
This should be sufficient, no?
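For reference, the Tasks-RCU hook reached from rcu_note_context_switch()
is the rcu_tasks_qs() helper; a rough sketch of its shape (paraphrasing
include/linux/rcupdate.h, exact code differs by kernel version) shows why
only a non-preempt schedule counts as a quiescent state here:

	/*
	 * Roughly what rcu_note_context_switch(preempt) ends up doing for
	 * Tasks RCU: only a voluntary (non-preempt) context switch clears
	 * the holdout state, which is why __schedule(SM_NONE) from
	 * exit_to_user_mode_loop() is enough while a preempt schedule is not.
	 */
	#define rcu_tasks_qs(t, preempt)					\
		do {								\
			if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout))	\
				WRITE_ONCE((t)->rcu_tasks_holdout, false);	\
		} while (0)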
On Fri, May 23, 2025 at 12:52:22PM +0200, Peter Zijlstra wrote:
> On Fri, May 23, 2025 at 05:53:50PM +0800, Aaron Lu wrote:
> > On Thu, May 22, 2025 at 08:40:02PM +0800, Aaron Lu wrote:
> > > On Thu, May 22, 2025 at 01:54:18PM +0200, Peter Zijlstra wrote:
> > > > On Thu, May 22, 2025 at 07:44:55PM +0800, Aaron Lu wrote:
> > > > > On Thu, May 22, 2025 at 12:48:43PM +0200, Peter Zijlstra wrote:
> > > > > > On Tue, May 20, 2025 at 06:41:05PM +0800, Aaron Lu wrote:
> > > > > >
> > > > > > > static void throttle_cfs_rq_work(struct callback_head *work)
> > > > > > > {
> > > > > > > + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> > > > > > > + struct sched_entity *se;
> > > > > > > + struct cfs_rq *cfs_rq;
> > > > > > > + struct rq *rq;
> > > > > > > +
> > > > > > > + WARN_ON_ONCE(p != current);
> > > > > > > + p->sched_throttle_work.next = &p->sched_throttle_work;
> > > > > > > +
> > > > > > > + /*
> > > > > > > + * If task is exiting, then there won't be a return to userspace, so we
> > > > > > > + * don't have to bother with any of this.
> > > > > > > + */
> > > > > > > + if ((p->flags & PF_EXITING))
> > > > > > > + return;
> > > > > > > +
> > > > > > > + scoped_guard(task_rq_lock, p) {
> > > > > > > + se = &p->se;
> > > > > > > + cfs_rq = cfs_rq_of(se);
> > > > > > > +
> > > > > > > + /* Raced, forget */
> > > > > > > + if (p->sched_class != &fair_sched_class)
> > > > > > > + return;
> > > > > > > +
> > > > > > > + /*
> > > > > > > + * If not in limbo, then either replenish has happened or this
> > > > > > > + * task got migrated out of the throttled cfs_rq, move along.
> > > > > > > + */
> > > > > > > + if (!cfs_rq->throttle_count)
> > > > > > > + return;
> > > > > > > + rq = scope.rq;
> > > > > > > + update_rq_clock(rq);
> > > > > > > + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > > > > > > + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > > > > > > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > > > > > > + resched_curr(rq);
> > > > > > > + }
> > > > > > > +
> > > > > > > + cond_resched_tasks_rcu_qs();
> > > > > > > }
> > > > > >
> > > > > > What's that cond_resched thing about? The general plan is to make
> > > > > > cond_resched go away.
> > > > >
> > > > > Got it.
> > > > >
> > > > > The purpose is to let throttled task schedule and also mark a task rcu
> > > > > quiescent state. Without this cond_resched_tasks_rcu_qs(), this task
> > > > > will be scheduled by cond_resched() in task_work_run() and since that is
> > > > > a preempt schedule, it didn't mark a task rcu quiescent state.
> > > > >
> > > > > Any suggestion here? Perhaps a plain schedule()? Thanks.
> > > >
> > > > I am confused, this is task_work_run(), that is ran from
> > > > exit_to_user_mode_loop(), which contains a schedule().
> > >
> >
> > I should probably have added that the schedule() call contained in
> > exit_to_user_mode_loop() is early in that loop, where the to-be-throttled
> > task doesn't have need_resched bit set yet.
>
> No, but if it does get set, it will get picked up at:
>
> ti_work = read_thread_flags();
>
> and since TIF_NEED_RESCHED is part of EXIT_TO_USER_MODE_WORK, we'll get
> another cycle, and do the schedule() thing.
>
> > > There is a cond_resched() in task_work_run() loop:
> > >
> > > do {
> > > next = work->next;
> > > work->func(work);
> > > work = next;
> > > cond_resched();
> > > } while (work);
>
> That cond_resched() is equally going away.
Good to know this.
As long as this cond_resched() goes away, there is no need for an
explicit schedule() or any of its other forms in this throttle task
work.
> > > And when this throttle work returns with need_resched bit set,
> > > cond_resched() will cause a schedule but that didn't mark a task
> > > quiescent state...
> >
> > Another approach I can think of is to add a test of task_is_throttled()
> > in rcu_tasks_is_holdout(). I remembered when I tried this before, I can
> > hit the following path:
>
> So this really is about task_rcu needing something? Let me go look at
> task-rcu.
Yes. I found this problem when using bpftrace to profile something and
bpftrace couldn't start until the test finished :)

I'm assuming bpftrace needs those throttled tasks to properly mark a qs
state. With this change here, bpftrace can start normally while the test
is running.
>
> So AFAICT, exit_to_user_mode_loop() will do schedule(), which will call
> __schedule(SM_NONE), which then will have preempt = false and call:
> rcu_note_context_switch(false) which in turn will do:
> rcu_task_rq(current, false).
>
> This should be sufficient, no?
Yes, as long as that cond_resched() in task_work_run() loop is gone.
I'll also give it a test and will let you know if I find anything
unexpected.
Thanks,
Aaron
On 2025/5/20 18:41, Aaron Lu wrote:
> From: Valentin Schneider <vschneid@redhat.com>
>
> In current throttle model, when a cfs_rq is throttled, its entity will
> be dequeued from cpu's rq, making tasks attached to it not able to run,
> thus achiveing the throttle target.
>
> This has a drawback though: assume a task is a reader of percpu_rwsem
> and is waiting. When it gets wakeup, it can not run till its task group's
> next period comes, which can be a relatively long time. Waiting writer
> will have to wait longer due to this and it also makes further reader
> build up and eventually trigger task hung.
>
> To improve this situation, change the throttle model to task based, i.e.
> when a cfs_rq is throttled, record its throttled status but do not remove
> it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
> they get picked, add a task work to them so that when they return
> to user, they can be dequeued. In this way, tasks throttled will not
> hold any kernel resources.
>
> To avoid breaking bisect, preserve the current throttle behavior by
> still dequeuing throttled hierarchy from rq and because of this, no task
> can have that throttle task work added yet. The throttle model will
> switch to task based in a later patch.
>
> Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
> Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
I'm wondering how about putting patches 02-04 together, since it's
strange to set up the task work in this patch without changing
throttle_cfs_rq(), which makes the reviewing process a bit confusing?
WDYT?
Thanks!
> ---
> kernel/sched/fair.c | 88 +++++++++++++++++++++++++++++++++++++++-----
> kernel/sched/sched.h | 1 +
> 2 files changed, 80 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 75bf6186a5137..e87ceb0a2d37f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5825,8 +5825,47 @@ static inline int throttled_lb_pair(struct task_group *tg,
> throttled_hierarchy(dest_cfs_rq);
> }
>
> +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static void throttle_cfs_rq_work(struct callback_head *work)
> {
> + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> + struct sched_entity *se;
> + struct cfs_rq *cfs_rq;
> + struct rq *rq;
> +
> + WARN_ON_ONCE(p != current);
> + p->sched_throttle_work.next = &p->sched_throttle_work;
> +
> + /*
> + * If task is exiting, then there won't be a return to userspace, so we
> + * don't have to bother with any of this.
> + */
> + if ((p->flags & PF_EXITING))
> + return;
> +
> + scoped_guard(task_rq_lock, p) {
> + se = &p->se;
> + cfs_rq = cfs_rq_of(se);
> +
> + /* Raced, forget */
> + if (p->sched_class != &fair_sched_class)
> + return;
> +
> + /*
> + * If not in limbo, then either replenish has happened or this
> + * task got migrated out of the throttled cfs_rq, move along.
> + */
> + if (!cfs_rq->throttle_count)
> + return;
> + rq = scope.rq;
> + update_rq_clock(rq);
> + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> + resched_curr(rq);
> + }
> +
> + cond_resched_tasks_rcu_qs();
> }
>
> void init_cfs_throttle_work(struct task_struct *p)
> @@ -5866,21 +5905,42 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> return 0;
> }
>
> +static inline bool task_has_throttle_work(struct task_struct *p)
> +{
> + return p->sched_throttle_work.next != &p->sched_throttle_work;
> +}
> +
> +static inline void task_throttle_setup_work(struct task_struct *p)
> +{
> + if (task_has_throttle_work(p))
> + return;
> +
> + /*
> + * Kthreads and exiting tasks don't return to userspace, so adding the
> + * work is pointless
> + */
> + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> + return;
> +
> + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> +}
> +
> static int tg_throttle_down(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>
> + cfs_rq->throttle_count++;
> + if (cfs_rq->throttle_count > 1)
> + return 0;
> +
> /* group is entering throttled state, stop time */
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> + list_del_leaf_cfs_rq(cfs_rq);
>
> - WARN_ON_ONCE(cfs_rq->throttled_clock_self);
> - if (cfs_rq->nr_queued)
> - cfs_rq->throttled_clock_self = rq_clock(rq);
> - }
> - cfs_rq->throttle_count++;
> + WARN_ON_ONCE(cfs_rq->throttled_clock_self);
> + if (cfs_rq->nr_queued)
> + cfs_rq->throttled_clock_self = rq_clock(rq);
>
> return 0;
> }
> @@ -6575,6 +6635,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> cfs_rq->runtime_enabled = 0;
> INIT_LIST_HEAD(&cfs_rq->throttled_list);
> INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
> + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
> }
>
> void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> @@ -6744,6 +6805,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
> static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
> static inline void sync_throttle(struct task_group *tg, int cpu) {}
> static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> +static void task_throttle_setup_work(struct task_struct *p) {}
>
> static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> {
> @@ -8851,6 +8913,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> {
> struct sched_entity *se;
> struct cfs_rq *cfs_rq;
> + struct task_struct *p;
>
> again:
> cfs_rq = &rq->cfs;
> @@ -8871,7 +8934,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> cfs_rq = group_cfs_rq(se);
> } while (cfs_rq);
>
> - return task_of(se);
> + p = task_of(se);
> + if (throttled_hierarchy(cfs_rq_of(se))) {
> + /* Shuold not happen for now */
> + WARN_ON_ONCE(1);
> + task_throttle_setup_work(p);
> + }
> +
> + return p;
> }
>
> static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 921527327f107..83f16fc44884f 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -736,6 +736,7 @@ struct cfs_rq {
> int throttle_count;
> struct list_head throttled_list;
> struct list_head throttled_csd_list;
> + struct list_head throttled_limbo_list;
> #endif /* CONFIG_CFS_BANDWIDTH */
> #endif /* CONFIG_FAIR_GROUP_SCHED */
> };
On Wed, May 21, 2025 at 05:01:58PM +0800, Chengming Zhou wrote:
> On 2025/5/20 18:41, Aaron Lu wrote:
> > From: Valentin Schneider <vschneid@redhat.com>
> >
> > In current throttle model, when a cfs_rq is throttled, its entity will
> > be dequeued from cpu's rq, making tasks attached to it not able to run,
> > thus achiveing the throttle target.
> >
> > This has a drawback though: assume a task is a reader of percpu_rwsem
> > and is waiting. When it gets wakeup, it can not run till its task group's
> > next period comes, which can be a relatively long time. Waiting writer
> > will have to wait longer due to this and it also makes further reader
> > build up and eventually trigger task hung.
> >
> > To improve this situation, change the throttle model to task based, i.e.
> > when a cfs_rq is throttled, record its throttled status but do not remove
> > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
> > they get picked, add a task work to them so that when they return
> > to user, they can be dequeued. In this way, tasks throttled will not
> > hold any kernel resources.
> >
> > To avoid breaking bisect, preserve the current throttle behavior by
> > still dequeuing throttled hierarchy from rq and because of this, no task
> > can have that throttle task work added yet. The throttle model will
> > switch to task based in a later patch.
> >
> > Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
> > Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
>
> I'm wondering how about put 02-04 patches together, since it's strange
> to setup task work in this patch but without changing throttle_cfs_rq(),
Do you mean 02-05?
Because the actual change to throttle_cfs_rq() happens in patch5 :)
> which makes the reviewing process a bit confused? WDYT?
Yes, I agree it looks a bit confusing.
The point is to not break bisect while making review easier; if we merge
all the task based throttle related patches together, that would mean
putting patches 02-05 together, which seems too big?
Thanks,
Aaron
> > ---
> > kernel/sched/fair.c | 88 +++++++++++++++++++++++++++++++++++++++-----
> > kernel/sched/sched.h | 1 +
> > 2 files changed, 80 insertions(+), 9 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 75bf6186a5137..e87ceb0a2d37f 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5825,8 +5825,47 @@ static inline int throttled_lb_pair(struct task_group *tg,
> > throttled_hierarchy(dest_cfs_rq);
> > }
> > +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> > static void throttle_cfs_rq_work(struct callback_head *work)
> > {
> > + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> > + struct sched_entity *se;
> > + struct cfs_rq *cfs_rq;
> > + struct rq *rq;
> > +
> > + WARN_ON_ONCE(p != current);
> > + p->sched_throttle_work.next = &p->sched_throttle_work;
> > +
> > + /*
> > + * If task is exiting, then there won't be a return to userspace, so we
> > + * don't have to bother with any of this.
> > + */
> > + if ((p->flags & PF_EXITING))
> > + return;
> > +
> > + scoped_guard(task_rq_lock, p) {
> > + se = &p->se;
> > + cfs_rq = cfs_rq_of(se);
> > +
> > + /* Raced, forget */
> > + if (p->sched_class != &fair_sched_class)
> > + return;
> > +
> > + /*
> > + * If not in limbo, then either replenish has happened or this
> > + * task got migrated out of the throttled cfs_rq, move along.
> > + */
> > + if (!cfs_rq->throttle_count)
> > + return;
> > + rq = scope.rq;
> > + update_rq_clock(rq);
> > + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > + resched_curr(rq);
> > + }
> > +
> > + cond_resched_tasks_rcu_qs();
> > }
> > void init_cfs_throttle_work(struct task_struct *p)
> > @@ -5866,21 +5905,42 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> > return 0;
> > }
> > +static inline bool task_has_throttle_work(struct task_struct *p)
> > +{
> > + return p->sched_throttle_work.next != &p->sched_throttle_work;
> > +}
> > +
> > +static inline void task_throttle_setup_work(struct task_struct *p)
> > +{
> > + if (task_has_throttle_work(p))
> > + return;
> > +
> > + /*
> > + * Kthreads and exiting tasks don't return to userspace, so adding the
> > + * work is pointless
> > + */
> > + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> > + return;
> > +
> > + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> > +}
> > +
> > static int tg_throttle_down(struct task_group *tg, void *data)
> > {
> > struct rq *rq = data;
> > struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> > + cfs_rq->throttle_count++;
> > + if (cfs_rq->throttle_count > 1)
> > + return 0;
> > +
> > /* group is entering throttled state, stop time */
> > - if (!cfs_rq->throttle_count) {
> > - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > - list_del_leaf_cfs_rq(cfs_rq);
> > + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > + list_del_leaf_cfs_rq(cfs_rq);
> > - WARN_ON_ONCE(cfs_rq->throttled_clock_self);
> > - if (cfs_rq->nr_queued)
> > - cfs_rq->throttled_clock_self = rq_clock(rq);
> > - }
> > - cfs_rq->throttle_count++;
> > + WARN_ON_ONCE(cfs_rq->throttled_clock_self);
> > + if (cfs_rq->nr_queued)
> > + cfs_rq->throttled_clock_self = rq_clock(rq);
> > return 0;
> > }
> > @@ -6575,6 +6635,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> > cfs_rq->runtime_enabled = 0;
> > INIT_LIST_HEAD(&cfs_rq->throttled_list);
> > INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
> > + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
> > }
> > void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> > @@ -6744,6 +6805,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
> > static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
> > static inline void sync_throttle(struct task_group *tg, int cpu) {}
> > static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> > +static void task_throttle_setup_work(struct task_struct *p) {}
> > static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> > {
> > @@ -8851,6 +8913,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > {
> > struct sched_entity *se;
> > struct cfs_rq *cfs_rq;
> > + struct task_struct *p;
> > again:
> > cfs_rq = &rq->cfs;
> > @@ -8871,7 +8934,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > cfs_rq = group_cfs_rq(se);
> > } while (cfs_rq);
> > - return task_of(se);
> > + p = task_of(se);
> > + if (throttled_hierarchy(cfs_rq_of(se))) {
> > + /* Shuold not happen for now */
> > + WARN_ON_ONCE(1);
> > + task_throttle_setup_work(p);
> > + }
> > +
> > + return p;
> > }
> > static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 921527327f107..83f16fc44884f 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -736,6 +736,7 @@ struct cfs_rq {
> > int throttle_count;
> > struct list_head throttled_list;
> > struct list_head throttled_csd_list;
> > + struct list_head throttled_limbo_list;
> > #endif /* CONFIG_CFS_BANDWIDTH */
> > #endif /* CONFIG_FAIR_GROUP_SCHED */
> > };
On 2025/5/21 17:21, Aaron Lu wrote:
> On Wed, May 21, 2025 at 05:01:58PM +0800, Chengming Zhou wrote:
>> On 2025/5/20 18:41, Aaron Lu wrote:
>>> From: Valentin Schneider <vschneid@redhat.com>
>>>
>>> In current throttle model, when a cfs_rq is throttled, its entity will
>>> be dequeued from cpu's rq, making tasks attached to it not able to run,
>>> thus achiveing the throttle target.
>>>
>>> This has a drawback though: assume a task is a reader of percpu_rwsem
>>> and is waiting. When it gets wakeup, it can not run till its task group's
>>> next period comes, which can be a relatively long time. Waiting writer
>>> will have to wait longer due to this and it also makes further reader
>>> build up and eventually trigger task hung.
>>>
>>> To improve this situation, change the throttle model to task based, i.e.
>>> when a cfs_rq is throttled, record its throttled status but do not remove
>>> it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
>>> they get picked, add a task work to them so that when they return
>>> to user, they can be dequeued. In this way, tasks throttled will not
>>> hold any kernel resources.
>>>
>>> To avoid breaking bisect, preserve the current throttle behavior by
>>> still dequeuing throttled hierarchy from rq and because of this, no task
>>> can have that throttle task work added yet. The throttle model will
>>> switch to task based in a later patch.
>>>
>>> Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
>>> Signed-off-by: Valentin Schneider <vschneid@redhat.com>
>>> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
>>
>> I'm wondering how about put 02-04 patches together, since it's strange
>> to setup task work in this patch but without changing throttle_cfs_rq(),
>
> Do you mean 02-05?
> Because the actual change to throttle_cfs_rq() happens in patch5 :)

Ah, right.

>
>> which makes the reviewing process a bit confused? WDYT?
>
> Yes, I agree it looks a bit confused.
>
> The point is to not break bisect while make review easier; if merging
> all task based throttle related patches together, that would be to put
> patch 02-05 together, which seems too big?

Yeah, a big patch but complete, instead of changing a bit on the same
function in each patch. Anyway, it's your call :-)

Thanks!

>
> Thanks,
> Aaron
>
On Thu, May 22, 2025 at 07:43:40PM +0800, Chengming Zhou wrote:
> On 2025/5/21 17:21, Aaron Lu wrote:
> > On Wed, May 21, 2025 at 05:01:58PM +0800, Chengming Zhou wrote:
> > > On 2025/5/20 18:41, Aaron Lu wrote:
> > > > From: Valentin Schneider <vschneid@redhat.com>
> > > >
> > > > In current throttle model, when a cfs_rq is throttled, its entity will
> > > > be dequeued from cpu's rq, making tasks attached to it not able to run,
> > > > thus achiveing the throttle target.
> > > >
> > > > This has a drawback though: assume a task is a reader of percpu_rwsem
> > > > and is waiting. When it gets wakeup, it can not run till its task group's
> > > > next period comes, which can be a relatively long time. Waiting writer
> > > > will have to wait longer due to this and it also makes further reader
> > > > build up and eventually trigger task hung.
> > > >
> > > > To improve this situation, change the throttle model to task based, i.e.
> > > > when a cfs_rq is throttled, record its throttled status but do not remove
> > > > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
> > > > they get picked, add a task work to them so that when they return
> > > > to user, they can be dequeued. In this way, tasks throttled will not
> > > > hold any kernel resources.
> > > >
> > > > To avoid breaking bisect, preserve the current throttle behavior by
> > > > still dequeuing throttled hierarchy from rq and because of this, no task
> > > > can have that throttle task work added yet. The throttle model will
> > > > switch to task based in a later patch.
> > > >
> > > > Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
> > > > Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> > > > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> > >
> > > I'm wondering how about put 02-04 patches together, since it's strange
> > > to setup task work in this patch but without changing throttle_cfs_rq(),
> >
> > Do you mean 02-05?
> > Because the actual change to throttle_cfs_rq() happens in patch5 :)
>
> Ah, right.
>
> >
> > > which makes the reviewing process a bit confused? WDYT?
> >
> > Yes, I agree it looks a bit confused.
> >
> > The point is to not break bisect while make review easier; if merging
> > all task based throttle related patches together, that would be to put
> > patch 02-05 together, which seems too big?
>
> Yeah, a big patch but complete, instead of changing a bit on the same
> function in each patch. Anyway, it's your call :-)
>

Thanks for the suggestion and I can try that in the next version. I'm
thinking maybe one patch for the newly added data structures and one
patch for the throttle related helper functions, then a complete patch
to switch to task based throttle, and the rest stays the same. Let's
also see if others have opinions on this :)
On Tue, 2025-05-20 at 18:41 +0800, Aaron Lu wrote:
> @@ -6744,6 +6805,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
> static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
> static inline void sync_throttle(struct task_group *tg, int cpu) {}
> static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> +static void task_throttle_setup_work(struct task_struct *p) {}
>
> static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> {
> @@ -8851,6 +8913,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> {
> struct sched_entity *se;
> struct cfs_rq *cfs_rq;
> + struct task_struct *p;
>
> again:
> cfs_rq = &rq->cfs;
> @@ -8871,7 +8934,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> cfs_rq = group_cfs_rq(se);
> } while (cfs_rq);
>
> - return task_of(se);
> + p = task_of(se);
> + if (throttled_hierarchy(cfs_rq_of(se))) {
> + /* Shuold not happen for now */
Typo: s/Shuold/Should
Btw: Is there a trace point in place when throttling/unthrottling
happens? Would be nice to see that in a trace, but might be that I
missed those events in my configuration up to now.
> + WARN_ON_ONCE(1);
> + task_throttle_setup_work(p);
> + }
> +
> + return p;
> }
On Tue, May 20, 2025 at 02:02:54PM +0200, Florian Bezdeka wrote:
> On Tue, 2025-05-20 at 18:41 +0800, Aaron Lu wrote:
> > @@ -6744,6 +6805,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
> > static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
> > static inline void sync_throttle(struct task_group *tg, int cpu) {}
> > static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> > +static void task_throttle_setup_work(struct task_struct *p) {}
> >
> > static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> > {
> > @@ -8851,6 +8913,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > {
> > struct sched_entity *se;
> > struct cfs_rq *cfs_rq;
> > + struct task_struct *p;
> >
> > again:
> > cfs_rq = &rq->cfs;
> > @@ -8871,7 +8934,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > cfs_rq = group_cfs_rq(se);
> > } while (cfs_rq);
> >
> > - return task_of(se);
> > + p = task_of(se);
> > + if (throttled_hierarchy(cfs_rq_of(se))) {
> > + /* Shuold not happen for now */
>
> Typo: s/Shuold/Should
>
Ah right, thanks.
> Btw: Is there a trace point in place when throttling/unthrottling
> happens? Would be nice to see that in a trace, but might be that I
> missed those events in my configuration up to now.
There is no trace point for throttle related events yet.
I tried to see if bpftrace can do the job but unfortunately, bpftrace
does not allow a variable as an array index, so I failed to get the
cfs_rq when probing tg_throttle_down(tg, data)...
Thanks,
Aaron
> > + WARN_ON_ONCE(1);
> > + task_throttle_setup_work(p);
> > + }
> > +
> > + return p;
> > }
>
On Wed, May 21, 2025 at 02:37:05PM +0800, Aaron Lu wrote:
> On Tue, May 20, 2025 at 02:02:54PM +0200, Florian Bezdeka wrote:
> > On Tue, 2025-05-20 at 18:41 +0800, Aaron Lu wrote:
> > > @@ -6744,6 +6805,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
> > > static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
> > > static inline void sync_throttle(struct task_group *tg, int cpu) {}
> > > static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> > > +static void task_throttle_setup_work(struct task_struct *p) {}
> > >
> > > static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> > > {
> > > @@ -8851,6 +8913,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > > {
> > > struct sched_entity *se;
> > > struct cfs_rq *cfs_rq;
> > > + struct task_struct *p;
> > >
> > > again:
> > > cfs_rq = &rq->cfs;
> > > @@ -8871,7 +8934,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > > cfs_rq = group_cfs_rq(se);
> > > } while (cfs_rq);
> > >
> > > - return task_of(se);
> > > + p = task_of(se);
> > > + if (throttled_hierarchy(cfs_rq_of(se))) {
> > > + /* Shuold not happen for now */
> >
> > Typo: s/Shuold/Should
> >
>
> Ah right, thanks.
>
> > Btw: Is there a trace point in place when throttling/unthrottling
> > happens? Would be nice to see that in a trace, but might be that I
> > missed those events in my configuration up to now.
>
> There is no trace point for throttle related events yet.
>
> I tried to see if bpftrace can do the job but unfortunately, bpftrace
> does not allow variable as array indice so I failed to get cfs_rq when
> probing tg_throttle_down(tg, data)...
>
@ajor from IRC helped me solve this problem so I think the following
bpftrace script can somehow serve the purpose before trace events for
throttle/unthrottle are supported:
kfunc:tg_throttle_down
{
	$rq = (struct rq *)args->data;
	$cpu = $rq->cpu;
	$cfs_rq = *(args->tg->cfs_rq + $cpu);
	if ($cfs_rq->throttle_count == 0) {
		printf("%llu cfs_rq %s/%d throttled\n", nsecs, str(args->tg->css.cgroup->kn->name), $cpu);
	}
}

kfunc:tg_unthrottle_up
{
	$rq = (struct rq *)args->data;
	$cpu = $rq->cpu;
	$cfs_rq = *(args->tg->cfs_rq + $cpu);
	if ($cfs_rq->throttle_count == 1) {
		printf("%llu cfs_rq %s/%d unthrottled\n", nsecs, str(args->tg->css.cgroup->kn->name), $cpu);
	}
}
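In case it helps someone reproduce this: the script can be saved to a file
(say throttle.bt, the name is only an example) and run as root with
"bpftrace throttle.bt"; the kfunc probes rely on a kernel built with BTF,
so it may need adapting on other configurations.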