On task group change, for a task whose on_rq equals TASK_ON_RQ_QUEUED,
core will dequeue it and then requeue it.

A throttled task is still considered queued by core because p->on_rq
is still set, so core will dequeue it; but since the task is already
dequeued on throttle in fair, handle this case properly.

Affinity and sched class changes are similar.
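The core-side pattern referred to above looks roughly like the sketch
below (simplified from callers such as sched_move_task() /
__sched_setscheduler() in kernel/sched/core.c; not the exact upstream
code, flags abbreviated):

        if (task_on_rq_queued(p)) {
                dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
                /* change task group / affinity / sched class here */
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        }

A throttled task still has p->on_rq set, so it takes this path as well.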
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 74bc320cbc238..4c66fd8d24389 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5866,6 +5866,10 @@ static void throttle_cfs_rq_work(struct callback_head *work)
update_rq_clock(rq);
WARN_ON_ONCE(!list_empty(&p->throttle_node));
dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+ /*
+ * Must not add it to limbo list before dequeue or dequeue will
+ * mistakenly regard this task as an already throttled one.
+ */
list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
resched_curr(rq);
}
@@ -5881,6 +5885,20 @@ void init_cfs_throttle_work(struct task_struct *p)
INIT_LIST_HEAD(&p->throttle_node);
}
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+ /*
+ * Task is throttled and someone wants to dequeue it again:
+ * it must be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc.
+ */
+ WARN_ON_ONCE(p->se.on_rq);
+ WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
+
+ list_del_init(&p->throttle_node);
+}
+
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
@@ -6834,6 +6852,7 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void task_throttle_setup_work(struct task_struct *p) {}
static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
@@ -7281,6 +7300,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
+ if (unlikely(task_is_throttled(p))) {
+ dequeue_throttled_task(p, flags);
+ return true;
+ }
+
if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
util_est_dequeue(&rq->cfs, p);
--
2.39.5
On 2025/5/20 18:41, Aaron Lu wrote:
> On task group change, for tasks whose on_rq equals to TASK_ON_RQ_QUEUED,
> core will dequeue it and then requeued it.
>
> The throttled task is still considered as queued by core because p->on_rq
> is still set so core will dequeue it, but since the task is already
> dequeued on throttle in fair, handle this case properly.
>
> Affinity and sched class change is similar.
How about setting p->on_rq to 0 when throttled? That would reflect the
fact that the task is not on the cfs queue anymore. Does this method
cause any problem?

Thanks!
On Fri, May 23, 2025 at 10:43:53AM +0800, Chengming Zhou wrote:
> On 2025/5/20 18:41, Aaron Lu wrote:
> > On task group change, for tasks whose on_rq equals to TASK_ON_RQ_QUEUED,
> > core will dequeue it and then requeued it.
> >
> > The throttled task is still considered as queued by core because p->on_rq
> > is still set so core will dequeue it, but since the task is already
> > dequeued on throttle in fair, handle this case properly.
> >
> > Affinity and sched class change is similar.
>
> How about setting p->on_rq to 0 when throttled? which is the fact that
> the task is not on cfs queue anymore, does this method cause any problem?
>
On task group change/affinity change etc., if the throttled task is
regarded as !on_rq, then it will miss the chance to be enqueued to the
new (and correct) cfs_rq; instead, it will be enqueued back to its
original cfs_rq on unthrottle, which breaks affinity or task group
settings. We may be able to do something in tg_unthrottle_up() to take
special care of these situations, but it seems like a lot of headache.
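For context, the unthrottle path in this series re-enqueues limbo tasks
on the cfs_rq they were throttled on, roughly like the loop below (the
same loop is visible in a later reply in this thread), so a stale limbo
entry ends up back on the old, wrong cfs_rq:

        /* Re-enqueue the tasks that have been throttled at this level. */
        list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
                list_del_init(&p->throttle_node);
                enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
        }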
Also, for task group change, if the new task group does not have a
throttle setting, that throttled task should be allowed to run
immediately instead of waiting for its old cfs_rq's unthrottle event.
The same is true when this throttled task changes its sched class, like
from fair to rt.

Makes sense?
Thanks,
Aaron
On 2025/5/23 15:56, Aaron Lu wrote:
> On Fri, May 23, 2025 at 10:43:53AM +0800, Chengming Zhou wrote:
>> On 2025/5/20 18:41, Aaron Lu wrote:
>>> On task group change, for tasks whose on_rq equals to TASK_ON_RQ_QUEUED,
>>> core will dequeue it and then requeued it.
>>>
>>> The throttled task is still considered as queued by core because p->on_rq
>>> is still set so core will dequeue it, but since the task is already
>>> dequeued on throttle in fair, handle this case properly.
>>>
>>> Affinity and sched class change is similar.
>>
>> How about setting p->on_rq to 0 when throttled? which is the fact that
>> the task is not on cfs queue anymore, does this method cause any problem?
>>
>
> On task group change/affinity change etc. if the throttled task is
> regarded as !on_rq, then it will miss the chance to be enqueued to the
> new(and correct) cfs_rqs, instead, it will be enqueued back to its
> original cfs_rq on unthrottle which breaks affinity or task group
Yeah, this is indeed a problem. I was thinking of deleting the throttled
task from the cfs_rq limbo list, then adding it to another cfs_rq's
limbo list or cfs_rq runnable tree based on the new cfs_rq's throttle
status.

But it's much more complex compared with your current method.
> settings. We may be able to do something in tg_unthrottle_up() to take
> special care of these situations, but it seems a lot of headaches.
>
> Also, for task group change, if the new task group does not have throttle
> setting, that throttled task should be allowed to run immediately instead
> of waiting for its old cfs_rq's unthrottle event. Similar is true when
> this throttled task changed its sched class, like from fair to rt.
>
> Makes sense?
Ok, another problem with the current method I can think of is PELT
maintenance: we skip the actual dequeue_task_fair() process, which
includes the PELT detach, and just delete the task from the cfs_rq limbo
list, so it can result in PELT maintenance errors.
Thanks!
On Fri, May 23, 2025 at 05:13:35PM +0800, Chengming Zhou wrote:
> On 2025/5/23 15:56, Aaron Lu wrote:
> > On Fri, May 23, 2025 at 10:43:53AM +0800, Chengming Zhou wrote:
> > > On 2025/5/20 18:41, Aaron Lu wrote:
> > > > On task group change, for tasks whose on_rq equals to TASK_ON_RQ_QUEUED,
> > > > core will dequeue it and then requeued it.
> > > >
> > > > The throttled task is still considered as queued by core because p->on_rq
> > > > is still set so core will dequeue it, but since the task is already
> > > > dequeued on throttle in fair, handle this case properly.
> > > >
> > > > Affinity and sched class change is similar.
> > >
> > > How about setting p->on_rq to 0 when throttled? which is the fact that
> > > the task is not on cfs queue anymore, does this method cause any problem?
> > >
> >
> > On task group change/affinity change etc. if the throttled task is
> > regarded as !on_rq, then it will miss the chance to be enqueued to the
> > new(and correct) cfs_rqs, instead, it will be enqueued back to its
> > original cfs_rq on unthrottle which breaks affinity or task group
>
> Yeah, this is indeed a problem, I was thinking to delete the throttled task
> from the cfs_rq limbo list, then add it to another cfs_rq limbo list or cfs_rq
> runnable tree based on the new cfs_rq's throttle status.
That only works when the task is still handled by fair :)
>
> But it's much complex compared with your current method.
>
> > settings. We may be able to do something in tg_unthrottle_up() to take
> > special care of these situations, but it seems a lot of headaches.
> >
> > Also, for task group change, if the new task group does not have throttle
> > setting, that throttled task should be allowed to run immediately instead
> > of waiting for its old cfs_rq's unthrottle event. Similar is true when
> > this throttled task changed its sched class, like from fair to rt.
> >
> > Makes sense?
>
> Ok, the another problem of the current method I can think of is the PELT maintenance,
> we skip the actual dequeue_task_fair() process, which includes PELT detach, we just
> delete it from the cfs_rq limbo list, so it can result in PELT maintenance error.
>
There are corresponding callbacks that handle this, e.g. for task group
change, there is task_change_group_fair() that handles the PELT detach;
for affinity change, migrate_task_rq_fair() does that; and for sched
class change, switched_from/to() does that.

Or did I miss anything?
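For reference, the group-change callback in mainline looks roughly like
this (paraphrased from kernel/sched/fair.c; details vary by kernel
version):

        static void task_change_group_fair(struct task_struct *p)
        {
                /* a freshly forked task has nothing attached yet */
                if (READ_ONCE(p->__state) == TASK_NEW)
                        return;

                detach_task_cfs_rq(p);  /* PELT detach from the old cfs_rq */
                /* make the later attach treat the task as freshly migrated */
                p->se.avg.last_update_time = 0;
                set_task_rq(p, task_cpu(p));
                attach_task_cfs_rq(p);  /* PELT attach to the new cfs_rq */
        }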
Thanks,
Aaron
On 2025/5/23 17:42, Aaron Lu wrote:
> On Fri, May 23, 2025 at 05:13:35PM +0800, Chengming Zhou wrote:
>> On 2025/5/23 15:56, Aaron Lu wrote:
>>> On Fri, May 23, 2025 at 10:43:53AM +0800, Chengming Zhou wrote:
>>>> On 2025/5/20 18:41, Aaron Lu wrote:
>>>>> On task group change, for tasks whose on_rq equals to TASK_ON_RQ_QUEUED,
>>>>> core will dequeue it and then requeued it.
>>>>>
>>>>> The throttled task is still considered as queued by core because p->on_rq
>>>>> is still set so core will dequeue it, but since the task is already
>>>>> dequeued on throttle in fair, handle this case properly.
>>>>>
>>>>> Affinity and sched class change is similar.
>>>>
>>>> How about setting p->on_rq to 0 when throttled? which is the fact that
>>>> the task is not on cfs queue anymore, does this method cause any problem?
>>>>
>>>
>>> On task group change/affinity change etc. if the throttled task is
>>> regarded as !on_rq, then it will miss the chance to be enqueued to the
>>> new(and correct) cfs_rqs, instead, it will be enqueued back to its
>>> original cfs_rq on unthrottle which breaks affinity or task group
>>
>> Yeah, this is indeed a problem, I was thinking to delete the throttled task
>> from the cfs_rq limbo list, then add it to another cfs_rq limbo list or cfs_rq
>> runnable tree based on the new cfs_rq's throttle status.
>
> Only work when the task is still handled by fair :)
>
>>
>> But it's much complex compared with your current method.
>>
>>> settings. We may be able to do something in tg_unthrottle_up() to take
>>> special care of these situations, but it seems a lot of headaches.
>>>
>>> Also, for task group change, if the new task group does not have throttle
>>> setting, that throttled task should be allowed to run immediately instead
>>> of waiting for its old cfs_rq's unthrottle event. Similar is true when
>>> this throttled task changed its sched class, like from fair to rt.
>>>
>>> Makes sense?
>>
>> Ok, the another problem of the current method I can think of is the PELT maintenance,
>> we skip the actual dequeue_task_fair() process, which includes PELT detach, we just
>> delete it from the cfs_rq limbo list, so it can result in PELT maintenance error.
>>
>
> There are corresponding callbacks that handle this, e.g. for task group
> change, there is task_change_group_fair() that handles PELT detach; for
> affinity change, there is migrate_task_rq_fair() does that and for sched
> class change, there is switched_from/to() does that.
>
> Or do I miss anything?
migrate_task_rq_fair() only does it when !task_on_rq_migrating(p), i.e.
the wakeup-migrate case, because we already do the detach in
dequeue_task_fair() for on_rq task migration...
You can see the DO_DETACH flag in update_load_avg() called from dequeue_entity().
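The detail being referred to is roughly this part of mainline
dequeue_entity() (paraphrased, not the exact code):

        static void
        dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        {
                int action = UPDATE_TG;

                /* an on_rq task being migrated gets its PELT detached here */
                if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
                        action |= DO_DETACH;

                update_load_avg(cfs_rq, se, action);
                /* ... rest of the dequeue ... */
        }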
Thanks!
On Fri, May 23, 2025 at 05:53:55PM +0800, Chengming Zhou wrote:
> On 2025/5/23 17:42, Aaron Lu wrote:
> > On Fri, May 23, 2025 at 05:13:35PM +0800, Chengming Zhou wrote:
> > > On 2025/5/23 15:56, Aaron Lu wrote:
> > > > On Fri, May 23, 2025 at 10:43:53AM +0800, Chengming Zhou wrote:
> > > > > On 2025/5/20 18:41, Aaron Lu wrote:
> > > > > > On task group change, for tasks whose on_rq equals to TASK_ON_RQ_QUEUED,
> > > > > > core will dequeue it and then requeued it.
> > > > > >
> > > > > > The throttled task is still considered as queued by core because p->on_rq
> > > > > > is still set so core will dequeue it, but since the task is already
> > > > > > dequeued on throttle in fair, handle this case properly.
> > > > > >
> > > > > > Affinity and sched class change is similar.
> > > > >
> > > > > How about setting p->on_rq to 0 when throttled? which is the fact that
> > > > > the task is not on cfs queue anymore, does this method cause any problem?
> > > > >
> > > >
> > > > On task group change/affinity change etc. if the throttled task is
> > > > regarded as !on_rq, then it will miss the chance to be enqueued to the
> > > > new(and correct) cfs_rqs, instead, it will be enqueued back to its
> > > > original cfs_rq on unthrottle which breaks affinity or task group
> > >
> > > Yeah, this is indeed a problem, I was thinking to delete the throttled task
> > > from the cfs_rq limbo list, then add it to another cfs_rq limbo list or cfs_rq
> > > runnable tree based on the new cfs_rq's throttle status.
> >
> > Only work when the task is still handled by fair :)
> >
> > >
> > > But it's much complex compared with your current method.
> > >
> > > > settings. We may be able to do something in tg_unthrottle_up() to take
> > > > special care of these situations, but it seems a lot of headaches.
> > > >
> > > > Also, for task group change, if the new task group does not have throttle
> > > > setting, that throttled task should be allowed to run immediately instead
> > > > of waiting for its old cfs_rq's unthrottle event. Similar is true when
> > > > this throttled task changed its sched class, like from fair to rt.
> > > >
> > > > Makes sense?
> > >
> > > Ok, the another problem of the current method I can think of is the PELT maintenance,
> > > we skip the actual dequeue_task_fair() process, which includes PELT detach, we just
> > > delete it from the cfs_rq limbo list, so it can result in PELT maintenance error.
> > >
> >
> > There are corresponding callbacks that handle this, e.g. for task group
> > change, there is task_change_group_fair() that handles PELT detach; for
> > affinity change, there is migrate_task_rq_fair() does that and for sched
> > class change, there is switched_from/to() does that.
> >
> > Or do I miss anything?
>
> migrate_task_rq_fair() only do it when !task_on_rq_migrating(p), which is wakeup migrate,
> because we already do detach in dequeue_task_fair() for on_rq task migration...
> You can see the DO_DETACH flag in update_load_avg() called from dequeue_entity().
>
Understood, thanks for catching this!
So the code was initially developed on top of v5.15 and there is a
detach in migrate_task_rq_fair():
if (p->on_rq == TASK_ON_RQ_MIGRATING) {
/*
* In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
* rq->lock and can modify state directly.
*/
lockdep_assert_rq_held(task_rq(p));
detach_entity_cfs_rq(&p->se);
}
But it looks like that's gone now, removed by commit e1f078f50478
("sched/fair: Combine detach into dequeue when migrating task"), and I
failed to notice this detail...

Anyway, the task is already dequeued without TASK_ON_RQ_MIGRATING being
set when throttled, and it can't be dequeued again, so perhaps something
like the below could cure this situation? (Just to illustrate the idea,
not even compile tested.)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 89afa472299b7..dc2e9a6bf3fd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5868,6 +5868,9 @@ static void dequeue_throttled_task(struct task_struct *p, int flags)
WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
list_del_init(&p->throttle_node);
+
+ if (task_on_rq_migrating(p))
+ detach_task_cfs_rq(p);
}
On 2025/5/23 19:59, Aaron Lu wrote:
> On Fri, May 23, 2025 at 05:53:55PM +0800, Chengming Zhou wrote:
>> On 2025/5/23 17:42, Aaron Lu wrote:
>>> On Fri, May 23, 2025 at 05:13:35PM +0800, Chengming Zhou wrote:
>>>> On 2025/5/23 15:56, Aaron Lu wrote:
>>>>> On Fri, May 23, 2025 at 10:43:53AM +0800, Chengming Zhou wrote:
>>>>>> On 2025/5/20 18:41, Aaron Lu wrote:
>>>>>>> On task group change, for tasks whose on_rq equals to TASK_ON_RQ_QUEUED,
>>>>>>> core will dequeue it and then requeued it.
>>>>>>>
>>>>>>> The throttled task is still considered as queued by core because p->on_rq
>>>>>>> is still set so core will dequeue it, but since the task is already
>>>>>>> dequeued on throttle in fair, handle this case properly.
>>>>>>>
>>>>>>> Affinity and sched class change is similar.
>>>>>>
>>>>>> How about setting p->on_rq to 0 when throttled? which is the fact that
>>>>>> the task is not on cfs queue anymore, does this method cause any problem?
>>>>>>
>>>>>
>>>>> On task group change/affinity change etc. if the throttled task is
>>>>> regarded as !on_rq, then it will miss the chance to be enqueued to the
>>>>> new(and correct) cfs_rqs, instead, it will be enqueued back to its
>>>>> original cfs_rq on unthrottle which breaks affinity or task group
>>>>
>>>> Yeah, this is indeed a problem, I was thinking to delete the throttled task
>>>> from the cfs_rq limbo list, then add it to another cfs_rq limbo list or cfs_rq
>>>> runnable tree based on the new cfs_rq's throttle status.
>>>
>>> Only work when the task is still handled by fair :)
>>>
>>>>
>>>> But it's much complex compared with your current method.
>>>>
>>>>> settings. We may be able to do something in tg_unthrottle_up() to take
>>>>> special care of these situations, but it seems a lot of headaches.
>>>>>
>>>>> Also, for task group change, if the new task group does not have throttle
>>>>> setting, that throttled task should be allowed to run immediately instead
>>>>> of waiting for its old cfs_rq's unthrottle event. Similar is true when
>>>>> this throttled task changed its sched class, like from fair to rt.
>>>>>
>>>>> Makes sense?
>>>>
>>>> Ok, the another problem of the current method I can think of is the PELT maintenance,
>>>> we skip the actual dequeue_task_fair() process, which includes PELT detach, we just
>>>> delete it from the cfs_rq limbo list, so it can result in PELT maintenance error.
>>>>
>>>
>>> There are corresponding callbacks that handle this, e.g. for task group
>>> change, there is task_change_group_fair() that handles PELT detach; for
>>> affinity change, there is migrate_task_rq_fair() does that and for sched
>>> class change, there is switched_from/to() does that.
>>>
>>> Or do I miss anything?
>>
>> migrate_task_rq_fair() only do it when !task_on_rq_migrating(p), which is wakeup migrate,
>> because we already do detach in dequeue_task_fair() for on_rq task migration...
>> You can see the DO_DETACH flag in update_load_avg() called from dequeue_entity().
>>
>
> Understood, thanks for catching this!
>
> So the code was initially developed on top of v5.15 and there is a
> detach in migrate_task_rq_fair():
>
> if (p->on_rq == TASK_ON_RQ_MIGRATING) {
> /*
> * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
> * rq->lock and can modify state directly.
> */
> lockdep_assert_rq_held(task_rq(p));
> detach_entity_cfs_rq(&p->se);
> }
>
> But looks like it's gone now by commit e1f078f50478("sched/fair: Combine
> detach into dequeue when migrating task") and I failed to notice this
> detail...
Yeah..
>
> Anyway, the task is already dequeued without TASK_ON_RQ_MIGRATING being
> set when throttled and it can't be dequeued again, so perhaps something
> like below could cure this situation?(just to illustrate the idea, not
> even compile tested)
Ok, seems reasonable to me!
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 89afa472299b7..dc2e9a6bf3fd7 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5868,6 +5868,9 @@ static void dequeue_throttled_task(struct task_struct *p, int flags)
> WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
>
> list_del_init(&p->throttle_node);
> +
> + if (task_on_rq_migrating(p))
> + detach_task_cfs_rq(p);
> }
>
>
On Tue, May 20, 2025 at 06:41:07PM +0800, Aaron Lu wrote:
> On task group change, for tasks whose on_rq equals to TASK_ON_RQ_QUEUED,
> core will dequeue it and then requeued it.
>
> The throttled task is still considered as queued by core because p->on_rq
> is still set so core will dequeue it, but since the task is already
> dequeued on throttle in fair, handle this case properly.
>
> Affinity and sched class change is similar.
>
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> ---
> kernel/sched/fair.c | 24 ++++++++++++++++++++++++
> 1 file changed, 24 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 74bc320cbc238..4c66fd8d24389 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5866,6 +5866,10 @@ static void throttle_cfs_rq_work(struct callback_head *work)
> update_rq_clock(rq);
> WARN_ON_ONCE(!list_empty(&p->throttle_node));
> dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> + /*
> + * Must not add it to limbo list before dequeue or dequeue will
> + * mistakenly regard this task as an already throttled one.
> + */
> list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> resched_curr(rq);
> }
> @@ -5881,6 +5885,20 @@ void init_cfs_throttle_work(struct task_struct *p)
> INIT_LIST_HEAD(&p->throttle_node);
> }
>
> +static void dequeue_throttled_task(struct task_struct *p, int flags)
> +{
> + /*
> + * Task is throttled and someone wants to dequeue it again:
> + * it must be sched/core when core needs to do things like
> + * task affinity change, task group change, task sched class
> + * change etc.
> + */
> + WARN_ON_ONCE(p->se.on_rq);
> + WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
> +
> + list_del_init(&p->throttle_node);
> +}
> +
> static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static int tg_unthrottle_up(struct task_group *tg, void *data)
> {
> @@ -6834,6 +6852,7 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
> static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> static void task_throttle_setup_work(struct task_struct *p) {}
> static bool task_is_throttled(struct task_struct *p) { return false; }
> +static void dequeue_throttled_task(struct task_struct *p, int flags) {}
>
> static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> {
> @@ -7281,6 +7300,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> */
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> {
> + if (unlikely(task_is_throttled(p))) {
> + dequeue_throttled_task(p, flags);
> + return true;
> + }
> +
> if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
> util_est_dequeue(&rq->cfs, p);
This is asymmetric -- dequeue removes it from that throttle list, but
the corresponding enqueue will not add it back, what gives?
Because now we have:
p->on_rq=1
p->throttle_node on list
move_queued_task()
deactivate_task()
dequeue_task_fair()
list_del_init(throttle_node)
p->on_rq = 2
activate_task()
enqueue_task_fair()
// nothing special, makes the thing runnable
p->on_rq = 1;
and we exit with a task that is on-rq and not throttled ?!?
Why is this? Are we relying on pick_task_fair() to dequeue it again and
fix up our inconsistencies? If so, that had better have a comment on.
On Thu, May 22, 2025 at 02:03:36PM +0200, Peter Zijlstra wrote:
> On Tue, May 20, 2025 at 06:41:07PM +0800, Aaron Lu wrote:
> > On task group change, for tasks whose on_rq equals to TASK_ON_RQ_QUEUED,
> > core will dequeue it and then requeued it.
> >
> > The throttled task is still considered as queued by core because p->on_rq
> > is still set so core will dequeue it, but since the task is already
> > dequeued on throttle in fair, handle this case properly.
> >
> > Affinity and sched class change is similar.
> >
> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> > ---
> > kernel/sched/fair.c | 24 ++++++++++++++++++++++++
> > 1 file changed, 24 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 74bc320cbc238..4c66fd8d24389 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5866,6 +5866,10 @@ static void throttle_cfs_rq_work(struct callback_head *work)
> > update_rq_clock(rq);
> > WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > + /*
> > + * Must not add it to limbo list before dequeue or dequeue will
> > + * mistakenly regard this task as an already throttled one.
> > + */
> > list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > resched_curr(rq);
> > }
> > @@ -5881,6 +5885,20 @@ void init_cfs_throttle_work(struct task_struct *p)
> > INIT_LIST_HEAD(&p->throttle_node);
> > }
> >
> > +static void dequeue_throttled_task(struct task_struct *p, int flags)
> > +{
> > + /*
> > + * Task is throttled and someone wants to dequeue it again:
> > + * it must be sched/core when core needs to do things like
> > + * task affinity change, task group change, task sched class
> > + * change etc.
> > + */
> > + WARN_ON_ONCE(p->se.on_rq);
> > + WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
> > +
> > + list_del_init(&p->throttle_node);
> > +}
> > +
> > static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> > static int tg_unthrottle_up(struct task_group *tg, void *data)
> > {
> > @@ -6834,6 +6852,7 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
> > static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> > static void task_throttle_setup_work(struct task_struct *p) {}
> > static bool task_is_throttled(struct task_struct *p) { return false; }
> > +static void dequeue_throttled_task(struct task_struct *p, int flags) {}
> >
> > static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> > {
> > @@ -7281,6 +7300,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> > */
> > static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > {
> > + if (unlikely(task_is_throttled(p))) {
> > + dequeue_throttled_task(p, flags);
> > + return true;
> > + }
> > +
> > if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
> > util_est_dequeue(&rq->cfs, p);
>
> This is asymmetric -- dequeue removes it from that throttle list, but
> the corresponding enqueue will not add it back, what gives?
>
> Because now we have:
>
> p->on_rq=1
> p->throttle_node on list
>
> move_queued_task()
> deactivate_task()
> dequeue_task_fair()
> list_del_init(throttle_node)
> p->on_rq = 2
>
> activate_task()
> enqueue_task_fair()
> // nothing special, makes the thing runnable
> p->on_rq = 1;
>
> and we exit with a task that is on-rq and not throttled ?!?
>
> Why is this? Are we relying on pick_task_fair() to dequeue it again and
> fix up our inconsistencies? If so, that had better have a comment on.
Correct.
Does the following comment look OK?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 89afa472299b7..4f4d64cf31fb1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7147,6 +7147,10 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
if (unlikely(task_is_throttled(p))) {
+ /*
+ * Task migrated to new rq will have its throttle work
+ * added if necessary in pick time.
+ */
dequeue_throttled_task(p, flags);
return true;
}
On Thu, May 22, 2025 at 08:49:43PM +0800, Aaron Lu wrote:
> On Thu, May 22, 2025 at 02:03:36PM +0200, Peter Zijlstra wrote:
> > This is asymmetric -- dequeue removes it from that throttle list, but
> > the corresponding enqueue will not add it back, what gives?
> >
> > Because now we have:
> >
> > p->on_rq=1
> > p->throttle_node on list
> >
> > move_queued_task()
> > deactivate_task()
> > dequeue_task_fair()
> > list_del_init(throttle_node)
> > p->on_rq = 2
> >
> > activate_task()
> > enqueue_task_fair()
> > // nothing special, makes the thing runnable
> > p->on_rq = 1;
> >
> > and we exit with a task that is on-rq and not throttled ?!?
> >
> > Why is this? Are we relying on pick_task_fair() to dequeue it again and
> > fix up our inconsistencies? If so, that had better have a comment on.
>
> Correct.

But would it not be better to have enqueue bail when we're trying to
enqueue an already throttled task into a throttled cfs_rq?

It seems a waste to do the actual enqueue, pick, dequeue when we
could've just avoided all that.

The immediate problem seems to be that you destroy the
task_is_throttled() state on dequeue, but surely that is trivially
fixable by not keeping that state in the list.
On Fri, May 23, 2025 at 04:59:42PM +0200, Peter Zijlstra wrote:
> On Thu, May 22, 2025 at 08:49:43PM +0800, Aaron Lu wrote:
> > On Thu, May 22, 2025 at 02:03:36PM +0200, Peter Zijlstra wrote:
>
> > > This is asymmetric -- dequeue removes it from that throttle list, but
> > > the corresponding enqueue will not add it back, what gives?
> > >
> > > Because now we have:
> > >
> > > p->on_rq=1
> > > p->throttle_node on list
> > >
> > > move_queued_task()
> > > deactivate_task()
> > > dequeue_task_fair()
> > > list_del_init(throttle_node)
> > > p->on_rq = 2
> > >
> > > activate_task()
> > > enqueue_task_fair()
> > > // nothing special, makes the thing runnable
> > > p->on_rq = 1;
> > >
> > > and we exit with a task that is on-rq and not throttled ?!?
> > >
> > > Why is this? Are we relying on pick_task_fair() to dequeue it again and
> > > fix up our inconsistencies? If so, that had better have a comment on.
>
> > Correct.
>
> But would it not be better to have enqueue bail when we're trying to
> enqueue an already throttled task into a throttled cfs_rq?
>
> It seems a waste to do the actual enqueue, pick, dequeue when we
> could've just avoided all that.
>

The original idea is to keep code simple but surely this can be
optimized. I'm working on it and will paste diff here once I get it
work.

Thanks,
Aaron

> The immediate problem seems to be that you destroy the
> task_is_throttled() state on dequeue, but surely that is trivially
> fixable by not keeping that state in the list.
On Mon, May 26, 2025 at 07:36:50PM +0800, Aaron Lu wrote:
> On Fri, May 23, 2025 at 04:59:42PM +0200, Peter Zijlstra wrote:
> > On Thu, May 22, 2025 at 08:49:43PM +0800, Aaron Lu wrote:
> > > On Thu, May 22, 2025 at 02:03:36PM +0200, Peter Zijlstra wrote:
> >
> > > > This is asymmetric -- dequeue removes it from that throttle list, but
> > > > the corresponding enqueue will not add it back, what gives?
> > > >
> > > > Because now we have:
> > > >
> > > > p->on_rq=1
> > > > p->throttle_node on list
> > > >
> > > > move_queued_task()
> > > > deactivate_task()
> > > > dequeue_task_fair()
> > > > list_del_init(throttle_node)
> > > > p->on_rq = 2
> > > >
> > > > activate_task()
> > > > enqueue_task_fair()
> > > > // nothing special, makes the thing runnable
> > > > p->on_rq = 1;
> > > >
> > > > and we exit with a task that is on-rq and not throttled ?!?
> > > >
> > > > Why is this? Are we relying on pick_task_fair() to dequeue it again and
> > > > fix up our inconsistencies? If so, that had better have a comment on.
> > >
> > > Correct.
> >
> > But would it not be better to have enqueue bail when we're trying to
> > enqueue an already throttled task into a throttled cfs_rq?
> >
> > It seems a waste to do the actual enqueue, pick, dequeue when we
> > could've just avoided all that.
> >
>
> The original idea is to keep code simple but surely this can be
> optimized. I'm working on it and will paste diff here once I get it
> work.
>
I tried below diff on top of this series:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 055f3782eeaee..1c5d7c4ff6652 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -882,6 +882,7 @@ struct task_struct {
#ifdef CONFIG_CFS_BANDWIDTH
struct callback_head sched_throttle_work;
struct list_head throttle_node;
+ bool throttled;
#endif
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 89afa472299b7..c585a12f2c753 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5798,7 +5798,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
static inline bool task_is_throttled(struct task_struct *p)
{
- return !list_empty(&p->throttle_node);
+ return p->throttled;
}
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
@@ -5842,6 +5842,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
* mistakenly regard this task as an already throttled one.
*/
list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ p->throttled = true;
resched_curr(rq);
}
@@ -5870,6 +5871,22 @@ static void dequeue_throttled_task(struct task_struct *p, int flags)
list_del_init(&p->throttle_node);
}
+/* return true to skip actual enqueue */
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+ if (throttled_hierarchy(cfs_rq)) {
+ /* throttled task move across task groups/rqs. */
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
+
+ /* unthrottle */
+ p->throttled = false;
+ return false;
+}
+
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
@@ -6714,6 +6731,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void task_throttle_setup_work(struct task_struct *p) {}
static bool task_is_throttled(struct task_struct *p) { return false; }
static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -6907,6 +6925,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (unlikely(task_is_throttled(p) && enqueue_throttled_task(p)))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6917,7 +6938,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
util_est_enqueue(&rq->cfs, p);
if (flags & ENQUEUE_DELAYED) {
- WARN_ON_ONCE(task_is_throttled(p));
requeue_delayed_entity(se);
return;
}
But I got a list corruption issue on se->group_node. After some
debugging, it turns out the following situation can happen and leave a
throttled task's se.group_node on rq->cfs_tasks: the task is returning
to user and has run its throttle task work, while another cpu moves it
to a new group whose cfs_rq is also throttled:
cpuX                                    cpuY
taskA ret2user
throttle_cfs_rq_work()                  sched_move_task(taskA)
  task_rq_lock acquired
  dequeue_task_fair(taskA)
  task_rq_lock released
                                        task_rq_lock acquired
                                        task_current_donor(taskA) == true
                                        task_on_rq_queued(taskA) == true
                                        dequeue_task(taskA)
                                        put_prev_task(taskA)
                                        sched_change_group()
                                        enqueue_task(taskA) -> taskA's new cfs_rq
                                                               is throttled, go the
                                                               fast path and skip
                                                               actual enqueue
                                        set_next_task(taskA)
                                          __set_next_task_fair(taskA)
                                            list_move(&se->group_node, &rq->cfs_tasks); // bug
schedule()
(The current series does not have the problem because it always did an
actual enqueue.)
I think this can be trivially fixed by checking in
enqueue_throttled_task() whether the task is the current one and, if so,
not taking the fast path but doing an actual enqueue, like below. I've
tested it and have not found any problem so far. Thoughts?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c585a12f2c753..f9de7df44e968 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5876,7 +5876,8 @@ static bool enqueue_throttled_task(struct task_struct *p)
{
struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
- if (throttled_hierarchy(cfs_rq)) {
+ if (throttled_hierarchy(cfs_rq) &&
+ !task_current_donor(rq_of(cfs_rq), p)) {
/* throttled task move across task groups/rqs. */
list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
return true;
Hello Aaron,
On 5/27/2025 12:28 PM, Aaron Lu wrote:
> On Mon, May 26, 2025 at 07:36:50PM +0800, Aaron Lu wrote:
>> On Fri, May 23, 2025 at 04:59:42PM +0200, Peter Zijlstra wrote:
>>> On Thu, May 22, 2025 at 08:49:43PM +0800, Aaron Lu wrote:
>>>> On Thu, May 22, 2025 at 02:03:36PM +0200, Peter Zijlstra wrote:
>>>
>>>>> This is asymmetric -- dequeue removes it from that throttle list, but
>>>>> the corresponding enqueue will not add it back, what gives?
>>>>>
>>>>> Because now we have:
>>>>>
>>>>> p->on_rq=1
>>>>> p->throttle_node on list
>>>>>
>>>>> move_queued_task()
>>>>> deactivate_task()
>>>>> dequeue_task_fair()
>>>>> list_del_init(throttle_node)
>>>>> p->on_rq = 2
>>>>>
>>>>> activate_task()
>>>>> enqueue_task_fair()
>>>>> // nothing special, makes the thing runnable
>>>>> p->on_rq = 1;
>>>>>
>>>>> and we exit with a task that is on-rq and not throttled ?!?
>>>>>
>>>>> Why is this? Are we relying on pick_task_fair() to dequeue it again and
>>>>> fix up our inconsistencies? If so, that had better have a comment on.
>>>>
>>>> Correct.
>>>
>>> But would it not be better to have enqueue bail when we're trying to
>>> enqueue an already throttled task into a throttled cfs_rq?
>>>
>>> It seems a waste to do the actual enqueue, pick, dequeue when we
>>> could've just avoided all that.
>>>
>>
>> The original idea is to keep code simple but surely this can be
>> optimized. I'm working on it and will paste diff here once I get it
>> work.
>>
>
> I tried below diff on top of this series:
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 055f3782eeaee..1c5d7c4ff6652 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -882,6 +882,7 @@ struct task_struct {
> #ifdef CONFIG_CFS_BANDWIDTH
> struct callback_head sched_throttle_work;
> struct list_head throttle_node;
> + bool throttled;
> #endif
> #endif
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 89afa472299b7..c585a12f2c753 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5798,7 +5798,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
>
> static inline bool task_is_throttled(struct task_struct *p)
> {
> - return !list_empty(&p->throttle_node);
> + return p->throttled;
> }
>
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> @@ -5842,6 +5842,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
> * mistakenly regard this task as an already throttled one.
> */
> list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> + p->throttled = true;
> resched_curr(rq);
> }
Since we now have an official per-task throttle indicator, what are your
thoughts on reusing "p->se.group_node" for throttled_limbo_list?
Something like this lightly tested diff based on your suggestion:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11eb0612e22d..f9fdcf812e81 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -578,6 +578,9 @@ struct sched_entity {
unsigned char sched_delayed;
unsigned char rel_deadline;
unsigned char custom_slice;
+#ifdef CONFIG_CFS_BANDWIDTH
+ unsigned char sched_throttled;
+#endif
/* hole */
u64 exec_start;
@@ -881,7 +884,6 @@ struct task_struct {
struct task_group *sched_task_group;
#ifdef CONFIG_CFS_BANDWIDTH
struct callback_head sched_throttle_work;
- struct list_head throttle_node;
#endif
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25e794ea0283..b1cb05baf8d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5798,7 +5798,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
static inline bool task_is_throttled(struct task_struct *p)
{
- return !list_empty(&p->throttle_node);
+ return !!p->se.sched_throttled;
}
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
@@ -5835,13 +5835,14 @@ static void throttle_cfs_rq_work(struct callback_head *work)
return;
rq = scope.rq;
update_rq_clock(rq);
- WARN_ON_ONCE(!list_empty(&p->throttle_node));
+ WARN_ON_ONCE(p->se.sched_throttled);
dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
/*
- * Must not add it to limbo list before dequeue or dequeue will
+ * Must not mark throttled before dequeue or dequeue will
* mistakenly regard this task as an already throttled one.
*/
- list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ p->se.sched_throttled = 1;
+ list_add(&p->se.group_node, &cfs_rq->throttled_limbo_list);
resched_curr(rq);
}
@@ -5853,7 +5854,6 @@ void init_cfs_throttle_work(struct task_struct *p)
init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
/* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
p->sched_throttle_work.next = &p->sched_throttle_work;
- INIT_LIST_HEAD(&p->throttle_node);
}
static void dequeue_throttled_task(struct task_struct *p, int flags)
@@ -5864,10 +5864,26 @@ static void dequeue_throttled_task(struct task_struct *p, int flags)
* task affinity change, task group change, task sched class
* change etc.
*/
- WARN_ON_ONCE(p->se.on_rq);
+ WARN_ON_ONCE(!p->se.sched_throttled);
WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
- list_del_init(&p->throttle_node);
+ list_del_init(&p->se.group_node);
+}
+
+/* return true to skip actual enqueue */
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+ if (throttled_hierarchy(cfs_rq)) {
+ /* throttled task move across task groups/rqs. */
+ list_add(&p->se.group_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
+
+ /* unthrottle */
+ p->se.sched_throttled = 0;
+ return false;
}
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
@@ -5896,8 +5912,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
}
/* Re-enqueue the tasks that have been throttled at this level. */
- list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
- list_del_init(&p->throttle_node);
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, se.group_node) {
+ WARN_ON_ONCE(!p->se.sched_throttled);
+
+ p->se.sched_throttled = 0;
+ list_del_init(&p->se.group_node);
enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
@@ -6714,6 +6733,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void task_throttle_setup_work(struct task_struct *p) {}
static bool task_is_throttled(struct task_struct *p) { return false; }
static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -6907,6 +6927,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (unlikely(task_is_throttled(p) && enqueue_throttled_task(p)))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -13244,7 +13267,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
struct sched_entity *se = &p->se;
#ifdef CONFIG_SMP
- if (task_on_rq_queued(p)) {
+ if (task_on_rq_queued(p) && !task_is_throttled(p)) {
/*
* Move the next running task to the front of the list, so our
* cfs_tasks list becomes MRU one.
--
Thanks and Regards,
Prateek
Hi Prateek,

On Tue, May 27, 2025 at 04:49:36PM +0530, K Prateek Nayak wrote:
... ...
> Since we now have an official per-task throttle indicator, what are your
> thoughts on reusing "p->se.group_node" for throttled_limbo_list?
>

I'm not sure. I can easily get confused when I see se.group_node and
thought it was something related with rq->cfs_tasks :) Maybe using a
union could make it look better?

Anyway, if space is a concern then this is a good way to do it, thanks
for the suggestion. I'll leave it to Peter to decide.

Best wishes,
Aaron
On 5/27/2025 5:24 PM, Aaron Lu wrote:
> Hi Prateek,
>
> On Tue, May 27, 2025 at 04:49:36PM +0530, K Prateek Nayak wrote:
> ... ...
>> Since we now have an official per-task throttle indicator, what are your
>> thoughts on reusing "p->se.group_node" for throttled_limbo_list?
>>
>
> I'm not sure. I can easily get confused when I see se.group_node and
> thought it was something related with rq->cfs_tasks :) Maybe using a
> union could make it look better?
>
> Anyway, if space is a concern then this is a good way to do it, thanks
> for the suggestion. I'll leave it to Peter to decide.

Ack! Was just trying something out. I don't think space is actually a
worry (yet!) looking at the amount of members behind CONFIG_CFS_BANDWIDTH :)

Union is a good idea but if space is not a concern, it is great as is.

--
Thanks and Regards,
Prateek

> Best wishes,
> Aaron