[PATCH v3 4/5] sched/fair: Task based throttle time accounting

Posted by Aaron Lu 2 months, 3 weeks ago
With the task based throttle model, the previous way of checking a
cfs_rq's nr_queued to decide whether throttled time should be accounted
no longer works as expected: e.g. when a cfs_rq that has a single task
is throttled, that task could later block in kernel mode instead of
being dequeued onto the limbo list, and accounting that as throttled
time is not accurate.

Rework throttle time accounting for a cfs_rq as follows:
- start accounting when the first task gets throttled in its hierarchy;
- stop accounting on unthrottle.
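
In other words, the accounting points move roughly as follows (a sketch
using names from the diff below, not literal code; the unthrottle side
is pre-existing behaviour and is not touched by this patch):

	throttle_cfs_rq()                     /* quota exhausted */
	    cfs_rq->throttled = 1;            /* no throttled_clock recorded here anymore */

	throttle_cfs_rq_work()                /* a marked task exits the kernel */
	    dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE)
	        dequeue_entities()
	            record_throttle_clock(cfs_rq)   /* start: set throttled_clock{,_self} once */

	unthrottle_cfs_rq()                   /* quota replenished */
	    cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock   /* stop */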

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # accounting mechanism
Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com> # simplify implementation
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
 kernel/sched/fair.c  | 56 ++++++++++++++++++++++++--------------------
 kernel/sched/sched.h |  1 +
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0eeea7f2e693d..6f534fbe89bcf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5287,19 +5287,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		check_enqueue_throttle(cfs_rq);
 		list_add_leaf_cfs_rq(cfs_rq);
 #ifdef CONFIG_CFS_BANDWIDTH
-		if (throttled_hierarchy(cfs_rq)) {
+		if (cfs_rq->pelt_clock_throttled) {
 			struct rq *rq = rq_of(cfs_rq);
 
-			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
-				cfs_rq->throttled_clock = rq_clock(rq);
-			if (!cfs_rq->throttled_clock_self)
-				cfs_rq->throttled_clock_self = rq_clock(rq);
-
-			if (cfs_rq->pelt_clock_throttled) {
-				cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
-					cfs_rq->throttled_clock_pelt;
-				cfs_rq->pelt_clock_throttled = 0;
-			}
+			cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+				cfs_rq->throttled_clock_pelt;
+			cfs_rq->pelt_clock_throttled = 0;
 		}
 #endif
 	}
@@ -5387,7 +5380,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		 * DELAY_DEQUEUE relies on spurious wakeups, special task
 		 * states must not suffer spurious wakeups, excempt them.
 		 */
-		if (flags & DEQUEUE_SPECIAL)
+		if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
 			delay = false;
 
 		WARN_ON_ONCE(delay && se->sched_delayed);
@@ -5793,7 +5786,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
 		rq = scope.rq;
 		update_rq_clock(rq);
 		WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
-		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
 		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
 		/*
 		 * Must not set throttled before dequeue or dequeue will
@@ -5948,6 +5941,17 @@ static inline void task_throttle_setup_work(struct task_struct *p)
 	task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
 }
 
+static void record_throttle_clock(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+
+	if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+		cfs_rq->throttled_clock = rq_clock(rq);
+
+	if (!cfs_rq->throttled_clock_self)
+		cfs_rq->throttled_clock_self = rq_clock(rq);
+}
+
 static int tg_throttle_down(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
@@ -5956,21 +5960,17 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	if (cfs_rq->throttle_count++)
 		return 0;
 
-
-	/* group is entering throttled state, stop time */
-	WARN_ON_ONCE(cfs_rq->throttled_clock_self);
-	if (cfs_rq->nr_queued)
-		cfs_rq->throttled_clock_self = rq_clock(rq);
-	else {
-		/*
-		 * For cfs_rqs that still have entities enqueued, PELT clock
-		 * stop happens at dequeue time when all entities are dequeued.
-		 */
+	/*
+	 * For cfs_rqs that still have entities enqueued, PELT clock
+	 * stop happens at dequeue time when all entities are dequeued.
+	 */
+	if (!cfs_rq->nr_queued) {
 		list_del_leaf_cfs_rq(cfs_rq);
 		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
 		cfs_rq->pelt_clock_throttled = 1;
 	}
 
+	WARN_ON_ONCE(cfs_rq->throttled_clock_self);
 	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
 	return 0;
 }
@@ -6013,8 +6013,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	 */
 	cfs_rq->throttled = 1;
 	WARN_ON_ONCE(cfs_rq->throttled_clock);
-	if (cfs_rq->nr_queued)
-		cfs_rq->throttled_clock = rq_clock(rq);
 	return true;
 }
 
@@ -6722,6 +6720,7 @@ static void task_throttle_setup_work(struct task_struct *p) {}
 static bool task_is_throttled(struct task_struct *p) { return false; }
 static void dequeue_throttled_task(struct task_struct *p, int flags) {}
 static bool enqueue_throttled_task(struct task_struct *p) { return false; }
+static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
@@ -7040,6 +7039,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	bool was_sched_idle = sched_idle_rq(rq);
 	bool task_sleep = flags & DEQUEUE_SLEEP;
 	bool task_delayed = flags & DEQUEUE_DELAYED;
+	bool task_throttled = flags & DEQUEUE_THROTTLE;
 	struct task_struct *p = NULL;
 	int h_nr_idle = 0;
 	int h_nr_queued = 0;
@@ -7073,6 +7073,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
 
+		if (throttled_hierarchy(cfs_rq) && task_throttled)
+			record_throttle_clock(cfs_rq);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			slice = cfs_rq_min_slice(cfs_rq);
@@ -7109,6 +7112,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
+
+		if (throttled_hierarchy(cfs_rq) && task_throttled)
+			record_throttle_clock(cfs_rq);
 	}
 
 	sub_nr_running(rq, h_nr_queued);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc697d4bf6685..dbe52e18b93a0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2326,6 +2326,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define DEQUEUE_SPECIAL		0x10
 #define DEQUEUE_MIGRATING	0x100 /* Matches ENQUEUE_MIGRATING */
 #define DEQUEUE_DELAYED		0x200 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_THROTTLE	0x800
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
-- 
2.39.5
Re: [PATCH v3 4/5] sched/fair: Task based throttle time accounting
Posted by Valentin Schneider 1 month, 2 weeks ago
On 15/07/25 15:16, Aaron Lu wrote:
> @@ -5287,19 +5287,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>               check_enqueue_throttle(cfs_rq);
>               list_add_leaf_cfs_rq(cfs_rq);
>  #ifdef CONFIG_CFS_BANDWIDTH
> -		if (throttled_hierarchy(cfs_rq)) {
> +		if (cfs_rq->pelt_clock_throttled) {
>                       struct rq *rq = rq_of(cfs_rq);
>
> -			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> -				cfs_rq->throttled_clock = rq_clock(rq);
> -			if (!cfs_rq->throttled_clock_self)
> -				cfs_rq->throttled_clock_self = rq_clock(rq);
> -
> -			if (cfs_rq->pelt_clock_throttled) {
> -				cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> -					cfs_rq->throttled_clock_pelt;
> -				cfs_rq->pelt_clock_throttled = 0;
> -			}
> +			cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> +				cfs_rq->throttled_clock_pelt;
> +			cfs_rq->pelt_clock_throttled = 0;

This is the only hunk of the patch that affects the PELT stuff; should this
have been included in patch 3 which does the rest of the PELT accounting changes?

> @@ -7073,6 +7073,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>               if (cfs_rq_is_idle(cfs_rq))
>                       h_nr_idle = h_nr_queued;
>
> +		if (throttled_hierarchy(cfs_rq) && task_throttled)
> +			record_throttle_clock(cfs_rq);
> +

Apologies if this has been discussed before.

So the throttled time (as reported by cpu.stat.local) is now accounted as
the time from which the first task in the hierarchy gets effectively
throttled - IOW the first time a task in a throttled hierarchy reaches
resume_user_mode_work() - as opposed to as soon as the hierarchy runs out
of quota.

The gap between the two shouldn't be much, but that should at the very
least be highlighted in the changelog.

AFAICT this is a purely user-facing stat; Josh/Tejun, any opinions on this?
Re: [PATCH v3 4/5] sched/fair: Task based throttle time accounting
Posted by Aaron Lu 1 month, 1 week ago
On Mon, Aug 18, 2025 at 04:57:27PM +0200, Valentin Schneider wrote:
> On 15/07/25 15:16, Aaron Lu wrote:
> > @@ -5287,19 +5287,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> >               check_enqueue_throttle(cfs_rq);
> >               list_add_leaf_cfs_rq(cfs_rq);
> >  #ifdef CONFIG_CFS_BANDWIDTH
> > -		if (throttled_hierarchy(cfs_rq)) {
> > +		if (cfs_rq->pelt_clock_throttled) {
> >                       struct rq *rq = rq_of(cfs_rq);
> >
> > -			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> > -				cfs_rq->throttled_clock = rq_clock(rq);
> > -			if (!cfs_rq->throttled_clock_self)
> > -				cfs_rq->throttled_clock_self = rq_clock(rq);
> > -
> > -			if (cfs_rq->pelt_clock_throttled) {
> > -				cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> > -					cfs_rq->throttled_clock_pelt;
> > -				cfs_rq->pelt_clock_throttled = 0;
> > -			}
> > +			cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> > +				cfs_rq->throttled_clock_pelt;
> > +			cfs_rq->pelt_clock_throttled = 0;
> 
> This is the only hunk of the patch that affects the PELT stuff; should this
> have been included in patch 3 which does the rest of the PELT accounting changes?
> 

While working on a rebase and staring at this further: the hunk that
deals with the PELT stuff is actually introduced in patch 3 and makes
no real change here, i.e. once the throttled_clock related lines are
removed, the PELT code merely moves from an inner if to an outer if
with no functional change.

I hope I've made it clearer this time; last time my brain stopped
working...
Re: [PATCH v3 4/5] sched/fair: Task based throttle time accounting
Posted by Aaron Lu 1 month, 2 weeks ago
On Mon, Aug 18, 2025 at 04:57:27PM +0200, Valentin Schneider wrote:
> On 15/07/25 15:16, Aaron Lu wrote:
> > @@ -5287,19 +5287,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> >               check_enqueue_throttle(cfs_rq);
> >               list_add_leaf_cfs_rq(cfs_rq);
> >  #ifdef CONFIG_CFS_BANDWIDTH
> > -		if (throttled_hierarchy(cfs_rq)) {
> > +		if (cfs_rq->pelt_clock_throttled) {
> >                       struct rq *rq = rq_of(cfs_rq);
> >
> > -			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> > -				cfs_rq->throttled_clock = rq_clock(rq);
> > -			if (!cfs_rq->throttled_clock_self)
> > -				cfs_rq->throttled_clock_self = rq_clock(rq);
> > -
> > -			if (cfs_rq->pelt_clock_throttled) {
> > -				cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> > -					cfs_rq->throttled_clock_pelt;
> > -				cfs_rq->pelt_clock_throttled = 0;
> > -			}
> > +			cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> > +				cfs_rq->throttled_clock_pelt;
> > +			cfs_rq->pelt_clock_throttled = 0;
> 
> This is the only hunk of the patch that affects the PELT stuff; should this
> have been included in patch 3 which does the rest of the PELT accounting changes?
> 

Yes, I think your suggestion makes sense. I'll move it to patch 3 in the
next version, thanks.

> > @@ -7073,6 +7073,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> >               if (cfs_rq_is_idle(cfs_rq))
> >                       h_nr_idle = h_nr_queued;
> >
> > +		if (throttled_hierarchy(cfs_rq) && task_throttled)
> > +			record_throttle_clock(cfs_rq);
> > +
> 
> Apologies if this has been discussed before.
> 
> So the throttled time (as reported by cpu.stat.local) is now accounted as
> the time from which the first task in the hierarchy gets effectively
> throttled - IOW the first time a task in a throttled hierarchy reaches
> resume_user_mode_work() - as opposed to as soon as the hierarchy runs out
> of quota.

Right.

> 
> The gap between the two shouldn't be much, but that should at the very
> least be highlighted in the changelog.
>

Got it, does the below added words make this clear?

    With task based throttle model, the previous way to check cfs_rq's
    nr_queued to decide if throttled time should be accounted doesn't work
    as expected, e.g. when a cfs_rq which has a single task is throttled,
    that task could later block in kernel mode instead of being dequeued on
    limbo list and account this as throttled time is not accurate.

    Rework throttle time accounting for a cfs_rq as follows:
    - start accounting when the first task gets throttled in its hierarchy;
    - stop accounting on unthrottle.

    Note that there will be a time gap between when a cfs_rq is throttled
    and when a task in its hierarchy is actually throttled. This accounting
    mechanism only started accounting in the latter case.
Re: [PATCH v3 4/5] sched/fair: Task based throttle time accounting
Posted by Michal Koutný 1 month, 1 week ago
Hello.

On Tue, Aug 19, 2025 at 05:34:27PM +0800, Aaron Lu <ziqianlu@bytedance.com> wrote:
> Got it, does the below added words make this clear?
> 
>     With task based throttle model, the previous way to check cfs_rq's
>     nr_queued to decide if throttled time should be accounted doesn't work
>     as expected, e.g. when a cfs_rq which has a single task is throttled,
>     that task could later block in kernel mode instead of being dequeued on
>     limbo list and account this as throttled time is not accurate.
> 
>     Rework throttle time accounting for a cfs_rq as follows:
>     - start accounting when the first task gets throttled in its hierarchy;
>     - stop accounting on unthrottle.
> 
>     Note that there will be a time gap between when a cfs_rq is throttled
>     and when a task in its hierarchy is actually throttled. This accounting
>     mechanism only started accounting in the latter case.

Do I understand it correctly that this rework doesn't change the
cumulative amount of throttled_time in cpu.stat.local but the value gets
updated only later?

I'd say such little shifts are OK [1]. What should be avoided is
changing the semantics so that throttled_time would scale with the
number of tasks inside the cgroup (assuming a single cfs_rq, i.e. number
of tasks on the cfs_rq).

0.02€,
Michal

[1] Maybe not even shifts -- in that case of a cfs_rq with a task, it
can manage to run in kernel almost for the whole period, so it gets
dequeued on return to userspace only to be re-enqueued when its cfs_rq
is unthrottled. It apparently escaped throttling, so the reported
throttled_time would be rightfully lower.
Re: [PATCH v3 4/5] sched/fair: Task based throttle time accounting
Posted by Aaron Lu 1 month, 1 week ago
Hi Michal,

Thanks for taking a look.

On Tue, Aug 26, 2025 at 04:10:37PM +0200, Michal Koutný wrote:
> Hello.
> 
> On Tue, Aug 19, 2025 at 05:34:27PM +0800, Aaron Lu <ziqianlu@bytedance.com> wrote:
> > Got it, does the below added words make this clear?
> > 
> >     With task based throttle model, the previous way to check cfs_rq's
> >     nr_queued to decide if throttled time should be accounted doesn't work
> >     as expected, e.g. when a cfs_rq which has a single task is throttled,
> >     that task could later block in kernel mode instead of being dequeued on
> >     limbo list and account this as throttled time is not accurate.
> > 
> >     Rework throttle time accounting for a cfs_rq as follows:
> >     - start accounting when the first task gets throttled in its hierarchy;
> >     - stop accounting on unthrottle.
> > 
> >     Note that there will be a time gap between when a cfs_rq is throttled
> >     and when a task in its hierarchy is actually throttled. This accounting
> >     mechanism only started accounting in the latter case.
> 
> Do I understand it correctly that this rework doesn't change the
> cumulative amount of throttled_time in cpu.stat.local but the value gets
> updated only later?
> 
> I'd say such little shifts are OK [1]. What should be avoided is
> changing the semantics so that throttled_time would scale with the
> number of tasks inside the cgroup (assuming a single cfs_rq, i.e. number
> of tasks on the cfs_rq).

As Valentin explained, throttled_time does not scale with the number of
tasks inside the cgroup.

> [1] Maybe not even shifts -- in that case of a cfs_rq with a task, it
> can manage to run in kernel almost for the whole period, so it gets
> dequeued on return to userspace only to be re-enqueued when its cfs_rq
> is unthrottled. It apparently escaped throttling, so the reported
> throttled_time would be rightfully lower.

Right, in this case, the throttled_time would be very small.
Re: [PATCH v3 4/5] sched/fair: Task based throttle time accounting
Posted by Valentin Schneider 1 month, 1 week ago
On 26/08/25 16:10, Michal Koutný wrote:
> Hello.
>
> On Tue, Aug 19, 2025 at 05:34:27PM +0800, Aaron Lu <ziqianlu@bytedance.com> wrote:
>> Got it, does the below added words make this clear?
>>
>>     With task based throttle model, the previous way to check cfs_rq's
>>     nr_queued to decide if throttled time should be accounted doesn't work
>>     as expected, e.g. when a cfs_rq which has a single task is throttled,
>>     that task could later block in kernel mode instead of being dequeued on
>>     limbo list and account this as throttled time is not accurate.
>>
>>     Rework throttle time accounting for a cfs_rq as follows:
>>     - start accounting when the first task gets throttled in its hierarchy;
>>     - stop accounting on unthrottle.
>>
>>     Note that there will be a time gap between when a cfs_rq is throttled
>>     and when a task in its hierarchy is actually throttled. This accounting
>>     mechanism only started accounting in the latter case.
>
> Do I understand it correctly that this rework doesn't change the
> cumulative amount of throttled_time in cpu.stat.local but the value gets
> updated only later?
>

No, so currently when a cfs_rq runs out of quota, all of its tasks
instantly get throttled; synchronously with that, we record the time at
which it got throttled and use that to report how long it was throttled
(cpu.stat.local).

What this is doing is separating running out of quota and actually
throttling the tasks. When a cfs_rq runs out of quota, we "mark" its tasks
to throttle themselves whenever they next exit the kernel. We record the
throttled time (cpu.stat.local) as the time between the first
to-be-throttled task exiting the kernel and the unthrottle/quota
replenishment.

IOW, this induces a (short) delay between a cfs_rq running out of quota
and the point where we start accounting its cumulative throttled time.

Hopefully that was somewhat clear.
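
To make the difference concrete, here is a stand-alone toy model of the
new accounting (plain userspace C, not kernel code; the timestamps are
made-up numbers and the struct is a drastic simplification of
struct cfs_rq):

	#include <stdio.h>

	struct toy_cfs_rq {
		int throttled;				/* out of quota? */
		unsigned long long throttled_clock;	/* 0 == not accounting */
		unsigned long long throttled_time;	/* what cpu.stat.local sums up */
	};

	/* Quota exhausted: tasks are only marked, no clock is started yet. */
	static void toy_quota_runs_out(struct toy_cfs_rq *cfs_rq)
	{
		cfs_rq->throttled = 1;
	}

	/* The first marked task exits the kernel: start accounting. */
	static void toy_first_task_throttled(struct toy_cfs_rq *cfs_rq, unsigned long long now)
	{
		if (cfs_rq->throttled && !cfs_rq->throttled_clock)
			cfs_rq->throttled_clock = now;
	}

	/* Quota replenished: stop accounting and fold into the total. */
	static void toy_unthrottle(struct toy_cfs_rq *cfs_rq, unsigned long long now)
	{
		if (cfs_rq->throttled_clock) {
			cfs_rq->throttled_time += now - cfs_rq->throttled_clock;
			cfs_rq->throttled_clock = 0;
		}
		cfs_rq->throttled = 0;
	}

	int main(void)
	{
		struct toy_cfs_rq cfs_rq = { 0 };

		toy_quota_runs_out(&cfs_rq);		/* t = 10: quota exhausted      */
		toy_first_task_throttled(&cfs_rq, 12);	/* t = 12: first task throttled */
		toy_unthrottle(&cfs_rq, 20);		/* t = 20: unthrottle           */

		/* Prints 8, not 10: the 2 units spent in the kernel before the
		 * first task actually throttled are no longer reported. */
		printf("throttled_time = %llu\n", cfs_rq.throttled_time);
		return 0;
	}

The old behaviour would, in this toy, amount to starting the clock from
toy_quota_runs_out() and reporting 10 instead of 8.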

> I'd say such little shifts are OK [1]. What should be avoided is
> changing the semantics so that throttled_time would scale with the
> number of tasks inside the cgroup (assuming a single cfs_rq, i.e. number
> of tasks on the cfs_rq).
>

Yeah, that's fine, we don't do that.
Re: [PATCH v3 4/5] sched/fair: Task based throttle time accounting
Posted by Valentin Schneider 1 month, 2 weeks ago
On 19/08/25 17:34, Aaron Lu wrote:
> On Mon, Aug 18, 2025 at 04:57:27PM +0200, Valentin Schneider wrote:
>> On 15/07/25 15:16, Aaron Lu wrote:
>> Apologies if this has been discussed before.
>>
>> So the throttled time (as reported by cpu.stat.local) is now accounted as
>> the time from which the first task in the hierarchy gets effectively
>> throttled - IOW the first time a task in a throttled hierarchy reaches
>> resume_user_mode_work() - as opposed to as soon as the hierarchy runs out
>> of quota.
>
> Right.
>
>>
>> The gap between the two shouldn't be much, but that should at the very
>> least be highlighted in the changelog.
>>
>
> Got it, does the below added words make this clear?
>

Yes, thank you. Small corrections below.

>     With task based throttle model, the previous way to check cfs_rq's
>     nr_queued to decide if throttled time should be accounted doesn't work
>     as expected, e.g. when a cfs_rq which has a single task is throttled,
>     that task could later block in kernel mode instead of being dequeued on
>     limbo list and account this as throttled time is not accurate.
                     ^^^^^^
                     accounting
>
>     Rework throttle time accounting for a cfs_rq as follows:
>     - start accounting when the first task gets throttled in its hierarchy;
>     - stop accounting on unthrottle.
>
>     Note that there will be a time gap between when a cfs_rq is throttled
>     and when a task in its hierarchy is actually throttled. This accounting
>     mechanism only started accounting in the latter case.
                     ^^^^^^
                     starts