From: Valentin Schneider <vschneid@redhat.com>
On unthrottle, enqueue the throttled tasks back so they can continue to
run.

Note that with task based throttling, the only throttle point is when a
task returns to user space, so as long as a task is enqueued, it is
allowed to run until it reaches that throttle point, regardless of
whether its cfs_rq is throttled.

The leaf_cfs_rq list is handled differently now: as long as a task is
enqueued on a cfs_rq, throttled or not, that cfs_rq is added to the
list; when a throttled cfs_rq has all its tasks dequeued, it is removed
from the list. This is easier to reason about, hence the choice.
[aaronlu: extracted from Valentin's original patches. I also changed the
implementation to use enqueue_task_fair() for queueing tasks back to the
unthrottled cfs_rq]
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
1 file changed, 45 insertions(+), 87 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab403ff7d53c8..4a95fe3785e43 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
struct rq *rq = rq_of(cfs_rq);
if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
cfs_rq->throttled_clock = rq_clock(rq);
if (!cfs_rq->throttled_clock_self)
cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
}
+#endif
}
}
@@ -5525,8 +5524,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+ if (throttled_hierarchy(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
return true;
}
@@ -5832,6 +5834,11 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return !list_empty(&p->throttle_node);
+}
+
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static void throttle_cfs_rq_work(struct callback_head *work)
{
@@ -5885,32 +5892,45 @@ void init_cfs_throttle_work(struct task_struct *p)
INIT_LIST_HEAD(&p->throttle_node);
}
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;
cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
- cfs_rq->throttled_clock_pelt;
+ if (cfs_rq->throttle_count)
+ return 0;
- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
- cfs_rq->throttled_clock_self = 0;
+ cfs_rq->throttled_clock_self = 0;
- if (SCHED_WARN_ON((s64)delta < 0))
- delta = 0;
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;
- cfs_rq->throttled_clock_self_time += delta;
- }
+ cfs_rq->throttled_clock_self_time += delta;
}
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ /*
+ * FIXME: p may not be allowed to run on this rq anymore
+ * due to affinity change while p is throttled.
+ */
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+ }
+
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}
@@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
SCHED_WARN_ON(cfs_rq->throttled_clock_self);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);
+ if (!cfs_rq->nr_queued) {
+ list_del_leaf_cfs_rq(cfs_rq);
+ return 0;
+ }
+
WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
/*
* rq_lock is held, current is (obviously) executing this in kernelspace.
@@ -6031,11 +6055,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
-
- se = cfs_rq->tg->se[cpu_of(rq)];
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
cfs_rq->throttled = 0;
@@ -6063,62 +6083,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
}
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -6989,6 +6955,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
util_est_enqueue(&rq->cfs, p);
if (flags & ENQUEUE_DELAYED) {
+ SCHED_WARN_ON(task_is_throttled(p));
requeue_delayed_entity(se);
return;
}
@@ -7031,10 +6998,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}
@@ -7056,10 +7019,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -7089,7 +7048,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);
-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
--
2.39.5
Hello Aaron,
On 3/13/2025 12:51 PM, Aaron Lu wrote:
[..snip..]
> ---
> kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
> 1 file changed, 45 insertions(+), 87 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ab403ff7d53c8..4a95fe3785e43 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct
> sched_entity *se, int flags)
>
> if (cfs_rq->nr_queued == 1) {
> check_enqueue_throttle(cfs_rq);
> - if (!throttled_hierarchy(cfs_rq)) {
> - list_add_leaf_cfs_rq(cfs_rq);
> - } else {
> + list_add_leaf_cfs_rq(cfs_rq);
> #ifdef CONFIG_CFS_BANDWIDTH
> + if (throttled_hierarchy(cfs_rq)) {
> struct rq *rq = rq_of(cfs_rq);
>
> if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> cfs_rq->throttled_clock = rq_clock(rq);
> if (!cfs_rq->throttled_clock_self)
> cfs_rq->throttled_clock_self = rq_clock(rq);
These bits probably need revisiting. From what I understand, these
stats were maintained to know when a task was woken up on a throttled
hierarchy that was not connected to the parent, essentially tracking
the amount of time runnable tasks were waiting on the cfs_rq before an
unthrottle event allowed them to be picked.
With per-task throttle, these semantics no longer apply since a woken
task will run and dequeue itself when exiting to userspace.
Josh do you have any thoughts on this?
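To make the above easier to follow, here is a rough sketch of the
return-to-user throttle point the series relies on (the real hook lives
in an earlier patch; the task_struct field name and the dequeue flag
below are assumptions, so treat this as an approximation rather than
the actual code):

static void throttle_cfs_rq_work(struct callback_head *work)
{
	/* Field name "sched_throttle_work" is assumed for illustration. */
	struct task_struct *p = container_of(work, struct task_struct,
					     sched_throttle_work);
	struct cfs_rq *cfs_rq;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(p, &rf);

	cfs_rq = cfs_rq_of(&p->se);
	if (throttled_hierarchy(cfs_rq)) {
		/*
		 * Park the task on the per-cfs_rq limbo list until
		 * unthrottle; the flag choice here is an assumption.
		 */
		dequeue_task_fair(rq, p, DEQUEUE_SLEEP);
		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
		resched_curr(rq);
	}

	task_rq_unlock(rq, p, &rf);
}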
> -#endif
> }
> +#endif
> }
> }
>
> @@ -5525,8 +5524,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct
> sched_entity *se, int flags)
> if (flags & DEQUEUE_DELAYED)
> finish_delayed_dequeue_entity(se);
>
> - if (cfs_rq->nr_queued == 0)
> + if (cfs_rq->nr_queued == 0) {
> update_idle_cfs_rq_clock_pelt(cfs_rq);
> + if (throttled_hierarchy(cfs_rq))
> + list_del_leaf_cfs_rq(cfs_rq);
> + }
>
> return true;
> }
> @@ -5832,6 +5834,11 @@ static inline int throttled_lb_pair(struct
> task_group *tg,
> throttled_hierarchy(dest_cfs_rq);
> }
>
> +static inline bool task_is_throttled(struct task_struct *p)
> +{
> + return !list_empty(&p->throttle_node);
> +}
> +
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static void throttle_cfs_rq_work(struct callback_head *work)
> {
> @@ -5885,32 +5892,45 @@ void init_cfs_throttle_work(struct task_struct *p)
> INIT_LIST_HEAD(&p->throttle_node);
> }
>
> +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static int tg_unthrottle_up(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> + struct task_struct *p, *tmp;
>
> cfs_rq->throttle_count--;
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> - cfs_rq->throttled_clock_pelt;
> + if (cfs_rq->throttle_count)
> + return 0;
>
> - /* Add cfs_rq with load or one or more already running entities to
> the list */
> - if (!cfs_rq_is_decayed(cfs_rq))
> - list_add_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> + cfs_rq->throttled_clock_pelt;
>
> - if (cfs_rq->throttled_clock_self) {
> - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
> + if (cfs_rq->throttled_clock_self) {
> + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
>
> - cfs_rq->throttled_clock_self = 0;
> + cfs_rq->throttled_clock_self = 0;
>
> - if (SCHED_WARN_ON((s64)delta < 0))
> - delta = 0;
> + if (SCHED_WARN_ON((s64)delta < 0))
> + delta = 0;
>
> - cfs_rq->throttled_clock_self_time += delta;
> - }
> + cfs_rq->throttled_clock_self_time += delta;
> }
>
> + /* Re-enqueue the tasks that have been throttled at this level. */
> + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list,
> throttle_node) {
> + list_del_init(&p->throttle_node);
> + /*
> + * FIXME: p may not be allowed to run on this rq anymore
> + * due to affinity change while p is throttled.
> + */
Using dequeue_task_fair() for throttle should ensure that the core now
sees task_on_rq_queued(), which should make it go through a full
dequeue cycle that removes the task from the "throttled_limbo_list",
and the enqueue should then put it back on the correct runqueue.
Is the above comment inaccurate with your changes or did I miss
something?
> + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
> + }
> +
> + /* Add cfs_rq with load or one or more already running entities to the list */
> + if (!cfs_rq_is_decayed(cfs_rq))
> + list_add_leaf_cfs_rq(cfs_rq);
> +
> return 0;
> }
>
> @@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group
> *tg, void *data)
>
> /* group is entering throttled state, stop time */
> cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
>
> SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> if (cfs_rq->nr_queued)
> cfs_rq->throttled_clock_self = rq_clock(rq);
>
> + if (!cfs_rq->nr_queued) {
> + list_del_leaf_cfs_rq(cfs_rq);
> + return 0;
> + }
> +
This bit can perhaps go in Patch 2?
> WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> /*
> * rq_lock is held, current is (obviously) executing this in kernelspace.
> @@ -6031,11 +6055,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> {
> struct rq *rq = rq_of(cfs_rq);
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> - struct sched_entity *se;
> - long queued_delta, runnable_delta, idle_delta;
> - long rq_h_nr_queued = rq->cfs.h_nr_queued;
> -
> - se = cfs_rq->tg->se[cpu_of(rq)];
> + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
>
> cfs_rq->throttled = 0;
>
[..snip..]
--
Thanks and Regards,
Prateek
On Fri, Mar 14, 2025 at 09:23:47AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/13/2025 12:51 PM, Aaron Lu wrote:
>
> [..snip..]
>
> > ---
> > kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
> > 1 file changed, 45 insertions(+), 87 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index ab403ff7d53c8..4a95fe3785e43 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct
> > sched_entity *se, int flags)
> >
> > if (cfs_rq->nr_queued == 1) {
> > check_enqueue_throttle(cfs_rq);
> > - if (!throttled_hierarchy(cfs_rq)) {
> > - list_add_leaf_cfs_rq(cfs_rq);
> > - } else {
> > + list_add_leaf_cfs_rq(cfs_rq);
> > #ifdef CONFIG_CFS_BANDWIDTH
> > + if (throttled_hierarchy(cfs_rq)) {
> > struct rq *rq = rq_of(cfs_rq);
> >
> > if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> > cfs_rq->throttled_clock = rq_clock(rq);
> > if (!cfs_rq->throttled_clock_self)
> > cfs_rq->throttled_clock_self = rq_clock(rq);
>
> These bits probabaly need revisiting. From what I understand, these
> stats were maintained to know when a task was woken up on a
> throttled hierarchy which was not connected to the parent essentially
> tracking the amount of time runnable tasks were waiting on the
> cfs_rq before an unthrottle event allowed them to be picked.
Do you mean these throttled_clock stats?

I think they are here because we do not record the throttled_clock for
empty cfs_rqs, and once the cfs_rq has a task enqueued, it needs to
record its throttled_clock. This was done in commit 79462e8c879a
("sched: don't account throttle time for empty groups") by Josh. I
don't think per-task throttle changes this.

With that said, I think there is a gap in per-task throttle: when all
tasks are dequeued from a throttled cfs_rq, we should record its
throttled_time and clear its throttled_clock.
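A minimal sketch of what I mean (the helper name and the exact call
site are only for illustration, this is not posted code):

static void flush_throttled_clock(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

	/*
	 * Fold the pending throttled period into the stats; the clock is
	 * restarted from enqueue_entity() when a task shows up again.
	 */
	if (cfs_rq_throttled(cfs_rq) && cfs_rq->throttled_clock) {
		cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
		cfs_rq->throttled_clock = 0;
	}
}

It could be called from dequeue_entity() at the spot where this patch
already drops an empty throttled cfs_rq from the leaf list.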
>
> With per-task throttle, these semantics no longer apply since a woken
> task will run and dequeue itself when exiting to userspace.
>
> Josh do you have any thoughts on this?
>
> > -#endif
> > }
> > +#endif
> > }
> > }
> >
> > @@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group
> > *tg, void *data)
> >
> > /* group is entering throttled state, stop time */
> > cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > - list_del_leaf_cfs_rq(cfs_rq);
> >
> > SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > if (cfs_rq->nr_queued)
> > cfs_rq->throttled_clock_self = rq_clock(rq);
> >
> > + if (!cfs_rq->nr_queued) {
> > + list_del_leaf_cfs_rq(cfs_rq);
> > + return 0;
> > + }
> > +
>
> This bit can perhaps go in Patch 2?
I kept all the changes to leaf cfs_rq handling in one patch; I think it
is easier to review :-)
Thanks,
Aaron
> > WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> > /*
> > * rq_lock is held, current is (obviously) executing this in kernelspace.
Hello Aaron,
On 3/14/2025 4:13 PM, Aaron Lu wrote:
> On Fri, Mar 14, 2025 at 09:23:47AM +0530, K Prateek Nayak wrote:
>> Hello Aaron,
>>
>> On 3/13/2025 12:51 PM, Aaron Lu wrote:
>>
>> [..snip..]
>>
>>> ---
>>> kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
>>> 1 file changed, 45 insertions(+), 87 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index ab403ff7d53c8..4a95fe3785e43 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct
>>> sched_entity *se, int flags)
>>>
>>> if (cfs_rq->nr_queued == 1) {
>>> check_enqueue_throttle(cfs_rq);
>>> - if (!throttled_hierarchy(cfs_rq)) {
>>> - list_add_leaf_cfs_rq(cfs_rq);
>>> - } else {
>>> + list_add_leaf_cfs_rq(cfs_rq);
>>> #ifdef CONFIG_CFS_BANDWIDTH
>>> + if (throttled_hierarchy(cfs_rq)) {
>>> struct rq *rq = rq_of(cfs_rq);
>>>
>>> if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
>>> cfs_rq->throttled_clock = rq_clock(rq);
>>> if (!cfs_rq->throttled_clock_self)
>>> cfs_rq->throttled_clock_self = rq_clock(rq);
>>
>> These bits probabaly need revisiting. From what I understand, these
>> stats were maintained to know when a task was woken up on a
>> throttled hierarchy which was not connected to the parent essentially
>> tracking the amount of time runnable tasks were waiting on the
>> cfs_rq before an unthrottle event allowed them to be picked.
>
> Do you mean these throttled_clock stats?
>
> I think they are here because we do not record the throttled_clock for
> empty cfs_rqs and once the cfs_rq has task enqueued, it needs to record
> its throttled_clock. This is done in commit 79462e8c879a("sched: don't
> account throttle time for empty groups") by Josh. I don't think per-task
> throttle change this.
>
> With this said, I think there is a gap in per-task throttle, i.e. when
> all tasks are dequeued from this throttled cfs_rq, we should record its
> throttled_time and clear its throttled_clock.
Yes, but then what it'll track is the amount of time tasks were running
while the cfs_rq was on a throttled hierarchy. Is that what we want to
track, or something else?
The commit log for 677ea015f231 ("sched: add throttled time stat for
throttled children") says the following for "throttled_clock_self_time":
We currently export the total throttled time for cgroups that are given
a bandwidth limit. This patch extends this accounting to also account
the total time that each children cgroup has been throttled.
This is useful to understand the degree to which children have been
affected by the throttling control. Children which are not runnable
during the entire throttled period, for example, will not show any
self-throttling time during this period.
but with per-task model, it is probably the amount of time that
"throttled_limbo_list" has a task on it since they are runnable
but are in-fact waiting for an unthrottle.
If a task enqueues itself on a throttled hierarchy and then blocks
again before exiting to the userspace, it should not count in
"throttled_clock_self_time" since the task was runnable the whole
time despite the hierarchy being frozen.
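Roughly something like the following, purely as a sketch (helper names
are made up and this is not against any posted code):

static void limbo_list_add(struct cfs_rq *cfs_rq, struct task_struct *p)
{
	/* Start the self clock when the first task is parked in limbo. */
	if (list_empty(&cfs_rq->throttled_limbo_list) && !cfs_rq->throttled_clock_self)
		cfs_rq->throttled_clock_self = rq_clock(rq_of(cfs_rq));

	list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
}

static void limbo_list_del(struct cfs_rq *cfs_rq, struct task_struct *p)
{
	list_del_init(&p->throttle_node);

	/* Stop it once the limbo list drains (unthrottle or migration). */
	if (list_empty(&cfs_rq->throttled_limbo_list) && cfs_rq->throttled_clock_self) {
		cfs_rq->throttled_clock_self_time += rq_clock(rq_of(cfs_rq)) -
						     cfs_rq->throttled_clock_self;
		cfs_rq->throttled_clock_self = 0;
	}
}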
>
>>
>> With per-task throttle, these semantics no longer apply since a woken
>> task will run and dequeue itself when exiting to userspace.
>>
>> Josh do you have any thoughts on this?
>>
>>> -#endif
>>> }
>>> +#endif
>>> }
>>> }
>>>
>
>>> @@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group
>>> *tg, void *data)
>>>
>>> /* group is entering throttled state, stop time */
>>> cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>> - list_del_leaf_cfs_rq(cfs_rq);
>>>
>>> SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>> if (cfs_rq->nr_queued)
>>> cfs_rq->throttled_clock_self = rq_clock(rq);
>>>
>>> + if (!cfs_rq->nr_queued) {
>>> + list_del_leaf_cfs_rq(cfs_rq);
>>> + return 0;
>>> + }
>>> +
>>
>> This bit can perhaps go in Patch 2?
>
> I kept all the changes to leaf cfs_rq handling in one patch, I think it
> is easier to review :-)
Thank you! That would be great.
>
> Thanks,
> Aaron
>
>>> WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>>> /*
>>> * rq_lock is held, current is (obviously) executing this in kernelspace.
--
Thanks and Regards,
Prateek
Hi Prateek,

On Fri, Mar 14, 2025 at 11:22:20PM +0530, K Prateek Nayak wrote:
... ...
> but with per-task model, it is probably the amount of time that
> "throttled_limbo_list" has a task on it since they are runnable
> but are in-fact waiting for an unthrottle.

I tried this way of accounting and realized a problem. Assume a
hierarchy like this: /sys/fs/cgroup/1/1_1, with quota configured at the
/sys/fs/cgroup/1 level. When throttle happens and tasks of 1_1 get
throttled, the limbo list of /sys/fs/cgroup/1 will always be empty, so
its "throttled_clock_self_time" is always 0... This doesn't match
throttled_clock_self_time's semantics. "throttled_time" is similar.

I suppose we can somehow fix this by introducing something like
h_nr_throttled, but I feel that's overkill. So I'll probably just keep
the current accounting as is in the next version; feel free to let me
know if you have other thoughts.

Thanks.

> If a task enqueues itself on a throttled hierarchy and then blocks
> again before exiting to the userspace, it should not count in
> "throttled_clock_self_time" since the task was runnable the whole
> time despite the hierarchy being frozen.
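For completeness, a hypothetical sketch of the h_nr_throttled direction
(counter and helper names are made up, and as said above it is likely
overkill):

static void account_h_nr_throttled(struct cfs_rq *cfs_rq, int delta)
{
	struct rq *rq = rq_of(cfs_rq);
	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];

	/* The cfs_rq the task is parked on ... */
	cfs_rq->h_nr_throttled += delta;

	/* ... and every ancestor, so e.g. /1 notices tasks parked in /1/1_1. */
	for_each_sched_entity(se) {
		struct cfs_rq *qcfs_rq = cfs_rq_of(se);

		qcfs_rq->h_nr_throttled += delta;
	}
}

throttled_clock_self could then be started when a cfs_rq's
h_nr_throttled goes from 0 to 1 and the delta folded into
throttled_clock_self_time when it drops back to 0.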
Hello Aaron,

On 4/2/2025 2:55 PM, Aaron Lu wrote:
> Hi Prateek,
>
> On Fri, Mar 14, 2025 at 11:22:20PM +0530, K Prateek Nayak wrote:
> ... ...
>> but with per-task model, it is probably the amount of time that
>> "throttled_limbo_list" has a task on it since they are runnable
>> but are in-fact waiting for an unthrottle.
>
> I tried this way of accounting and realized a problem. Assume a
> hierarchy like this: /sys/fs/cgroup/1/1_1, with quota configured at the
> /sys/fs/cgroup/1 level. When throttle happens and tasks of 1_1 get
> throttled, the limbo list of /sys/fs/cgroup/1 will always be empty, so
> its "throttled_clock_self_time" is always 0... This doesn't match
> throttled_clock_self_time's semantics. "throttled_time" is similar.
>
> I suppose we can somehow fix this by introducing something like
> h_nr_throttled, but I feel that's overkill. So I'll probably just keep
> the current accounting as is in the next version; feel free to let me
> know if you have other thoughts.

I agree it might be overkill. We can discuss this more on v2.

--
Thanks and Regards,
Prateek

>
> Thanks.
>
>> If a task enqueues itself on a throttled hierarchy and then blocks
>> again before exiting to the userspace, it should not count in
>> "throttled_clock_self_time" since the task was runnable the whole
>> time despite the hierarchy being frozen.
On Fri, Mar 14, 2025 at 11:22:20PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/14/2025 4:13 PM, Aaron Lu wrote:
> > On Fri, Mar 14, 2025 at 09:23:47AM +0530, K Prateek Nayak wrote:
> > > Hello Aaron,
> > >
> > > On 3/13/2025 12:51 PM, Aaron Lu wrote:
> > >
> > > [..snip..]
> > >
> > > > ---
> > > > kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
> > > > 1 file changed, 45 insertions(+), 87 deletions(-)
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index ab403ff7d53c8..4a95fe3785e43 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct
> > > > sched_entity *se, int flags)
> > > >
> > > > if (cfs_rq->nr_queued == 1) {
> > > > check_enqueue_throttle(cfs_rq);
> > > > - if (!throttled_hierarchy(cfs_rq)) {
> > > > - list_add_leaf_cfs_rq(cfs_rq);
> > > > - } else {
> > > > + list_add_leaf_cfs_rq(cfs_rq);
> > > > #ifdef CONFIG_CFS_BANDWIDTH
> > > > + if (throttled_hierarchy(cfs_rq)) {
> > > > struct rq *rq = rq_of(cfs_rq);
> > > >
> > > > if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> > > > cfs_rq->throttled_clock = rq_clock(rq);
> > > > if (!cfs_rq->throttled_clock_self)
> > > > cfs_rq->throttled_clock_self = rq_clock(rq);
> > >
> > > These bits probabaly need revisiting. From what I understand, these
> > > stats were maintained to know when a task was woken up on a
> > > throttled hierarchy which was not connected to the parent essentially
> > > tracking the amount of time runnable tasks were waiting on the
> > > cfs_rq before an unthrottle event allowed them to be picked.
> >
> > Do you mean these throttled_clock stats?
> >
> > I think they are here because we do not record the throttled_clock for
> > empty cfs_rqs and once the cfs_rq has task enqueued, it needs to record
> > its throttled_clock. This is done in commit 79462e8c879a("sched: don't
> > account throttle time for empty groups") by Josh. I don't think per-task
> > throttle change this.
> >
> > With this said, I think there is a gap in per-task throttle, i.e. when
> > all tasks are dequeued from this throttled cfs_rq, we should record its
> > throttled_time and clear its throttled_clock.
>
> Yes but then what it'll track is the amount of time task were running
> when the cfs_rq was on a throttled hierarchy. Is that what we want to
> track or something else.
Right, my last comment was not correct.

Basically, my current approach tried to mimic the existing accounting,
i.e. when there is a task enqueued on a throttled cfs_rq, start
recording this cfs_rq's throttled_clock. It kind of over-accounts the
throttled time for a cfs_rq with this per-task throttle model, because
some tasks can still be running in kernel mode while the cfs_rq is
throttled.
> The commit log for 677ea015f231 ("sched: add throttled time stat for
> throttled children") says the following for "throttled_clock_self_time":
>
> We currently export the total throttled time for cgroups that are given
> a bandwidth limit. This patch extends this accounting to also account
> the total time that each children cgroup has been throttled.
>
> This is useful to understand the degree to which children have been
> affected by the throttling control. Children which are not runnable
> during the entire throttled period, for example, will not show any
> self-throttling time during this period.
>
> but with per-task model, it is probably the amount of time that
> "throttled_limbo_list" has a task on it since they are runnable
> but are in-fact waiting for an unthrottle.
>
> If a task enqueues itself on a throttled hierarchy and then blocks
> again before exiting to the userspace, it should not count in
> "throttled_clock_self_time" since the task was runnable the whole
> time despite the hierarchy being frozen.
I think there is a mismatch between per-task throttle and per-cfs_rq
stats; it's hard to make the accounting perfect. Assume a throttled
cfs_rq has 4 tasks, with 2 tasks blocked on the limbo_list and 2 tasks
still running in kernel mode. Should we treat this time as throttled or
not for this cfs_rq?

This is similar to the pelt clock freeze problem. For the above
example, should we freeze the cfs_rq's pelt clock or let it continue
when this cfs_rq is throttled with some tasks blocked on the limbo_list
and some tasks still running in kernel mode?

My understanding is that neither approach is perfect, so I just chose
the simpler one for now. Please correct me if my understanding is
wrong.
Thanks,
Aaron
On 3/14/2025 9:23 AM, K Prateek Nayak wrote:
[..snip..]
>>
>> + /* Re-enqueue the tasks that have been throttled at this level. */
>> + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list,
>> throttle_node) {
>> + list_del_init(&p->throttle_node);
>> + /*
>> + * FIXME: p may not be allowed to run on this rq anymore
>> + * due to affinity change while p is throttled.
>> + */
>
> Using dequeue_task_fair() for throttle should ensure that the core now
> sees task_on_rq_queued() which should make it go throgh a full dequeue
> cycle which will remove the task from the "throttled_limbo_list" and
> the enqueue should put it back on the correct runqueue.
>
> Is the above comment inaccurate with your changes or did I miss
> something?
Please ignore this; I just reached Patch 5. Sorry for the noise.
>
>> + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
>> + }
>> +
>> + /* Add cfs_rq with load or one or more already running entities to the list */
>> + if (!cfs_rq_is_decayed(cfs_rq))
>> + list_add_leaf_cfs_rq(cfs_rq);
>> +
>> return 0;
>> }
>>
--
Thanks and Regards,
Prateek