From: Valentin Schneider <vschneid@redhat.com>
In the current throttle model, when a cfs_rq is throttled, its entity is
dequeued from the cpu's rq, making the tasks attached to it unable to
run, thus achieving the throttle target.

This has a drawback though: assume a task is a reader of percpu_rwsem
and is waiting. When it gets woken up, it cannot run until its task
group's next period comes, which can be a relatively long time. The
waiting writer will have to wait longer because of this, and it also
makes further readers build up and eventually triggers a task hung.

To improve this situation, change the throttle model to be task based,
i.e. when a cfs_rq is throttled, record its throttled status but do not
remove it from the cpu's rq. Instead, for tasks that belong to this
cfs_rq, add a task work to them when they get picked so that when they
return to userspace, they can be dequeued there. In this way, throttled
tasks will not hold any kernel resources.
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 185 +++++++++++++++++++++----------------------
kernel/sched/sched.h | 1 +
2 files changed, 93 insertions(+), 93 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 894202d232efd..c566a5a90d065 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5516,8 +5516,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+ if (throttled_hierarchy(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
return true;
}
@@ -5598,7 +5601,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
return se;
}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -5823,8 +5826,48 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static void throttle_cfs_rq_work(struct callback_head *work)
{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ scoped_guard(task_rq_lock, p) {
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ /*
+ * If not in limbo, then either replenish has happened or this
+ * task got migrated out of the throttled cfs_rq, move along.
+ */
+ if (!cfs_rq->throttle_count)
+ return;
+
+ rq = scope.rq;
+ update_rq_clock(rq);
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ resched_curr(rq);
+ }
+
+ cond_resched_tasks_rcu_qs();
}
void init_cfs_throttle_work(struct task_struct *p)
@@ -5864,32 +5907,53 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ if (task_has_throttle_work(p))
+ return;
+
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ cfs_rq->throttle_count++;
+ if (cfs_rq->throttle_count > 1)
+ return 0;
+
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- WARN_ON_ONCE(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
- }
- cfs_rq->throttle_count++;
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_queued)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+ else
+ list_del_leaf_cfs_rq(cfs_rq);
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
return 0;
}
-static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta, dequeue = 1;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5910,74 +5974,13 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_b->lock);
if (!dequeue)
- return false; /* Throttle no longer required. */
-
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ return; /* Throttle no longer required. */
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, queued_delta);
-
- /* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
- dl_server_stop(&rq->fair_server);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5986,7 +5989,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
WARN_ON_ONCE(cfs_rq->throttled_clock);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
- return true;
+ return;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -6462,22 +6465,22 @@ static void sync_throttle(struct task_group *tg, int cpu)
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return false;
+ return;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return false;
+ return;
/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return true;
+ return;
- return throttle_cfs_rq(cfs_rq);
+ throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -6573,6 +6576,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -6738,10 +6742,11 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
#else /* CONFIG_CFS_BANDWIDTH */
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void task_throttle_setup_work(struct task_struct *p) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
@@ -7108,10 +7113,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
-
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
slice = cfs_rq_min_slice(cfs_rq);
@@ -7148,10 +7149,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
}
sub_nr_running(rq, h_nr_queued);
@@ -8860,8 +8857,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
@@ -8888,6 +8884,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
goto idle;
se = &p->se;
+ if (throttled_hierarchy(cfs_rq_of(se)))
+ task_throttle_setup_work(p);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
if (prev->sched_class != &fair_sched_class)
goto simple;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 921527327f107..97be6a6f53b9c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -736,6 +736,7 @@ struct cfs_rq {
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
+ struct list_head throttled_limbo_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
--
2.39.5
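For reference, the throttle work above relies on the kernel's generic
task_work machinery: a callback_head queued with TWA_RESUME runs in the
task's own context just before it returns to userspace. A minimal,
illustrative sketch of that pattern follows; the demo_* names are made up
for illustration and are not part of the patch, which embeds the
callback_head (sched_throttle_work) in task_struct instead.

#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/task_work.h>

/* Illustrative only: a single static callback_head instead of one
 * embedded in task_struct, just to show the API flow. */
static struct callback_head demo_work;

static void demo_resume_cb(struct callback_head *work)
{
	/* Runs in the context of the task itself, in process context,
	 * right before it completes its return to userspace. */
	pr_info("pid %d is about to return to userspace\n", current->pid);
}

static void demo_arm_resume_work(struct task_struct *p)
{
	init_task_work(&demo_work, demo_resume_cb);

	/* TWA_RESUME: run the callback on @p's next return to userspace. */
	if (task_work_add(p, &demo_work, TWA_RESUME))
		pr_warn("pid %d is exiting, work not queued\n", p->pid);
}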
On Wed, Apr 09, 2025 at 08:07:41PM +0800, Aaron Lu wrote:
... ...
> @@ -8888,6 +8884,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
> goto idle;
> se = &p->se;
>
> + if (throttled_hierarchy(cfs_rq_of(se)))
> + task_throttle_setup_work(p);
> +
Looks like this will miss the core scheduling case, where the task pick
is done in pick_task_fair().

I plan to do something like the below on top to fix the core scheduling
case:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70f7de82d1d9d..500b41f9aea72 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8858,6 +8858,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
+ struct task_struct *p;
again:
cfs_rq = &rq->cfs;
@@ -8877,7 +8878,11 @@ static struct task_struct *pick_task_fair(struct rq *rq)
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
- return task_of(se);
+ p = task_of(se);
+ if (throttled_hierarchy(cfs_rq_of(se)))
+ task_throttle_setup_work(p);
+
+ return p;
}
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
@@ -8896,9 +8901,6 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
goto idle;
se = &p->se;
- if (throttled_hierarchy(cfs_rq_of(se)))
- task_throttle_setup_work(p);
-
#ifdef CONFIG_FAIR_GROUP_SCHED
if (prev->sched_class != &fair_sched_class)
goto simple;
For non-core-scheduling, this has the same effect as the current code,
and for core-scheduling, this makes sure the picked task also gets the
throttle task work added. It could add the throttle task work to a task
unnecessarily, because in the core scheduling case a picked task may not
be able to run due to cookie and priority reasons, but at least it will
not miss the throttle work this way.
Alternatively, I can add a task_throttle_setup_work(p) somewhere in
set_next_task_fair(), but that would add one more callsite of
task_throttle_setup_work() and is not as clean and simple as the above
diff.

Feel free to let me know your thoughts, thanks!
> #ifdef CONFIG_FAIR_GROUP_SCHED
> if (prev->sched_class != &fair_sched_class)
> goto simple;
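For illustration, a rough and untested sketch of the set_next_task_fair()
alternative mentioned above could look like the following; the existing
function body is elided and the exact placement is an assumption, not
something shown in the posted series.

static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
	struct sched_entity *se = &p->se;

	/* Cover the core-scheduling pick path as well: the task chosen to
	 * run gets the throttle work if its hierarchy is throttled. */
	if (throttled_hierarchy(cfs_rq_of(se)))
		task_throttle_setup_work(p);

	/* ... existing set_next_task_fair() body continues unchanged ... */
}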
On Wed, 2025-04-09 at 20:07 +0800, Aaron Lu wrote:
> @@ -8888,6 +8884,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
> goto idle;
> se = &p->se;
>
> + if (throttled_hierarchy(cfs_rq_of(se)))
> + task_throttle_setup_work(p);
> +
> #ifdef CONFIG_FAIR_GROUP_SCHED
> if (prev->sched_class != &fair_sched_class)
> goto simple;
For testing purposes I would like to backport that to 6.1-stable. The
situation around pick_next_task_fair() seems to have changed meanwhile:
- it moved out of the CONFIG_SMP guard
- Completely different implementation
Backporting to 6.12 looks doable, but 6.6 and below looks challenging
at first glance. Do you have any insights that could help backporting,
especially for this hunk, but maybe even in general?
Best regards,
Florian
Hello Florian,
On 4/14/2025 8:09 PM, Florian Bezdeka wrote:
> On Wed, 2025-04-09 at 20:07 +0800, Aaron Lu wrote:
>> @@ -8888,6 +8884,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
>> goto idle;
>> se = &p->se;
>>
>> + if (throttled_hierarchy(cfs_rq_of(se)))
>> + task_throttle_setup_work(p);
>> +
>> #ifdef CONFIG_FAIR_GROUP_SCHED
>> if (prev->sched_class != &fair_sched_class)
>> goto simple;
>
> For testing purposes I would like to backport that to 6.1-stable. The
> situation around pick_next_task_fair() seems to have changed meanwhile:
>
> - it moved out of the CONFIG_SMP guard
> - Completely different implementation
>
> Backporting to 6.12 looks doable, but 6.6 and below looks challenging
v6.6 introduced the EEVDF algorithm that changes a fair bit of fair.c
but the bandwidth control bits are mostly the same and they all get
ripped out in Patch 2 and Patch 3.
> at first glance. Do you have any insights that could help backporting,
> especially for this hunk, but maybe even in general?
For the particular hunk, on v6.5, you can do:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b3e25be58e2b..2a8d9f19d0db 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8173,6 +8173,11 @@ done: __maybe_unused;
update_misfit_status(p, rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq_of(&p->se)))
+ task_throttle_setup_work(p);
+#endif
+
return p;
idle:
--
Add the task work just before you return "p" after the "done" label.
For the most part, this should be easily portable since the bandwidth
control mechanism hasn't seen many changes except for the async
throttling and a few bits around throttled time accounting.
Also, you can drop all the bits that refer to "delayed" or
"DEQUEUE_DELAYED" since those are EEVDF specific (Patch 6 can be fully
dropped on versions < v6.6).
>
> Best regards,
> Florian
--
Thanks and Regards,
Prateek
On Wed, 2025-04-09 at 20:07 +0800, Aaron Lu wrote:
> From: Valentin Schneider <vschneid@redhat.com>
>
> In current throttle model, when a cfs_rq is throttled, its entity will
> be dequeued from cpu's rq, making tasks attached to it not able to run,
> thus achiveing the throttle target.
>
> This has a drawback though: assume a task is a reader of percpu_rwsem
> and is waiting. When it gets wakeup, it can not run till its task group's
> next period comes, which can be a relatively long time. Waiting writer
> will have to wait longer due to this and it also makes further reader
> build up and eventually trigger task hung.
>
> To improve this situation, change the throttle model to task based, i.e.
> when a cfs_rq is throttled, record its throttled status but do not
> remove it from cpu's rq. Instead, for tasks that belong to this cfs_rq,
> when they get picked, add a task work to them so that when they return
> to user, they can be dequeued. In this way, tasks throttled will not
> hold any kernel resources.
>
> Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> ---
> kernel/sched/fair.c | 185 +++++++++++++++++++++----------------------
> kernel/sched/sched.h | 1 +
> 2 files changed, 93 insertions(+), 93 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 894202d232efd..c566a5a90d065 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5516,8 +5516,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> if (flags & DEQUEUE_DELAYED)
> finish_delayed_dequeue_entity(se);
>
> - if (cfs_rq->nr_queued == 0)
> + if (cfs_rq->nr_queued == 0) {
> update_idle_cfs_rq_clock_pelt(cfs_rq);
> + if (throttled_hierarchy(cfs_rq))
> + list_del_leaf_cfs_rq(cfs_rq);
> + }
>
> return true;
> }
> @@ -5598,7 +5601,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
> return se;
> }
>
> -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
> +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
>
> static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
> {
> @@ -5823,8 +5826,48 @@ static inline int throttled_lb_pair(struct task_group *tg,
> throttled_hierarchy(dest_cfs_rq);
> }
>
> +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static void throttle_cfs_rq_work(struct callback_head *work)
> {
> + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> + struct sched_entity *se;
> + struct cfs_rq *cfs_rq;
> + struct rq *rq;
> +
> + WARN_ON_ONCE(p != current);
> + p->sched_throttle_work.next = &p->sched_throttle_work;
> +
> + /*
> + * If task is exiting, then there won't be a return to userspace, so we
> + * don't have to bother with any of this.
> + */
> + if ((p->flags & PF_EXITING))
> + return;
> +
> + scoped_guard(task_rq_lock, p) {
> + se = &p->se;
> + cfs_rq = cfs_rq_of(se);
> +
> + /* Raced, forget */
> + if (p->sched_class != &fair_sched_class)
> + return;
> +
> + /*
> + * If not in limbo, then either replenish has happened or this
> + * task got migrated out of the throttled cfs_rq, move along.
> + */
> + if (!cfs_rq->throttle_count)
> + return;
> +
> + rq = scope.rq;
> + update_rq_clock(rq);
> + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> + resched_curr(rq);
> + }
> +
> + cond_resched_tasks_rcu_qs();
> }
>
> void init_cfs_throttle_work(struct task_struct *p)
> @@ -5864,32 +5907,53 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> return 0;
> }
>
> +static inline bool task_has_throttle_work(struct task_struct *p)
> +{
> + return p->sched_throttle_work.next != &p->sched_throttle_work;
> +}
> +
> +static inline void task_throttle_setup_work(struct task_struct *p)
> +{
> + if (task_has_throttle_work(p))
> + return;
> +
> + /*
> + * Kthreads and exiting tasks don't return to userspace, so adding the
> + * work is pointless
> + */
> + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> + return;
> +
> + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> +}
> +
> static int tg_throttle_down(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>
> + cfs_rq->throttle_count++;
> + if (cfs_rq->throttle_count > 1)
> + return 0;
> +
> /* group is entering throttled state, stop time */
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>
> - WARN_ON_ONCE(cfs_rq->throttled_clock_self);
> - if (cfs_rq->nr_queued)
> - cfs_rq->throttled_clock_self = rq_clock(rq);
> - }
> - cfs_rq->throttle_count++;
> + WARN_ON_ONCE(cfs_rq->throttled_clock_self);
> + if (cfs_rq->nr_queued)
> + cfs_rq->throttled_clock_self = rq_clock(rq);
> + else
> + list_del_leaf_cfs_rq(cfs_rq);
>
> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> return 0;
> }
tg_throttle_down() is touched twice in this series. Some code added
here (as part of patch 2) is later removed again in patch 7.
Maybe there is some room for improvement...
>
> -static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
> +static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> {
> struct rq *rq = rq_of(cfs_rq);
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> - struct sched_entity *se;
> - long queued_delta, runnable_delta, idle_delta, dequeue = 1;
> - long rq_h_nr_queued = rq->cfs.h_nr_queued;
> + int dequeue = 1;
>
> raw_spin_lock(&cfs_b->lock);
> /* This will start the period timer if necessary */
> @@ -5910,74 +5974,13 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
> raw_spin_unlock(&cfs_b->lock);
>
> if (!dequeue)
> - return false; /* Throttle no longer required. */
> -
> - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
> + return; /* Throttle no longer required. */
>
> /* freeze hierarchy runnable averages while throttled */
> rcu_read_lock();
> walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
> rcu_read_unlock();
>
> - queued_delta = cfs_rq->h_nr_queued;
> - runnable_delta = cfs_rq->h_nr_runnable;
> - idle_delta = cfs_rq->h_nr_idle;
> - for_each_sched_entity(se) {
> - struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> - int flags;
> -
> - /* throttled entity or throttle-on-deactivate */
> - if (!se->on_rq)
> - goto done;
> -
> - /*
> - * Abuse SPECIAL to avoid delayed dequeue in this instance.
> - * This avoids teaching dequeue_entities() about throttled
> - * entities and keeps things relatively simple.
> - */
> - flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
> - if (se->sched_delayed)
> - flags |= DEQUEUE_DELAYED;
> - dequeue_entity(qcfs_rq, se, flags);
> -
> - if (cfs_rq_is_idle(group_cfs_rq(se)))
> - idle_delta = cfs_rq->h_nr_queued;
> -
> - qcfs_rq->h_nr_queued -= queued_delta;
> - qcfs_rq->h_nr_runnable -= runnable_delta;
> - qcfs_rq->h_nr_idle -= idle_delta;
> -
> - if (qcfs_rq->load.weight) {
> - /* Avoid re-evaluating load for this entity: */
> - se = parent_entity(se);
> - break;
> - }
> - }
> -
> - for_each_sched_entity(se) {
> - struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> - /* throttled entity or throttle-on-deactivate */
> - if (!se->on_rq)
> - goto done;
> -
> - update_load_avg(qcfs_rq, se, 0);
> - se_update_runnable(se);
> -
> - if (cfs_rq_is_idle(group_cfs_rq(se)))
> - idle_delta = cfs_rq->h_nr_queued;
> -
> - qcfs_rq->h_nr_queued -= queued_delta;
> - qcfs_rq->h_nr_runnable -= runnable_delta;
> - qcfs_rq->h_nr_idle -= idle_delta;
> - }
> -
> - /* At this point se is NULL and we are at root level*/
> - sub_nr_running(rq, queued_delta);
> -
> - /* Stop the fair server if throttling resulted in no runnable tasks */
> - if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
> - dl_server_stop(&rq->fair_server);
> -done:
> /*
> * Note: distribution will already see us throttled via the
> * throttled-list. rq->lock protects completion.
> @@ -5986,7 +5989,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
> WARN_ON_ONCE(cfs_rq->throttled_clock);
> if (cfs_rq->nr_queued)
> cfs_rq->throttled_clock = rq_clock(rq);
> - return true;
> + return;
Obsolete now, could be removed.
> }
>
> void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> @@ -6462,22 +6465,22 @@ static void sync_throttle(struct task_group *tg, int cpu)
> }
>
> /* conditionally throttle active cfs_rq's from put_prev_entity() */
> -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> {
> if (!cfs_bandwidth_used())
> - return false;
> + return;
>
> if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
> - return false;
> + return;
>
> /*
> * it's possible for a throttled entity to be forced into a running
> * state (e.g. set_curr_task), in this case we're finished.
> */
> if (cfs_rq_throttled(cfs_rq))
> - return true;
> + return;
>
> - return throttle_cfs_rq(cfs_rq);
> + throttle_cfs_rq(cfs_rq);
> }
>
> static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> @@ -6573,6 +6576,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> cfs_rq->runtime_enabled = 0;
> INIT_LIST_HEAD(&cfs_rq->throttled_list);
> INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
> + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
> }
>
> void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> @@ -6738,10 +6742,11 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
> #else /* CONFIG_CFS_BANDWIDTH */
>
> static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
> -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
> +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
> static inline void sync_throttle(struct task_group *tg, int cpu) {}
> static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> +static void task_throttle_setup_work(struct task_struct *p) {}
>
> static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> {
> @@ -7108,10 +7113,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> if (cfs_rq_is_idle(cfs_rq))
> h_nr_idle = h_nr_queued;
>
> - /* end evaluation on encountering a throttled cfs_rq */
> - if (cfs_rq_throttled(cfs_rq))
> - return 0;
> -
> /* Don't dequeue parent if it has other entities besides us */
> if (cfs_rq->load.weight) {
> slice = cfs_rq_min_slice(cfs_rq);
> @@ -7148,10 +7149,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>
> if (cfs_rq_is_idle(cfs_rq))
> h_nr_idle = h_nr_queued;
> -
> - /* end evaluation on encountering a throttled cfs_rq */
> - if (cfs_rq_throttled(cfs_rq))
> - return 0;
> }
>
> sub_nr_running(rq, h_nr_queued);
> @@ -8860,8 +8857,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> if (cfs_rq->curr && cfs_rq->curr->on_rq)
> update_curr(cfs_rq);
>
> - if (unlikely(check_cfs_rq_runtime(cfs_rq)))
> - goto again;
> + check_cfs_rq_runtime(cfs_rq);
>
> se = pick_next_entity(rq, cfs_rq);
> if (!se)
> @@ -8888,6 +8884,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
> goto idle;
> se = &p->se;
>
> + if (throttled_hierarchy(cfs_rq_of(se)))
> + task_throttle_setup_work(p);
> +
> #ifdef CONFIG_FAIR_GROUP_SCHED
> if (prev->sched_class != &fair_sched_class)
> goto simple;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 921527327f107..97be6a6f53b9c 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -736,6 +736,7 @@ struct cfs_rq {
> int throttle_count;
> struct list_head throttled_list;
> struct list_head throttled_csd_list;
> + struct list_head throttled_limbo_list;
> #endif /* CONFIG_CFS_BANDWIDTH */
> #endif /* CONFIG_FAIR_GROUP_SCHED */
> };
On Mon, Apr 14, 2025 at 10:54:59AM +0200, Florian Bezdeka wrote:
> On Wed, 2025-04-09 at 20:07 +0800, Aaron Lu wrote:
> > From: Valentin Schneider <vschneid@redhat.com>
> >
> > In current throttle model, when a cfs_rq is throttled, its entity will
> > be dequeued from cpu's rq, making tasks attached to it not able to run,
> > thus achiveing the throttle target.
> >
> > This has a drawback though: assume a task is a reader of percpu_rwsem
> > and is waiting. When it gets wakeup, it can not run till its task group's
> > next period comes, which can be a relatively long time. Waiting writer
> > will have to wait longer due to this and it also makes further reader
> > build up and eventually trigger task hung.
> >
> > To improve this situation, change the throttle model to task based, i.e.
> > when a cfs_rq is throttled, record its throttled status but do not
> > remove it from cpu's rq. Instead, for tasks that belong to this cfs_rq,
> > when they get picked, add a task work to them so that when they return
> > to user, they can be dequeued. In this way, tasks throttled will not
> > hold any kernel resources.
> >
> > Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> > ---
> > kernel/sched/fair.c | 185 +++++++++++++++++++++----------------------
> > kernel/sched/sched.h | 1 +
> > 2 files changed, 93 insertions(+), 93 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 894202d232efd..c566a5a90d065 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5516,8 +5516,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> > if (flags & DEQUEUE_DELAYED)
> > finish_delayed_dequeue_entity(se);
> >
> > - if (cfs_rq->nr_queued == 0)
> > + if (cfs_rq->nr_queued == 0) {
> > update_idle_cfs_rq_clock_pelt(cfs_rq);
> > + if (throttled_hierarchy(cfs_rq))
> > + list_del_leaf_cfs_rq(cfs_rq);
> > + }
> >
> > return true;
> > }
> > @@ -5598,7 +5601,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
> > return se;
> > }
> >
> > -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
> > +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
> >
> > static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
> > {
> > @@ -5823,8 +5826,48 @@ static inline int throttled_lb_pair(struct task_group *tg,
> > throttled_hierarchy(dest_cfs_rq);
> > }
> >
> > +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> > static void throttle_cfs_rq_work(struct callback_head *work)
> > {
> > + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> > + struct sched_entity *se;
> > + struct cfs_rq *cfs_rq;
> > + struct rq *rq;
> > +
> > + WARN_ON_ONCE(p != current);
> > + p->sched_throttle_work.next = &p->sched_throttle_work;
> > +
> > + /*
> > + * If task is exiting, then there won't be a return to userspace, so we
> > + * don't have to bother with any of this.
> > + */
> > + if ((p->flags & PF_EXITING))
> > + return;
> > +
> > + scoped_guard(task_rq_lock, p) {
> > + se = &p->se;
> > + cfs_rq = cfs_rq_of(se);
> > +
> > + /* Raced, forget */
> > + if (p->sched_class != &fair_sched_class)
> > + return;
> > +
> > + /*
> > + * If not in limbo, then either replenish has happened or this
> > + * task got migrated out of the throttled cfs_rq, move along.
> > + */
> > + if (!cfs_rq->throttle_count)
> > + return;
> > +
> > + rq = scope.rq;
> > + update_rq_clock(rq);
> > + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > + resched_curr(rq);
> > + }
> > +
> > + cond_resched_tasks_rcu_qs();
> > }
> >
> > void init_cfs_throttle_work(struct task_struct *p)
> > @@ -5864,32 +5907,53 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> > return 0;
> > }
> >
> > +static inline bool task_has_throttle_work(struct task_struct *p)
> > +{
> > + return p->sched_throttle_work.next != &p->sched_throttle_work;
> > +}
> > +
> > +static inline void task_throttle_setup_work(struct task_struct *p)
> > +{
> > + if (task_has_throttle_work(p))
> > + return;
> > +
> > + /*
> > + * Kthreads and exiting tasks don't return to userspace, so adding the
> > + * work is pointless
> > + */
> > + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> > + return;
> > +
> > + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> > +}
> > +
> > static int tg_throttle_down(struct task_group *tg, void *data)
> > {
> > struct rq *rq = data;
> > struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> >
> > + cfs_rq->throttle_count++;
> > + if (cfs_rq->throttle_count > 1)
> > + return 0;
> > +
> > /* group is entering throttled state, stop time */
> > - if (!cfs_rq->throttle_count) {
> > - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > - list_del_leaf_cfs_rq(cfs_rq);
> > + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> >
> > - WARN_ON_ONCE(cfs_rq->throttled_clock_self);
> > - if (cfs_rq->nr_queued)
> > - cfs_rq->throttled_clock_self = rq_clock(rq);
> > - }
> > - cfs_rq->throttle_count++;
> > + WARN_ON_ONCE(cfs_rq->throttled_clock_self);
> > + if (cfs_rq->nr_queued)
> > + cfs_rq->throttled_clock_self = rq_clock(rq);
> > + else
> > + list_del_leaf_cfs_rq(cfs_rq);
> >
> > + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> > return 0;
> > }
>
> tg_throttle_down() is touched twice in this series. Some code added
> here (as part of patch 2) is later removed again in patch 7.
>
> Maybe there is some room for improvement...
Yes.
So the purpose of patch 7 is to show an alternative accounting for this
new per-task throttle model, and since we haven't decided on the proper
way to do the accounting yet, I chose to separate it out. Another
rationale is that I want to keep the core of the patchset (patches 2
and 3) as simple as possible to ease reviewing. Does this make sense?
If folding them is better, I can do that too for the next version.
> >
> > -static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
> > +static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> > {
> > struct rq *rq = rq_of(cfs_rq);
> > struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> > - struct sched_entity *se;
> > - long queued_delta, runnable_delta, idle_delta, dequeue = 1;
> > - long rq_h_nr_queued = rq->cfs.h_nr_queued;
> > + int dequeue = 1;
> >
> > raw_spin_lock(&cfs_b->lock);
> > /* This will start the period timer if necessary */
> > @@ -5910,74 +5974,13 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
> > raw_spin_unlock(&cfs_b->lock);
> >
> > if (!dequeue)
> > - return false; /* Throttle no longer required. */
> > -
> > - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
> > + return; /* Throttle no longer required. */
> >
> > /* freeze hierarchy runnable averages while throttled */
> > rcu_read_lock();
> > walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
> > rcu_read_unlock();
> >
> > - queued_delta = cfs_rq->h_nr_queued;
> > - runnable_delta = cfs_rq->h_nr_runnable;
> > - idle_delta = cfs_rq->h_nr_idle;
> > - for_each_sched_entity(se) {
> > - struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> > - int flags;
> > -
> > - /* throttled entity or throttle-on-deactivate */
> > - if (!se->on_rq)
> > - goto done;
> > -
> > - /*
> > - * Abuse SPECIAL to avoid delayed dequeue in this instance.
> > - * This avoids teaching dequeue_entities() about throttled
> > - * entities and keeps things relatively simple.
> > - */
> > - flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
> > - if (se->sched_delayed)
> > - flags |= DEQUEUE_DELAYED;
> > - dequeue_entity(qcfs_rq, se, flags);
> > -
> > - if (cfs_rq_is_idle(group_cfs_rq(se)))
> > - idle_delta = cfs_rq->h_nr_queued;
> > -
> > - qcfs_rq->h_nr_queued -= queued_delta;
> > - qcfs_rq->h_nr_runnable -= runnable_delta;
> > - qcfs_rq->h_nr_idle -= idle_delta;
> > -
> > - if (qcfs_rq->load.weight) {
> > - /* Avoid re-evaluating load for this entity: */
> > - se = parent_entity(se);
> > - break;
> > - }
> > - }
> > -
> > - for_each_sched_entity(se) {
> > - struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> > - /* throttled entity or throttle-on-deactivate */
> > - if (!se->on_rq)
> > - goto done;
> > -
> > - update_load_avg(qcfs_rq, se, 0);
> > - se_update_runnable(se);
> > -
> > - if (cfs_rq_is_idle(group_cfs_rq(se)))
> > - idle_delta = cfs_rq->h_nr_queued;
> > -
> > - qcfs_rq->h_nr_queued -= queued_delta;
> > - qcfs_rq->h_nr_runnable -= runnable_delta;
> > - qcfs_rq->h_nr_idle -= idle_delta;
> > - }
> > -
> > - /* At this point se is NULL and we are at root level*/
> > - sub_nr_running(rq, queued_delta);
> > -
> > - /* Stop the fair server if throttling resulted in no runnable tasks */
> > - if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
> > - dl_server_stop(&rq->fair_server);
> > -done:
> > /*
> > * Note: distribution will already see us throttled via the
> > * throttled-list. rq->lock protects completion.
> > @@ -5986,7 +5989,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
> > WARN_ON_ONCE(cfs_rq->throttled_clock);
> > if (cfs_rq->nr_queued)
> > cfs_rq->throttled_clock = rq_clock(rq);
> > - return true;
> > + return;
>
> Obsolete now, could be removed.
Indeed and one less line of code :-)
Thanks,
Aaron
> > }
> >
> > void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> > @@ -6462,22 +6465,22 @@ static void sync_throttle(struct task_group *tg, int cpu)
> > }
> >
> > /* conditionally throttle active cfs_rq's from put_prev_entity() */
> > -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> > +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> > {
> > if (!cfs_bandwidth_used())
> > - return false;
> > + return;
> >
> > if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
> > - return false;
> > + return;
> >
> > /*
> > * it's possible for a throttled entity to be forced into a running
> > * state (e.g. set_curr_task), in this case we're finished.
> > */
> > if (cfs_rq_throttled(cfs_rq))
> > - return true;
> > + return;
> >
> > - return throttle_cfs_rq(cfs_rq);
> > + throttle_cfs_rq(cfs_rq);
> > }
> >
> > static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> > @@ -6573,6 +6576,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> > cfs_rq->runtime_enabled = 0;
> > INIT_LIST_HEAD(&cfs_rq->throttled_list);
> > INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
> > + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
> > }
> >
> > void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> > @@ -6738,10 +6742,11 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
> > #else /* CONFIG_CFS_BANDWIDTH */
> >
> > static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
> > -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
> > +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> > static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
> > static inline void sync_throttle(struct task_group *tg, int cpu) {}
> > static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> > +static void task_throttle_setup_work(struct task_struct *p) {}
> >
> > static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> > {
> > @@ -7108,10 +7113,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> > if (cfs_rq_is_idle(cfs_rq))
> > h_nr_idle = h_nr_queued;
> >
> > - /* end evaluation on encountering a throttled cfs_rq */
> > - if (cfs_rq_throttled(cfs_rq))
> > - return 0;
> > -
> > /* Don't dequeue parent if it has other entities besides us */
> > if (cfs_rq->load.weight) {
> > slice = cfs_rq_min_slice(cfs_rq);
> > @@ -7148,10 +7149,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> >
> > if (cfs_rq_is_idle(cfs_rq))
> > h_nr_idle = h_nr_queued;
> > -
> > - /* end evaluation on encountering a throttled cfs_rq */
> > - if (cfs_rq_throttled(cfs_rq))
> > - return 0;
> > }
> >
> > sub_nr_running(rq, h_nr_queued);
> > @@ -8860,8 +8857,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > if (cfs_rq->curr && cfs_rq->curr->on_rq)
> > update_curr(cfs_rq);
> >
> > - if (unlikely(check_cfs_rq_runtime(cfs_rq)))
> > - goto again;
> > + check_cfs_rq_runtime(cfs_rq);
> >
> > se = pick_next_entity(rq, cfs_rq);
> > if (!se)
> > @@ -8888,6 +8884,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
> > goto idle;
> > se = &p->se;
> >
> > + if (throttled_hierarchy(cfs_rq_of(se)))
> > + task_throttle_setup_work(p);
> > +
> > #ifdef CONFIG_FAIR_GROUP_SCHED
> > if (prev->sched_class != &fair_sched_class)
> > goto simple;
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 921527327f107..97be6a6f53b9c 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -736,6 +736,7 @@ struct cfs_rq {
> > int throttle_count;
> > struct list_head throttled_list;
> > struct list_head throttled_csd_list;
> > + struct list_head throttled_limbo_list;
> > #endif /* CONFIG_CFS_BANDWIDTH */
> > #endif /* CONFIG_FAIR_GROUP_SCHED */
> > };
>