[PATCH v4 3/5] sched/fair: Switch to task based throttle model

Aaron Lu posted 5 patches 1 month ago
[PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month ago
From: Valentin Schneider <vschneid@redhat.com>

In current throttle model, when a cfs_rq is throttled, its entity will
be dequeued from cpu's rq, making tasks attached to it not able to run,
thus achiveing the throttle target.

This has a drawback though: assume a task is a reader of percpu_rwsem
and is waiting. When it gets woken, it can not run till its task group's
next period comes, which can be a relatively long time. Waiting writer
will have to wait longer due to this and it also makes further reader
build up and eventually trigger task hung.

To improve this situation, change the throttle model to task based, i.e.
when a cfs_rq is throttled, record its throttled status but do not remove
it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
they get picked, add a task work to them so that when they return
to user, they can be dequeued there. In this way, tasks throttled will
not hold any kernel resources. And on unthrottle, enqueue back those
tasks so they can continue to run.

Throttled cfs_rq's PELT clock is handled differently now: previously the
cfs_rq's PELT clock is stopped once it entered throttled state but since
now tasks(in kernel mode) can continue to run, change the behaviour to
stop PELT clock when the throttled cfs_rq has no tasks left.

Tested-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
 kernel/sched/fair.c  | 341 ++++++++++++++++++++++---------------------
 kernel/sched/pelt.h  |   4 +-
 kernel/sched/sched.h |   3 +-
 3 files changed, 181 insertions(+), 167 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dab4ed86d0c82..25b1014d4ef86 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5291,18 +5291,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_queued == 1) {
 		check_enqueue_throttle(cfs_rq);
-		if (!throttled_hierarchy(cfs_rq)) {
-			list_add_leaf_cfs_rq(cfs_rq);
-		} else {
+		list_add_leaf_cfs_rq(cfs_rq);
 #ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
 			struct rq *rq = rq_of(cfs_rq);
 
 			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
 				cfs_rq->throttled_clock = rq_clock(rq);
 			if (!cfs_rq->throttled_clock_self)
 				cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+
+			if (cfs_rq->pelt_clock_throttled) {
+				cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+					cfs_rq->throttled_clock_pelt;
+				cfs_rq->pelt_clock_throttled = 0;
+			}
 		}
+#endif
 	}
 }
 
@@ -5341,8 +5346,6 @@ static void set_delayed(struct sched_entity *se)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		cfs_rq->h_nr_runnable--;
-		if (cfs_rq_throttled(cfs_rq))
-			break;
 	}
 }
 
@@ -5363,8 +5366,6 @@ static void clear_delayed(struct sched_entity *se)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		cfs_rq->h_nr_runnable++;
-		if (cfs_rq_throttled(cfs_rq))
-			break;
 	}
 }
 
@@ -5450,8 +5451,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (flags & DEQUEUE_DELAYED)
 		finish_delayed_dequeue_entity(se);
 
-	if (cfs_rq->nr_queued == 0)
+	if (cfs_rq->nr_queued == 0) {
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
+			struct rq *rq = rq_of(cfs_rq);
+
+			list_del_leaf_cfs_rq(cfs_rq);
+			cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+			cfs_rq->pelt_clock_throttled = 1;
+		}
+#endif
+	}
 
 	return true;
 }
@@ -5790,6 +5801,10 @@ static void throttle_cfs_rq_work(struct callback_head *work)
 		WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
 		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
 		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+		/*
+		 * Must not set throttled before dequeue or dequeue will
+		 * mistakenly regard this task as an already throttled one.
+		 */
 		p->throttled = true;
 		resched_curr(rq);
 	}
@@ -5803,32 +5818,124 @@ void init_cfs_throttle_work(struct task_struct *p)
 	INIT_LIST_HEAD(&p->throttle_node);
 }
 
+/*
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
+ */
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+	WARN_ON_ONCE(p->se.on_rq);
+	list_del_init(&p->throttle_node);
+
+	/* task blocked after throttled */
+	if (flags & DEQUEUE_SLEEP) {
+		p->throttled = false;
+		return;
+	}
+
+	/*
+	 * task is migrating off its old cfs_rq, detach
+	 * the task's load from its old cfs_rq.
+	 */
+	if (task_on_rq_migrating(p))
+		detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+	/* @p should have gone through dequeue_throttled_task() first */
+	WARN_ON_ONCE(!list_empty(&p->throttle_node));
+
+	/*
+	 * If the throttled task @p is enqueued to a throttled cfs_rq,
+	 * take the fast path by directly putting the task on the
+	 * target cfs_rq's limbo list.
+	 *
+	 * Do not do that when @p is current because the following race can
+	 * cause @p's group_node to be incorectly re-insterted in its rq's
+	 * cfs_tasks list, despite being throttled:
+	 *
+	 *     cpuX                       cpuY
+	 *   p ret2user
+	 *  throttle_cfs_rq_work()  sched_move_task(p)
+	 *  LOCK task_rq_lock
+	 *  dequeue_task_fair(p)
+	 *  UNLOCK task_rq_lock
+	 *                          LOCK task_rq_lock
+	 *                          task_current_donor(p) == true
+	 *                          task_on_rq_queued(p) == true
+	 *                          dequeue_task(p)
+	 *                          put_prev_task(p)
+	 *                          sched_change_group()
+	 *                          enqueue_task(p) -> p's new cfs_rq
+	 *                                             is throttled, go
+	 *                                             fast path and skip
+	 *                                             actual enqueue
+	 *                          set_next_task(p)
+	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
+	 *  schedule()
+	 *
+	 * In the above race case, @p current cfs_rq is in the same rq as
+	 * its previous cfs_rq because sched_move_task() only moves a task
+	 * to a different group from the same rq, so we can use its current
+	 * cfs_rq to derive rq and test if the task is current.
+	 */
+	if (throttled_hierarchy(cfs_rq) &&
+	    !task_current_donor(rq_of(cfs_rq), p)) {
+		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+		return true;
+	}
+
+	/* we can't take the fast path, do an actual enqueue*/
+	p->throttled = false;
+	return false;
+}
+
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
 static int tg_unthrottle_up(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+	struct task_struct *p, *tmp;
+
+	if (--cfs_rq->throttle_count)
+		return 0;
 
-	cfs_rq->throttle_count--;
-	if (!cfs_rq->throttle_count) {
+	if (cfs_rq->pelt_clock_throttled) {
 		cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
 					     cfs_rq->throttled_clock_pelt;
+		cfs_rq->pelt_clock_throttled = 0;
+	}
 
-		/* Add cfs_rq with load or one or more already running entities to the list */
-		if (!cfs_rq_is_decayed(cfs_rq))
-			list_add_leaf_cfs_rq(cfs_rq);
+	if (cfs_rq->throttled_clock_self) {
+		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
 
-		if (cfs_rq->throttled_clock_self) {
-			u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+		cfs_rq->throttled_clock_self = 0;
 
-			cfs_rq->throttled_clock_self = 0;
+		if (WARN_ON_ONCE((s64)delta < 0))
+			delta = 0;
 
-			if (WARN_ON_ONCE((s64)delta < 0))
-				delta = 0;
+		cfs_rq->throttled_clock_self_time += delta;
+	}
 
-			cfs_rq->throttled_clock_self_time += delta;
-		}
+	/* Re-enqueue the tasks that have been throttled at this level. */
+	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+		list_del_init(&p->throttle_node);
+		p->throttled = false;
+		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
 	}
 
+	/* Add cfs_rq with load or one or more already running entities to the list */
+	if (!cfs_rq_is_decayed(cfs_rq))
+		list_add_leaf_cfs_rq(cfs_rq);
+
 	return 0;
 }
 
@@ -5857,17 +5964,25 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
+	if (cfs_rq->throttle_count++)
+		return 0;
+
+
 	/* group is entering throttled state, stop time */
-	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+	WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+	if (cfs_rq->nr_queued)
+		cfs_rq->throttled_clock_self = rq_clock(rq);
+	else {
+		/*
+		 * For cfs_rqs that still have entities enqueued, PELT clock
+		 * stop happens at dequeue time when all entities are dequeued.
+		 */
 		list_del_leaf_cfs_rq(cfs_rq);
-
-		WARN_ON_ONCE(cfs_rq->throttled_clock_self);
-		if (cfs_rq->nr_queued)
-			cfs_rq->throttled_clock_self = rq_clock(rq);
+		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+		cfs_rq->pelt_clock_throttled = 1;
 	}
-	cfs_rq->throttle_count++;
 
+	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
 	return 0;
 }
 
@@ -5875,8 +5990,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct sched_entity *se;
-	long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+	int dequeue = 1;
 
 	raw_spin_lock(&cfs_b->lock);
 	/* This will start the period timer if necessary */
@@ -5899,68 +6013,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!dequeue)
 		return false;  /* Throttle no longer required. */
 
-	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
 	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();
 
-	queued_delta = cfs_rq->h_nr_queued;
-	runnable_delta = cfs_rq->h_nr_runnable;
-	idle_delta = cfs_rq->h_nr_idle;
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		int flags;
-
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		/*
-		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
-		 * This avoids teaching dequeue_entities() about throttled
-		 * entities and keeps things relatively simple.
-		 */
-		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
-		if (se->sched_delayed)
-			flags |= DEQUEUE_DELAYED;
-		dequeue_entity(qcfs_rq, se, flags);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-
-		if (qcfs_rq->load.weight) {
-			/* Avoid re-evaluating load for this entity: */
-			se = parent_entity(se);
-			break;
-		}
-	}
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		update_load_avg(qcfs_rq, se, 0);
-		se_update_runnable(se);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-	}
-
-	/* At this point se is NULL and we are at root level*/
-	sub_nr_running(rq, queued_delta);
-done:
 	/*
 	 * Note: distribution will already see us throttled via the
 	 * throttled-list.  rq->lock protects completion.
@@ -5976,9 +6033,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct sched_entity *se;
-	long queued_delta, runnable_delta, idle_delta;
-	long rq_h_nr_queued = rq->cfs.h_nr_queued;
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+	/*
+	 * It's possible we are called with !runtime_remaining due to things
+	 * like user changed quota setting(see tg_set_cfs_bandwidth()) or async
+	 * unthrottled us with a positive runtime_remaining but other still
+	 * running entities consumed those runtime before we reached here.
+	 *
+	 * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
+	 * because any enqueue in tg_unthrottle_up() will immediately trigger a
+	 * throttle, which is not supposed to happen on unthrottle path.
+	 */
+	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+		return;
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -6008,62 +6076,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 			if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
 				break;
 		}
-		goto unthrottle_throttle;
-	}
-
-	queued_delta = cfs_rq->h_nr_queued;
-	runnable_delta = cfs_rq->h_nr_runnable;
-	idle_delta = cfs_rq->h_nr_idle;
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
-		/* Handle any unfinished DELAY_DEQUEUE business first. */
-		if (se->sched_delayed) {
-			int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
-			dequeue_entity(qcfs_rq, se, flags);
-		} else if (se->on_rq)
-			break;
-		enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued += queued_delta;
-		qcfs_rq->h_nr_runnable += runnable_delta;
-		qcfs_rq->h_nr_idle += idle_delta;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(qcfs_rq))
-			goto unthrottle_throttle;
 	}
 
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
-		update_load_avg(qcfs_rq, se, UPDATE_TG);
-		se_update_runnable(se);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued += queued_delta;
-		qcfs_rq->h_nr_runnable += runnable_delta;
-		qcfs_rq->h_nr_idle += idle_delta;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(qcfs_rq))
-			goto unthrottle_throttle;
-	}
-
-	/* Start the fair server if un-throttling resulted in new runnable tasks */
-	if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
-		dl_server_start(&rq->fair_server);
-
-	/* At this point se is NULL and we are at root level*/
-	add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
 	assert_list_leaf_cfs_rq(rq);
 
 	/* Determine whether we need to wake up potentially idle CPU: */
@@ -6717,6 +6731,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void task_throttle_setup_work(struct task_struct *p) {}
 static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
@@ -6909,6 +6925,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	u64 slice = 0;
 
+	if (task_is_throttled(p) && enqueue_throttled_task(p))
+		return;
+
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
 	 * the cfs_rq utilization to select a frequency.
@@ -6961,10 +6980,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
 
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			goto enqueue_throttle;
-
 		flags = ENQUEUE_WAKEUP;
 	}
 
@@ -6986,10 +7001,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			goto enqueue_throttle;
 	}
 
 	if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -7019,7 +7030,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!task_new)
 		check_update_overutilized_status(rq);
 
-enqueue_throttle:
 	assert_list_leaf_cfs_rq(rq);
 
 	hrtick_update(rq);
@@ -7074,10 +7084,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
 
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			return 0;
-
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			slice = cfs_rq_min_slice(cfs_rq);
@@ -7114,10 +7120,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			return 0;
 	}
 
 	sub_nr_running(rq, h_nr_queued);
@@ -7151,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
  */
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
+	if (task_is_throttled(p)) {
+		dequeue_throttled_task(p, flags);
+		return true;
+	}
+
 	if (!p->se.sched_delayed)
 		util_est_dequeue(&rq->cfs, p);
 
@@ -8819,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
+	struct task_struct *p;
+	bool throttled;
 
 again:
 	cfs_rq = &rq->cfs;
 	if (!cfs_rq->nr_queued)
 		return NULL;
 
+	throttled = false;
+
 	do {
 		/* Might not have done put_prev_entity() */
 		if (cfs_rq->curr && cfs_rq->curr->on_rq)
 			update_curr(cfs_rq);
 
-		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
-			goto again;
+		throttled |= check_cfs_rq_runtime(cfs_rq);
 
 		se = pick_next_entity(rq, cfs_rq);
 		if (!se)
@@ -8839,7 +8849,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
-	return task_of(se);
+	p = task_of(se);
+	if (unlikely(throttled))
+		task_throttle_setup_work(p);
+	return p;
 }
 
 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa543c0f2..f921302dc40fb 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
 	u64 throttled;
 
-	if (unlikely(cfs_rq->throttle_count))
+	if (unlikely(cfs_rq->pelt_clock_throttled))
 		throttled = U64_MAX;
 	else
 		throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
-	if (unlikely(cfs_rq->throttle_count))
+	if (unlikely(cfs_rq->pelt_clock_throttled))
 		return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
 
 	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 06cc7722226b4..528af3ef8fc1e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -738,7 +738,8 @@ struct cfs_rq {
 	u64			throttled_clock_pelt_time;
 	u64			throttled_clock_self;
 	u64			throttled_clock_self_time;
-	int			throttled;
+	bool			throttled:1;
+	bool			pelt_clock_throttled:1;
 	int			throttle_count;
 	struct list_head	throttled_list;
 	struct list_head	throttled_csd_list;
-- 
2.39.5
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Benjamin Segall 4 weeks, 1 day ago
Aaron Lu <ziqianlu@bytedance.com> writes:

> +static bool enqueue_throttled_task(struct task_struct *p)
> +{
> +	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
> +
> +	/* @p should have gone through dequeue_throttled_task() first */
> +	WARN_ON_ONCE(!list_empty(&p->throttle_node));
> +
> +	/*
> +	 * If the throttled task @p is enqueued to a throttled cfs_rq,
> +	 * take the fast path by directly putting the task on the
> +	 * target cfs_rq's limbo list.
> +	 *
> +	 * Do not do that when @p is current because the following race can
> +	 * cause @p's group_node to be incorectly re-insterted in its rq's
> +	 * cfs_tasks list, despite being throttled:
> +	 *
> +	 *     cpuX                       cpuY
> +	 *   p ret2user
> +	 *  throttle_cfs_rq_work()  sched_move_task(p)
> +	 *  LOCK task_rq_lock
> +	 *  dequeue_task_fair(p)
> +	 *  UNLOCK task_rq_lock
> +	 *                          LOCK task_rq_lock
> +	 *                          task_current_donor(p) == true
> +	 *                          task_on_rq_queued(p) == true
> +	 *                          dequeue_task(p)
> +	 *                          put_prev_task(p)
> +	 *                          sched_change_group()
> +	 *                          enqueue_task(p) -> p's new cfs_rq
> +	 *                                             is throttled, go
> +	 *                                             fast path and skip
> +	 *                                             actual enqueue
> +	 *                          set_next_task(p)
> +	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
> +	 *  schedule()
> +	 *
> +	 * In the above race case, @p current cfs_rq is in the same rq as
> +	 * its previous cfs_rq because sched_move_task() only moves a task
> +	 * to a different group from the same rq, so we can use its current
> +	 * cfs_rq to derive rq and test if the task is current.
> +	 */
> +	if (throttled_hierarchy(cfs_rq) &&
> +	    !task_current_donor(rq_of(cfs_rq), p)) {
> +		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> +		return true;
> +	}
> +
> +	/* we can't take the fast path, do an actual enqueue*/
> +	p->throttled = false;
> +	return false;
> +}
> +

Is there a reason that __set_next_task_fair cannot check p->se.on_rq as
well as (or instead of) task_on_rq_queued()? All of the _entity parts of
set_next/put_prev check se.on_rq for this sort of thing, so that seems
fairly standard. And se.on_rq should exactly match if the task is on
cfs_tasks since that add/remove is done in account_entity_{en,de}queue.
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 4 weeks, 1 day ago
On Wed, Sep 03, 2025 at 01:55:36PM -0700, Benjamin Segall wrote:
> Aaron Lu <ziqianlu@bytedance.com> writes:
> 
> > +static bool enqueue_throttled_task(struct task_struct *p)
> > +{
> > +	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
> > +
> > +	/* @p should have gone through dequeue_throttled_task() first */
> > +	WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > +
> > +	/*
> > +	 * If the throttled task @p is enqueued to a throttled cfs_rq,
> > +	 * take the fast path by directly putting the task on the
> > +	 * target cfs_rq's limbo list.
> > +	 *
> > +	 * Do not do that when @p is current because the following race can
> > +	 * cause @p's group_node to be incorectly re-insterted in its rq's
> > +	 * cfs_tasks list, despite being throttled:
> > +	 *
> > +	 *     cpuX                       cpuY
> > +	 *   p ret2user
> > +	 *  throttle_cfs_rq_work()  sched_move_task(p)
> > +	 *  LOCK task_rq_lock
> > +	 *  dequeue_task_fair(p)
> > +	 *  UNLOCK task_rq_lock
> > +	 *                          LOCK task_rq_lock
> > +	 *                          task_current_donor(p) == true
> > +	 *                          task_on_rq_queued(p) == true
> > +	 *                          dequeue_task(p)
> > +	 *                          put_prev_task(p)
> > +	 *                          sched_change_group()
> > +	 *                          enqueue_task(p) -> p's new cfs_rq
> > +	 *                                             is throttled, go
> > +	 *                                             fast path and skip
> > +	 *                                             actual enqueue
> > +	 *                          set_next_task(p)
> > +	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
> > +	 *  schedule()
> > +	 *
> > +	 * In the above race case, @p current cfs_rq is in the same rq as
> > +	 * its previous cfs_rq because sched_move_task() only moves a task
> > +	 * to a different group from the same rq, so we can use its current
> > +	 * cfs_rq to derive rq and test if the task is current.
> > +	 */
> > +	if (throttled_hierarchy(cfs_rq) &&
> > +	    !task_current_donor(rq_of(cfs_rq), p)) {
> > +		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > +		return true;
> > +	}
> > +
> > +	/* we can't take the fast path, do an actual enqueue*/
> > +	p->throttled = false;
> > +	return false;
> > +}
> > +
> 
> Is there a reason that __set_next_task_fair cannot check p->se.on_rq as
> well as (or instead of) task_on_rq_queued()? All of the _entity parts of
> set_next/put_prev check se.on_rq for this sort of thing, so that seems
> fairly standard. And se.on_rq should exactly match if the task is on
> cfs_tasks since that add/remove is done in account_entity_{en,de}queue.

Makes sense to me.

Only thing that feels a little strange is, a throttled/dequeued task is
set as next now. Maybe not a big deal. I booted a VM and run some tests,
didn't notice anything wrong but I could very well miss some cases.
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 4 weeks, 1 day ago
On Thu, Sep 04, 2025 at 07:26:10PM +0800, Aaron Lu wrote:
> On Wed, Sep 03, 2025 at 01:55:36PM -0700, Benjamin Segall wrote:
> > Aaron Lu <ziqianlu@bytedance.com> writes:
> > 
> > > +static bool enqueue_throttled_task(struct task_struct *p)
> > > +{
> > > +	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
> > > +
> > > +	/* @p should have gone through dequeue_throttled_task() first */
> > > +	WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > > +
> > > +	/*
> > > +	 * If the throttled task @p is enqueued to a throttled cfs_rq,
> > > +	 * take the fast path by directly putting the task on the
> > > +	 * target cfs_rq's limbo list.
> > > +	 *
> > > +	 * Do not do that when @p is current because the following race can
> > > +	 * cause @p's group_node to be incorectly re-insterted in its rq's
> > > +	 * cfs_tasks list, despite being throttled:
> > > +	 *
> > > +	 *     cpuX                       cpuY
> > > +	 *   p ret2user
> > > +	 *  throttle_cfs_rq_work()  sched_move_task(p)
> > > +	 *  LOCK task_rq_lock
> > > +	 *  dequeue_task_fair(p)
> > > +	 *  UNLOCK task_rq_lock
> > > +	 *                          LOCK task_rq_lock
> > > +	 *                          task_current_donor(p) == true
> > > +	 *                          task_on_rq_queued(p) == true
> > > +	 *                          dequeue_task(p)
> > > +	 *                          put_prev_task(p)
> > > +	 *                          sched_change_group()
> > > +	 *                          enqueue_task(p) -> p's new cfs_rq
> > > +	 *                                             is throttled, go
> > > +	 *                                             fast path and skip
> > > +	 *                                             actual enqueue
> > > +	 *                          set_next_task(p)
> > > +	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
> > > +	 *  schedule()
> > > +	 *
> > > +	 * In the above race case, @p current cfs_rq is in the same rq as
> > > +	 * its previous cfs_rq because sched_move_task() only moves a task
> > > +	 * to a different group from the same rq, so we can use its current
> > > +	 * cfs_rq to derive rq and test if the task is current.
> > > +	 */
> > > +	if (throttled_hierarchy(cfs_rq) &&
> > > +	    !task_current_donor(rq_of(cfs_rq), p)) {
> > > +		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > > +		return true;
> > > +	}
> > > +
> > > +	/* we can't take the fast path, do an actual enqueue*/
> > > +	p->throttled = false;
> > > +	return false;
> > > +}
> > > +
> > 
> > Is there a reason that __set_next_task_fair cannot check p->se.on_rq as
> > well as (or instead of) task_on_rq_queued()? All of the _entity parts of
> > set_next/put_prev check se.on_rq for this sort of thing, so that seems
> > fairly standard. And se.on_rq should exactly match if the task is on
> > cfs_tasks since that add/remove is done in account_entity_{en,de}queue.
> 
> Makes sense to me.
> 
> Only thing that feels a little strange is, a throttled/dequeued task is
> set as next now. Maybe not a big deal. I booted a VM and run some tests,
> didn't notice anything wrong but I could very well miss some cases.

Sorry, I should have added: the above test was done with following diff:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cb93e74a850e8..7a6782617c0e8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5836,38 +5836,8 @@ static bool enqueue_throttled_task(struct task_struct *p)
 	 * If the throttled task @p is enqueued to a throttled cfs_rq,
 	 * take the fast path by directly putting the task on the
 	 * target cfs_rq's limbo list.
-	 *
-	 * Do not do that when @p is current because the following race can
-	 * cause @p's group_node to be incorectly re-insterted in its rq's
-	 * cfs_tasks list, despite being throttled:
-	 *
-	 *     cpuX                       cpuY
-	 *   p ret2user
-	 *  throttle_cfs_rq_work()  sched_move_task(p)
-	 *  LOCK task_rq_lock
-	 *  dequeue_task_fair(p)
-	 *  UNLOCK task_rq_lock
-	 *                          LOCK task_rq_lock
-	 *                          task_current_donor(p) == true
-	 *                          task_on_rq_queued(p) == true
-	 *                          dequeue_task(p)
-	 *                          put_prev_task(p)
-	 *                          sched_change_group()
-	 *                          enqueue_task(p) -> p's new cfs_rq
-	 *                                             is throttled, go
-	 *                                             fast path and skip
-	 *                                             actual enqueue
-	 *                          set_next_task(p)
-	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
-	 *  schedule()
-	 *
-	 * In the above race case, @p current cfs_rq is in the same rq as
-	 * its previous cfs_rq because sched_move_task() only moves a task
-	 * to a different group from the same rq, so we can use its current
-	 * cfs_rq to derive rq and test if the task is current.
 	 */
-	if (throttled_hierarchy(cfs_rq) &&
-	    !task_current_donor(rq_of(cfs_rq), p)) {
+	if (throttled_hierarchy(cfs_rq)) {
 		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
 		return true;
 	}
@@ -13256,7 +13226,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
 {
 	struct sched_entity *se = &p->se;
 
-	if (task_on_rq_queued(p)) {
+	if (se->on_rq) {
 		/*
 		 * Move the next running task to the front of the list, so our
 		 * cfs_tasks list becomes MRU one.
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Peter Zijlstra 4 weeks, 1 day ago
On Fri, Aug 29, 2025 at 04:11:18PM +0800, Aaron Lu wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index dab4ed86d0c82..25b1014d4ef86 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5291,18 +5291,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  
>  	if (cfs_rq->nr_queued == 1) {
>  		check_enqueue_throttle(cfs_rq);
> -		if (!throttled_hierarchy(cfs_rq)) {
> -			list_add_leaf_cfs_rq(cfs_rq);
> -		} else {
> +		list_add_leaf_cfs_rq(cfs_rq);
>  #ifdef CONFIG_CFS_BANDWIDTH
> +		if (throttled_hierarchy(cfs_rq)) {
>  			struct rq *rq = rq_of(cfs_rq);
>  
>  			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
>  				cfs_rq->throttled_clock = rq_clock(rq);
>  			if (!cfs_rq->throttled_clock_self)
>  				cfs_rq->throttled_clock_self = rq_clock(rq);
> -#endif
> +
> +			if (cfs_rq->pelt_clock_throttled) {
> +				cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> +					cfs_rq->throttled_clock_pelt;
> +				cfs_rq->pelt_clock_throttled = 0;
> +			}
>  		}
> +#endif
>  	}
>  }
>  

> @@ -5450,8 +5451,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	if (flags & DEQUEUE_DELAYED)
>  		finish_delayed_dequeue_entity(se);
>  
> -	if (cfs_rq->nr_queued == 0)
> +	if (cfs_rq->nr_queued == 0) {
>  		update_idle_cfs_rq_clock_pelt(cfs_rq);
> +#ifdef CONFIG_CFS_BANDWIDTH
> +		if (throttled_hierarchy(cfs_rq)) {
> +			struct rq *rq = rq_of(cfs_rq);
> +
> +			list_del_leaf_cfs_rq(cfs_rq);
> +			cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> +			cfs_rq->pelt_clock_throttled = 1;
> +		}
> +#endif
> +	}
>  
>  	return true;
>  }

> @@ -6717,6 +6731,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
>  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
>  static void task_throttle_setup_work(struct task_struct *p) {}
>  static bool task_is_throttled(struct task_struct *p) { return false; }
> +static void dequeue_throttled_task(struct task_struct *p, int flags) {}
> +static bool enqueue_throttled_task(struct task_struct *p) { return false; }
>  
>  static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
>  {
> @@ -6909,6 +6925,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	int rq_h_nr_queued = rq->cfs.h_nr_queued;
>  	u64 slice = 0;
>  
> +	if (task_is_throttled(p) && enqueue_throttled_task(p))
> +		return;
> +
>  	/*
>  	 * The code below (indirectly) updates schedutil which looks at
>  	 * the cfs_rq utilization to select a frequency.
> @@ -6961,10 +6980,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  		if (cfs_rq_is_idle(cfs_rq))
>  			h_nr_idle = 1;
>  
> -		/* end evaluation on encountering a throttled cfs_rq */
> -		if (cfs_rq_throttled(cfs_rq))
> -			goto enqueue_throttle;
> -
>  		flags = ENQUEUE_WAKEUP;
>  	}
>  
> @@ -6986,10 +7001,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  
>  		if (cfs_rq_is_idle(cfs_rq))
>  			h_nr_idle = 1;
> -
> -		/* end evaluation on encountering a throttled cfs_rq */
> -		if (cfs_rq_throttled(cfs_rq))
> -			goto enqueue_throttle;
>  	}
>  
>  	if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
> @@ -7019,7 +7030,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	if (!task_new)
>  		check_update_overutilized_status(rq);
>  
> -enqueue_throttle:
>  	assert_list_leaf_cfs_rq(rq);
>  
>  	hrtick_update(rq);
> @@ -7074,10 +7084,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>  		if (cfs_rq_is_idle(cfs_rq))
>  			h_nr_idle = h_nr_queued;
>  
> -		/* end evaluation on encountering a throttled cfs_rq */
> -		if (cfs_rq_throttled(cfs_rq))
> -			return 0;
> -
>  		/* Don't dequeue parent if it has other entities besides us */
>  		if (cfs_rq->load.weight) {
>  			slice = cfs_rq_min_slice(cfs_rq);
> @@ -7114,10 +7120,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>  
>  		if (cfs_rq_is_idle(cfs_rq))
>  			h_nr_idle = h_nr_queued;
> -
> -		/* end evaluation on encountering a throttled cfs_rq */
> -		if (cfs_rq_throttled(cfs_rq))
> -			return 0;
>  	}
>  
>  	sub_nr_running(rq, h_nr_queued);
> @@ -7151,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>   */
>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  {
> +	if (task_is_throttled(p)) {
> +		dequeue_throttled_task(p, flags);
> +		return true;
> +	}
> +
>  	if (!p->se.sched_delayed)
>  		util_est_dequeue(&rq->cfs, p);
>  

OK, so this makes it so that either a task is fully enqueued (all
cfs_rq's) or full not. A group cfs_rq is only marked throttled when all
its tasks are gone, and unthrottled when a task gets added. Right?

But propagate_entity_cfs_rq() is still doing the old thing, and has a
if (cfs_rq_throttled(cfs_rq)) break; inside the for_each_sched_entity()
iteration.

This seems somewhat inconsistent; or am I missing something ?
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by K Prateek Nayak 4 weeks, 1 day ago
Hello Peter,

On 9/3/2025 8:21 PM, Peter Zijlstra wrote:
>>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>  {
>> +	if (task_is_throttled(p)) {
>> +		dequeue_throttled_task(p, flags);
>> +		return true;
>> +	}
>> +
>>  	if (!p->se.sched_delayed)
>>  		util_est_dequeue(&rq->cfs, p);
>>  
> 
> OK, so this makes it so that either a task is fully enqueued (all
> cfs_rq's) or full not. A group cfs_rq is only marked throttled when all
> its tasks are gone, and unthrottled when a task gets added. Right?

cfs_rq (and the hierarchy below) is marked throttled when the quota
has elapsed. Tasks on the throttled hierarchies will dequeue
themselves completely via task work added during pick. When the last
task leaves on a cfs_rq of throttled hierarchy, PELT is frozen for
that cfs_rq.

When a new task is added on the hierarchy, the PELT is unfrozen and
the task becomes runnable. The cfs_rq and the hierarchy is still
marked throttled.

Unthrottling of hierarchy is only done at distribution.

> 
> But propagate_entity_cfs_rq() is still doing the old thing, and has a
> if (cfs_rq_throttled(cfs_rq)) break; inside the for_each_sched_entity()
> iteration.
> 
> This seems somewhat inconsistent; or am I missing something ? 

Probably an oversight. But before that, what was the reason to have
stopped this propagation at throttled_cfs_rq() before the changes?

Looking at commit 09a43ace1f98 ("sched/fair: Propagate load during
synchronous attach/detach") was it because the update_load_avg() from
enqueue_entity() loop previously in unthrottle_cfs_rq() would have
propagated the PELT to root on unthrottle?

If that was the intention, perhaps something like:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df8dc389af8e..4b32785231bf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5234,6 +5234,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq);
 
 static void
 requeue_delayed_entity(struct sched_entity *se);
@@ -5729,6 +5730,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return cfs_bandwidth_used() && cfs_rq->throttled;
 }
 
+static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
+}
+
 /* check whether cfs_rq, or any parent, is throttled */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
@@ -6721,6 +6727,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return 0;
 }
 
+static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
 	return 0;
@@ -13151,7 +13162,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-	if (cfs_rq_throttled(cfs_rq))
+	if (cfs_rq_pelt_clock_throttled(cfs_rq))
 		return;
 
 	if (!throttled_hierarchy(cfs_rq))
@@ -13165,7 +13176,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 
 		update_load_avg(cfs_rq, se, UPDATE_TG);
 
-		if (cfs_rq_throttled(cfs_rq))
+		if (cfs_rq_pelt_clock_throttled(cfs_rq))
 			break;
 
 		if (!throttled_hierarchy(cfs_rq))
---

The PELT will then be propagated to root either via the
update_load_avg() call when PELT unfreezes or through the
__update_blocked_fair() path after unthrottle. Thoughts?

-- 
Thanks and Regards,
Prateek
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Benjamin Segall 4 weeks, 1 day ago
K Prateek Nayak <kprateek.nayak@amd.com> writes:

> Hello Peter,
>
> On 9/3/2025 8:21 PM, Peter Zijlstra wrote:
>>>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>  {
>>> +	if (task_is_throttled(p)) {
>>> +		dequeue_throttled_task(p, flags);
>>> +		return true;
>>> +	}
>>> +
>>>  	if (!p->se.sched_delayed)
>>>  		util_est_dequeue(&rq->cfs, p);
>>>  
>> 
>> OK, so this makes it so that either a task is fully enqueued (all
>> cfs_rq's) or full not. A group cfs_rq is only marked throttled when all
>> its tasks are gone, and unthrottled when a task gets added. Right?
>
> cfs_rq (and the hierarchy below) is marked throttled when the quota
> has elapsed. Tasks on the throttled hierarchies will dequeue
> themselves completely via task work added during pick. When the last
> task leaves on a cfs_rq of throttled hierarchy, PELT is frozen for
> that cfs_rq.
>
> When a new task is added on the hierarchy, the PELT is unfrozen and
> the task becomes runnable. The cfs_rq and the hierarchy is still
> marked throttled.
>
> Unthrottling of hierarchy is only done at distribution.
>
>> 
>> But propagate_entity_cfs_rq() is still doing the old thing, and has a
>> if (cfs_rq_throttled(cfs_rq)) break; inside the for_each_sched_entity()
>> iteration.
>> 
>> This seems somewhat inconsistent; or am I missing something ? 
>
> Probably an oversight. But before that, what was the reason to have
> stopped this propagation at throttled_cfs_rq() before the changes?
>

Yeah, this was one of the things I was (slowly) looking at - with this
series we currently still abort in:

1) update_cfs_group
2) dequeue_entities's set_next_buddy
3) check_preempt_fair
4) yield_to
5) propagate_entity_cfs_rq

In the old design on throttle immediately remove the entire cfs_rq,
freeze time for it, and stop adjusting load. In the new design we still
pick from it, so we definitely don't want to stop time (and don't). I'm
guessing we probably also want to now adjust load for it, but it is
arguable - since all the cfs_rqs for the tg are likely to throttle at the
same time, so we might not want to mess with the shares distribution,
since when unthrottle comes around the most likely correct distribution
is the distribution we had at the time of throttle.

Assuming we do want to adjust load for a throttle then we probably want
to remove the aborts from update_cfs_group and propagate_entity_cfs_rq.
I'm guessing that we need the list_add_leaf_cfs_rq from propagate, but
I'm not 100% sure when they are actually doing something in propagate as
opposed to enqueue.

The other 3 are the same sort of thing - scheduling pick heuristics
which imo are pretty arbitrary to keep. We can reasonably say that "the
most likely thing a task in a throttled hierarchy will do is just go
throttle itself, so we shouldn't buddy it or let it preempt", but it
would also be reasonable to let them preempt/buddy normally, in case
they hold locks or such.

yield_to is used by kvm and st-dma-fence-chain.c. Yielding to a
throttle-on-exit kvm cpu thread isn't useful (so no need to remove the
abort there). The dma code is just yielding to a just-spawned kthread,
so it should be fine either way.
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 4 weeks, 1 day ago
On Wed, Sep 03, 2025 at 01:46:48PM -0700, Benjamin Segall wrote:
> K Prateek Nayak <kprateek.nayak@amd.com> writes:
> 
> > Hello Peter,
> >
> > On 9/3/2025 8:21 PM, Peter Zijlstra wrote:
> >>>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >>>  {
> >>> +	if (task_is_throttled(p)) {
> >>> +		dequeue_throttled_task(p, flags);
> >>> +		return true;
> >>> +	}
> >>> +
> >>>  	if (!p->se.sched_delayed)
> >>>  		util_est_dequeue(&rq->cfs, p);
> >>>  
> >> 
> >> OK, so this makes it so that either a task is fully enqueued (all
> >> cfs_rq's) or full not. A group cfs_rq is only marked throttled when all
> >> its tasks are gone, and unthrottled when a task gets added. Right?
> >
> > cfs_rq (and the hierarchy below) is marked throttled when the quota
> > has elapsed. Tasks on the throttled hierarchies will dequeue
> > themselves completely via task work added during pick. When the last
> > task leaves on a cfs_rq of throttled hierarchy, PELT is frozen for
> > that cfs_rq.
> >
> > When a new task is added on the hierarchy, the PELT is unfrozen and
> > the task becomes runnable. The cfs_rq and the hierarchy is still
> > marked throttled.
> >
> > Unthrottling of hierarchy is only done at distribution.
> >
> >> 
> >> But propagate_entity_cfs_rq() is still doing the old thing, and has a
> >> if (cfs_rq_throttled(cfs_rq)) break; inside the for_each_sched_entity()
> >> iteration.
> >> 
> >> This seems somewhat inconsistent; or am I missing something ? 
> >
> > Probably an oversight. But before that, what was the reason to have
> > stopped this propagation at throttled_cfs_rq() before the changes?
> >
> 
> Yeah, this was one of the things I was (slowly) looking at - with this
> series we currently still abort in:
> 
> 1) update_cfs_group
> 2) dequeue_entities's set_next_buddy
> 3) check_preempt_fair
> 4) yield_to
> 5) propagate_entity_cfs_rq
> 
> In the old design on throttle immediately remove the entire cfs_rq,
> freeze time for it, and stop adjusting load. In the new design we still
> pick from it, so we definitely don't want to stop time (and don't). I'm
> guessing we probably also want to now adjust load for it, but it is
> arguable - since all the cfs_rqs for the tg are likely to throttle at the
> same time, so we might not want to mess with the shares distribution,
> since when unthrottle comes around the most likely correct distribution
> is the distribution we had at the time of throttle.
>

I can give it a test to see how things change by adjusting load and share
distribution using my previous performance tests.

> Assuming we do want to adjust load for a throttle then we probably want
> to remove the aborts from update_cfs_group and propagate_entity_cfs_rq.
> I'm guessing that we need the list_add_leaf_cfs_rq from propagate, but
> I'm not 100% sure when they are actually doing something in propagate as
> opposed to enqueue.
>

Yes, commit 0258bdfaff5bd("sched/fair: Fix unfairness caused by missing 
load decay") added that list_add_leaf_cfs_rq() in
propagate_entity_cfs_rq() to fix a problem.

> The other 3 are the same sort of thing - scheduling pick heuristics
> which imo are pretty arbitrary to keep. We can reasonably say that "the
> most likely thing a task in a throttled hierarchy will do is just go
> throttle itself, so we shouldn't buddy it or let it preempt", but it
> would also be reasonable to let them preempt/buddy normally, in case
> they hold locks or such.

I think we do not need to special case tasks in throttled hierarchy in
check_preempt_wakeup_fair().

> 
> yield_to is used by kvm and st-dma-fence-chain.c. Yielding to a
> throttle-on-exit kvm cpu thread isn't useful (so no need to remove the
> abort there). The dma code is just yielding to a just-spawned kthread,
> so it should be fine either way.

Get it.

The cumulated diff I'm going to experiment is below, let me know if
something is wrong, thanks.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e927b9b7eeb6..c2e46b8e5e3d9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se)
 	if (!gcfs_rq || !gcfs_rq->load.weight)
 		return;
 
-	if (throttled_hierarchy(gcfs_rq))
-		return;
-
 	shares = calc_group_shares(gcfs_rq);
 	if (unlikely(se->load.weight != shares))
 		reweight_entity(cfs_rq_of(se), se, shares);
@@ -5234,6 +5231,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq);
 
 static void
 requeue_delayed_entity(struct sched_entity *se);
@@ -5729,6 +5727,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return cfs_bandwidth_used() && cfs_rq->throttled;
 }
 
+static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
+}
+
 /* check whether cfs_rq, or any parent, is throttled */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
@@ -6721,6 +6724,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return 0;
 }
 
+static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
 	return 0;
@@ -7074,7 +7082,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 			 * Bias pick_next to pick a task from this cfs_rq, as
 			 * p is sleeping when it is within its sched_slice.
 			 */
-			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+			if (task_sleep && se)
 				set_next_buddy(se);
 			break;
 		}
@@ -8722,15 +8730,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 	if (unlikely(se == pse))
 		return;
 
-	/*
-	 * This is possible from callers such as attach_tasks(), in which we
-	 * unconditionally wakeup_preempt() after an enqueue (which may have
-	 * lead to a throttle).  This both saves work and prevents false
-	 * next-buddy nomination below.
-	 */
-	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
-		return;
-
 	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
 		set_next_buddy(pse);
 	}
@@ -13154,10 +13153,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-	if (cfs_rq_throttled(cfs_rq))
-		return;
-
-	if (!throttled_hierarchy(cfs_rq))
+	if (!cfs_rq_pelt_clock_throttled(cfs_rq))
 		list_add_leaf_cfs_rq(cfs_rq);
 
 	/* Start to propagate at parent */
@@ -13168,10 +13164,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 
 		update_load_avg(cfs_rq, se, UPDATE_TG);
 
-		if (cfs_rq_throttled(cfs_rq))
-			break;
-
-		if (!throttled_hierarchy(cfs_rq))
+		if (!cfs_rq_pelt_clock_throttled(cfs_rq))
 			list_add_leaf_cfs_rq(cfs_rq);
 	}
 }
-- 
2.39.5
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 4 weeks, 1 day ago
On Thu, Sep 04, 2025 at 04:16:11PM +0800, Aaron Lu wrote:
> On Wed, Sep 03, 2025 at 01:46:48PM -0700, Benjamin Segall wrote:
> > K Prateek Nayak <kprateek.nayak@amd.com> writes:
> > 
> > > Hello Peter,
> > >
> > > On 9/3/2025 8:21 PM, Peter Zijlstra wrote:
> > >>>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > >>>  {
> > >>> +	if (task_is_throttled(p)) {
> > >>> +		dequeue_throttled_task(p, flags);
> > >>> +		return true;
> > >>> +	}
> > >>> +
> > >>>  	if (!p->se.sched_delayed)
> > >>>  		util_est_dequeue(&rq->cfs, p);
> > >>>  
> > >> 
> > >> OK, so this makes it so that either a task is fully enqueued (all
> > >> cfs_rq's) or full not. A group cfs_rq is only marked throttled when all
> > >> its tasks are gone, and unthrottled when a task gets added. Right?
> > >
> > > cfs_rq (and the hierarchy below) is marked throttled when the quota
> > > has elapsed. Tasks on the throttled hierarchies will dequeue
> > > themselves completely via task work added during pick. When the last
> > > task leaves on a cfs_rq of throttled hierarchy, PELT is frozen for
> > > that cfs_rq.
> > >
> > > When a new task is added on the hierarchy, the PELT is unfrozen and
> > > the task becomes runnable. The cfs_rq and the hierarchy is still
> > > marked throttled.
> > >
> > > Unthrottling of hierarchy is only done at distribution.
> > >
> > >> 
> > >> But propagate_entity_cfs_rq() is still doing the old thing, and has a
> > >> if (cfs_rq_throttled(cfs_rq)) break; inside the for_each_sched_entity()
> > >> iteration.
> > >> 
> > >> This seems somewhat inconsistent; or am I missing something ? 
> > >
> > > Probably an oversight. But before that, what was the reason to have
> > > stopped this propagation at throttled_cfs_rq() before the changes?
> > >
> > 
> > Yeah, this was one of the things I was (slowly) looking at - with this
> > series we currently still abort in:
> > 
> > 1) update_cfs_group
> > 2) dequeue_entities's set_next_buddy
> > 3) check_preempt_fair
> > 4) yield_to
> > 5) propagate_entity_cfs_rq
> > 
> > In the old design on throttle immediately remove the entire cfs_rq,
> > freeze time for it, and stop adjusting load. In the new design we still
> > pick from it, so we definitely don't want to stop time (and don't). I'm

Per my understanding, we keep PELT clock running because we want the
throttled cfs_rq's load to continue get update when it still has tasks
running in kernel mode and have that up2date load could let it have a
hopefully more accurate weight through update_cfs_group(). So it looks
to me, if PELT clock should not be stopped, then we should not abort in
propagate_entity_cfs_rq() and update_cfs_group(). I missed these two
aborts in these two functions, but now you and Peter have pointed this
out, I suppose there is no doubt we should not abort in
update_cfs_group() and propagate_entity_cfs_rq()? If we should not mess
with shares distribution, then the up2date load is not useful and why
not simply freeze PELT clock on throttle :)

> > guessing we probably also want to now adjust load for it, but it is
> > arguable - since all the cfs_rqs for the tg are likely to throttle at the
> > same time, so we might not want to mess with the shares distribution,
> > since when unthrottle comes around the most likely correct distribution
> > is the distribution we had at the time of throttle.
> >
> 
> I can give it a test to see how things change by adjusting load and share
> distribution using my previous performance tests.
>

Run hackbench and netperf on AMD Genoa and didn't notice any obvious
difference with the cumulated diff.

> > Assuming we do want to adjust load for a throttle then we probably want
> > to remove the aborts from update_cfs_group and propagate_entity_cfs_rq.
> > I'm guessing that we need the list_add_leaf_cfs_rq from propagate, but
> > I'm not 100% sure when they are actually doing something in propagate as
> > opposed to enqueue.
> >
> 
> Yes, commit 0258bdfaff5bd("sched/fair: Fix unfairness caused by missing 
> load decay") added that list_add_leaf_cfs_rq() in
> propagate_entity_cfs_rq() to fix a problem.
> 
> > The other 3 are the same sort of thing - scheduling pick heuristics
> > which imo are pretty arbitrary to keep. We can reasonably say that "the
> > most likely thing a task in a throttled hierarchy will do is just go
> > throttle itself, so we shouldn't buddy it or let it preempt", but it
> > would also be reasonable to let them preempt/buddy normally, in case
> > they hold locks or such.
> 
> I think we do not need to special case tasks in throttled hierarchy in
> check_preempt_wakeup_fair().
>

Since there is pros and cons either way and consider the performance
test result, I'm now feeling maybe we can leave these 3 as is and revisit
them later when there is some clear case.

> > 
> > yield_to is used by kvm and st-dma-fence-chain.c. Yielding to a
> > throttle-on-exit kvm cpu thread isn't useful (so no need to remove the
> > abort there). The dma code is just yielding to a just-spawned kthread,
> > so it should be fine either way.
> 
> Get it.
> 
> The cumulated diff I'm going to experiment is below, let me know if
> something is wrong, thanks.
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 4 weeks ago
On Thu, Sep 04, 2025 at 08:04:01PM +0800, Aaron Lu wrote:
... ...
> Per my understanding, we keep PELT clock running because we want the
> throttled cfs_rq's load to continue get update when it still has tasks
> running in kernel mode and have that up2date load could let it have a
> hopefully more accurate weight through update_cfs_group(). So it looks
> to me, if PELT clock should not be stopped, then we should not abort in
> propagate_entity_cfs_rq() and update_cfs_group(). I missed these two
> aborts in these two functions, but now you and Peter have pointed this
> out, I suppose there is no doubt we should not abort in
> update_cfs_group() and propagate_entity_cfs_rq()? If we should not mess
> with shares distribution, then the up2date load is not useful and why
> not simply freeze PELT clock on throttle :)

With more thinking, when PELT clock is running, root level cfs_rq's load
can be updated and that is useful for things like load balance.

update_cfs_group() is a different story and as Ben said, there is cons
and pros whether to abort there for throttled cfs_rq so there's no
obvious thing to do for it.
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by K Prateek Nayak 4 weeks, 1 day ago
Hello Aaron,

On 9/4/2025 1:46 PM, Aaron Lu wrote:
> @@ -8722,15 +8730,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	if (unlikely(se == pse))
>  		return;
>  
> -	/*
> -	 * This is possible from callers such as attach_tasks(), in which we
> -	 * unconditionally wakeup_preempt() after an enqueue (which may have
> -	 * lead to a throttle).  This both saves work and prevents false
> -	 * next-buddy nomination below.
> -	 */
> -	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
> -		return;

I think we should have a:

	if (task_is_throttled(p))
		return;

here. I can see at least one possibility via prio_changed_fair()
where a throttled task might reach here. Rest looks good. I'll
still wait on Ben for the update_cfs_group() bits :)

> -
>  	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
>  		set_next_buddy(pse);
>  	}
-- 
Thanks and Regards,
Prateek
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 4 weeks, 1 day ago
On Thu, Sep 04, 2025 at 03:21:06PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
> 
> On 9/4/2025 1:46 PM, Aaron Lu wrote:
> > @@ -8722,15 +8730,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> >  	if (unlikely(se == pse))
> >  		return;
> >  
> > -	/*
> > -	 * This is possible from callers such as attach_tasks(), in which we
> > -	 * unconditionally wakeup_preempt() after an enqueue (which may have
> > -	 * lead to a throttle).  This both saves work and prevents false
> > -	 * next-buddy nomination below.
> > -	 */
> > -	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
> > -		return;
> 
> I think we should have a:
> 
> 	if (task_is_throttled(p))
> 		return;
> 
> here. I can see at least one possibility via prio_changed_fair()

Ah right. I didn't realize wakeup_preempt() can be called for a throttled
task, I think it is not expected. What about forbid that :)
(not tested in anyway, just to show the idea and get feedback)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cb93e74a850e8..f1383aede764f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13135,7 +13135,11 @@ static void task_fork_fair(struct task_struct *p)
 static void
 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (!task_on_rq_queued(p))
+	/*
+	 * p->on_rq can be set for throttled task but there is no need to
+	 * check wakeup preempt for throttled task, so use p->se.on_rq instead.
+	 */
+	if (!p->se.on_rq)
 		return;
 
 	if (rq->cfs.nr_queued == 1)

> where a throttled task might reach here. Rest looks good. I'll
> still wait on Ben for the update_cfs_group() bits :)
> 
> > -
> >  	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
> >  		set_next_buddy(pse);
> >  	}

Best regards,
Aaron
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 3 weeks, 2 days ago
On Thu, Sep 04, 2025 at 07:05:04PM +0800, Aaron Lu wrote:
> On Thu, Sep 04, 2025 at 03:21:06PM +0530, K Prateek Nayak wrote:
> > Hello Aaron,
> > 
> > On 9/4/2025 1:46 PM, Aaron Lu wrote:
> > > @@ -8722,15 +8730,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> > >  	if (unlikely(se == pse))
> > >  		return;
> > >  
> > > -	/*
> > > -	 * This is possible from callers such as attach_tasks(), in which we
> > > -	 * unconditionally wakeup_preempt() after an enqueue (which may have
> > > -	 * lead to a throttle).  This both saves work and prevents false
> > > -	 * next-buddy nomination below.
> > > -	 */
> > > -	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
> > > -		return;
> > 
> > I think we should have a:
> > 
> > 	if (task_is_throttled(p))
> > 		return;
> > 
> > here. I can see at least one possibility via prio_changed_fair()
> 
> Ah right. I didn't realize wakeup_preempt() can be called for a throttled
> task, I think it is not expected. What about forbid that :)
> (not tested in anyway, just to show the idea and get feedback)
>

Turned out there are other places that also call wakeup_preempt() for a
throttled task, like sched_setaffinity() -> move_queued_task() ->
wakeup_preempt(), and it's not possible to change the logic there as
it's outside of fair, so I'll go with your suggestion to add a
task_is_throttled() check in check_preempt_wakeup_fair().

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index cb93e74a850e8..f1383aede764f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -13135,7 +13135,11 @@ static void task_fork_fair(struct task_struct *p)
>  static void
>  prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
>  {
> -	if (!task_on_rq_queued(p))
> +	/*
> +	 * p->on_rq can be set for throttled task but there is no need to
> +	 * check wakeup preempt for throttled task, so use p->se.on_rq instead.
> +	 */
> +	if (!p->se.on_rq)
>  		return;
>  
>  	if (rq->cfs.nr_queued == 1)
> 
> > where a throttled task might reach here. Rest looks good. I'll
> > still wait on Ben for the update_cfs_group() bits :)
> > 
> > > -
> > >  	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
> > >  		set_next_buddy(pse);
> > >  	}
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Benjamin Segall 3 weeks, 3 days ago
Aaron Lu <ziqianlu@bytedance.com> writes:

> On Thu, Sep 04, 2025 at 03:21:06PM +0530, K Prateek Nayak wrote:
>> Hello Aaron,
>> 
>> On 9/4/2025 1:46 PM, Aaron Lu wrote:
>> > @@ -8722,15 +8730,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>> >  	if (unlikely(se == pse))
>> >  		return;
>> >  
>> > -	/*
>> > -	 * This is possible from callers such as attach_tasks(), in which we
>> > -	 * unconditionally wakeup_preempt() after an enqueue (which may have
>> > -	 * lead to a throttle).  This both saves work and prevents false
>> > -	 * next-buddy nomination below.
>> > -	 */
>> > -	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
>> > -		return;
>> 
>> I think we should have a:
>> 
>> 	if (task_is_throttled(p))
>> 		return;
>> 
>> here. I can see at least one possibility via prio_changed_fair()
>
> Ah right. I didn't realize wakeup_preempt() can be called for a throttled
> task, I think it is not expected. What about forbid that :)
> (not tested in anyway, just to show the idea and get feedback)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index cb93e74a850e8..f1383aede764f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -13135,7 +13135,11 @@ static void task_fork_fair(struct task_struct *p)
>  static void
>  prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
>  {
> -	if (!task_on_rq_queued(p))
> +	/*
> +	 * p->on_rq can be set for throttled task but there is no need to
> +	 * check wakeup preempt for throttled task, so use p->se.on_rq instead.
> +	 */
> +	if (!p->se.on_rq)
>  		return;
>  
>  	if (rq->cfs.nr_queued == 1)
>
>> where a throttled task might reach here. Rest looks good. I'll
>> still wait on Ben for the update_cfs_group() bits :)


Yeah, I think I agree with all of these (this patch and the previous
patch); the preempt ones are subjective but I'd probably default to "no
special case needed for throttle". Removing the check in
update_cfs_group() I think is correct, unless we want to freeze
everything, yeah. (And that seems dangerous in its own way)
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 3 weeks, 3 days ago
On Mon, Sep 08, 2025 at 08:58:09PM -0700, Benjamin Segall wrote:
> Aaron Lu <ziqianlu@bytedance.com> writes:
> 
> > On Thu, Sep 04, 2025 at 03:21:06PM +0530, K Prateek Nayak wrote:
> >> Hello Aaron,
> >> 
> >> On 9/4/2025 1:46 PM, Aaron Lu wrote:
> >> > @@ -8722,15 +8730,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> >> >  	if (unlikely(se == pse))
> >> >  		return;
> >> >  
> >> > -	/*
> >> > -	 * This is possible from callers such as attach_tasks(), in which we
> >> > -	 * unconditionally wakeup_preempt() after an enqueue (which may have
> >> > -	 * lead to a throttle).  This both saves work and prevents false
> >> > -	 * next-buddy nomination below.
> >> > -	 */
> >> > -	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
> >> > -		return;
> >> 
> >> I think we should have a:
> >> 
> >> 	if (task_is_throttled(p))
> >> 		return;
> >> 
> >> here. I can see at least one possibility via prio_changed_fair()
> >
> > Ah right. I didn't realize wakeup_preempt() can be called for a throttled
> > task, I think it is not expected. What about forbid that :)
> > (not tested in anyway, just to show the idea and get feedback)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index cb93e74a850e8..f1383aede764f 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -13135,7 +13135,11 @@ static void task_fork_fair(struct task_struct *p)
> >  static void
> >  prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
> >  {
> > -	if (!task_on_rq_queued(p))
> > +	/*
> > +	 * p->on_rq can be set for throttled task but there is no need to
> > +	 * check wakeup preempt for throttled task, so use p->se.on_rq instead.
> > +	 */
> > +	if (!p->se.on_rq)
> >  		return;
> >  
> >  	if (rq->cfs.nr_queued == 1)
> >
> >> where a throttled task might reach here. Rest looks good. I'll
> >> still wait on Ben for the update_cfs_group() bits :)
> 
> 
> Yeah, I think I agree with all of these (this patch and the previous
> patch); the preempt ones are subjective but I'd probably default to "no
> special case needed for throttle". Removing the check in
> update_cfs_group() I think is correct, unless we want to freeze
> everything, yeah. (And that seems dangerous in its own way)

Thanks for all these info, let me go ahead and send these changes for
review then :)
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by K Prateek Nayak 4 weeks, 1 day ago
On 9/4/2025 4:35 PM, Aaron Lu wrote:
> Ah right. I didn't realize wakeup_preempt() can be called for a throttled
> task, I think it is not expected. What about forbid that :)
> (not tested in anyway, just to show the idea and get feedback)

Ack! Prevention is better than cure ;-)

-- 
Thanks and Regards,
Prateek
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by K Prateek Nayak 4 weeks, 1 day ago
Hello Ben,

On 9/4/2025 2:16 AM, Benjamin Segall wrote:
> K Prateek Nayak <kprateek.nayak@amd.com> writes:
> 
>> Hello Peter,
>>
>> On 9/3/2025 8:21 PM, Peter Zijlstra wrote:
>>>>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>  {
>>>> +	if (task_is_throttled(p)) {
>>>> +		dequeue_throttled_task(p, flags);
>>>> +		return true;
>>>> +	}
>>>> +
>>>>  	if (!p->se.sched_delayed)
>>>>  		util_est_dequeue(&rq->cfs, p);
>>>>  
>>>
>>> OK, so this makes it so that either a task is fully enqueued (all
>>> cfs_rq's) or full not. A group cfs_rq is only marked throttled when all
>>> its tasks are gone, and unthrottled when a task gets added. Right?
>>
>> cfs_rq (and the hierarchy below) is marked throttled when the quota
>> has elapsed. Tasks on the throttled hierarchies will dequeue
>> themselves completely via task work added during pick. When the last
>> task leaves on a cfs_rq of throttled hierarchy, PELT is frozen for
>> that cfs_rq.
>>
>> When a new task is added on the hierarchy, the PELT is unfrozen and
>> the task becomes runnable. The cfs_rq and the hierarchy is still
>> marked throttled.
>>
>> Unthrottling of hierarchy is only done at distribution.
>>
>>>
>>> But propagate_entity_cfs_rq() is still doing the old thing, and has a
>>> if (cfs_rq_throttled(cfs_rq)) break; inside the for_each_sched_entity()
>>> iteration.
>>>
>>> This seems somewhat inconsistent; or am I missing something ? 
>>
>> Probably an oversight. But before that, what was the reason to have
>> stopped this propagation at throttled_cfs_rq() before the changes?
>>
> 
> Yeah, this was one of the things I was (slowly) looking at - with this
> series we currently still abort in:
> 
> 1) update_cfs_group
> 2) dequeue_entities's set_next_buddy
> 3) check_preempt_fair
> 4) yield_to
> 5) propagate_entity_cfs_rq
> 
> In the old design on throttle immediately remove the entire cfs_rq,
> freeze time for it, and stop adjusting load. In the new design we still
> pick from it, so we definitely don't want to stop time (and don't). I'm
> guessing we probably also want to now adjust load for it, but it is
> arguable - since all the cfs_rqs for the tg are likely to throttle at the
> same time, so we might not want to mess with the shares distribution,
> since when unthrottle comes around the most likely correct distribution
> is the distribution we had at the time of throttle.

So we were having a discussion in the parallel thread here
https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/ on whether
we should allow tasks on throttled hierarchies to be load balanced or
not.

If we do want them to be migrated, I think we need update_cfs_group()
cause otherwise we might pick off most task from the hierarchy but
the sched entity of the cfs_rq will still be contributing the same
amount of weight to the root making the CPU look busier than it
actually is.

The alternate is to ensure we don't migrate the tasks on throttled
hierarchies and let them exit to userspace in-place on the same CPU
but that too is less than ideal.

> 
> Assuming we do want to adjust load for a throttle then we probably want
> to remove the aborts from update_cfs_group and propagate_entity_cfs_rq.
> I'm guessing that we need the list_add_leaf_cfs_rq from propagate, but
> I'm not 100% sure when they are actually doing something in propagate as
> opposed to enqueue.

I don't think this is required since the leaf cfs_rq removal is now
done via the dequeue path and if the cfs_rq is not on the list, then
PELT for the hierarchy below is frozen anyway.

> 
> The other 3 are the same sort of thing - scheduling pick heuristics
> which imo are pretty arbitrary to keep. We can reasonably say that "the
> most likely thing a task in a throttled hierarchy will do is just go
> throttle itself, so we shouldn't buddy it or let it preempt", but it
> would also be reasonable to let them preempt/buddy normally, in case
> they hold locks or such.

We can (and should?) consider it as any other runnable task until it
isn't IMO just to reduce the amount of special handling in those cases.

> 
> yield_to is used by kvm and st-dma-fence-chain.c. Yielding to a
> throttle-on-exit kvm cpu thread isn't useful (so no need to remove the
> abort there). The dma code is just yielding to a just-spawned kthread,
> so it should be fine either way.

Yes, this one can stay.

-- 
Thanks and Regards,
Prateek
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Benjamin Segall 3 weeks, 3 days ago
K Prateek Nayak <kprateek.nayak@amd.com> writes:

> Hello Ben,
>
> On 9/4/2025 2:16 AM, Benjamin Segall wrote:
>> K Prateek Nayak <kprateek.nayak@amd.com> writes:
>> 
>>> Hello Peter,
>>>
>>> On 9/3/2025 8:21 PM, Peter Zijlstra wrote:
>>>>>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>  {
>>>>> +	if (task_is_throttled(p)) {
>>>>> +		dequeue_throttled_task(p, flags);
>>>>> +		return true;
>>>>> +	}
>>>>> +
>>>>>  	if (!p->se.sched_delayed)
>>>>>  		util_est_dequeue(&rq->cfs, p);
>>>>>  
>>>>
>>>> OK, so this makes it so that either a task is fully enqueued (all
>>>> cfs_rq's) or full not. A group cfs_rq is only marked throttled when all
>>>> its tasks are gone, and unthrottled when a task gets added. Right?
>>>
>>> cfs_rq (and the hierarchy below) is marked throttled when the quota
>>> has elapsed. Tasks on the throttled hierarchies will dequeue
>>> themselves completely via task work added during pick. When the last
>>> task leaves on a cfs_rq of throttled hierarchy, PELT is frozen for
>>> that cfs_rq.
>>>
>>> When a new task is added on the hierarchy, the PELT is unfrozen and
>>> the task becomes runnable. The cfs_rq and the hierarchy is still
>>> marked throttled.
>>>
>>> Unthrottling of hierarchy is only done at distribution.
>>>
>>>>
>>>> But propagate_entity_cfs_rq() is still doing the old thing, and has a
>>>> if (cfs_rq_throttled(cfs_rq)) break; inside the for_each_sched_entity()
>>>> iteration.
>>>>
>>>> This seems somewhat inconsistent; or am I missing something ? 
>>>
>>> Probably an oversight. But before that, what was the reason to have
>>> stopped this propagation at throttled_cfs_rq() before the changes?
>>>
>> 
>> Yeah, this was one of the things I was (slowly) looking at - with this
>> series we currently still abort in:
>> 
>> 1) update_cfs_group
>> 2) dequeue_entities's set_next_buddy
>> 3) check_preempt_fair
>> 4) yield_to
>> 5) propagate_entity_cfs_rq
>> 
>> In the old design on throttle immediately remove the entire cfs_rq,
>> freeze time for it, and stop adjusting load. In the new design we still
>> pick from it, so we definitely don't want to stop time (and don't). I'm
>> guessing we probably also want to now adjust load for it, but it is
>> arguable - since all the cfs_rqs for the tg are likely to throttle at the
>> same time, so we might not want to mess with the shares distribution,
>> since when unthrottle comes around the most likely correct distribution
>> is the distribution we had at the time of throttle.
>
> So we were having a discussion in the parallel thread here
> https://lore.kernel.org/lkml/20250903101102.GB42@bytedance/ on whether
> we should allow tasks on throttled hierarchies to be load balanced or
> not.
>
> If we do want them to be migrated, I think we need update_cfs_group()
> cause otherwise we might pick off most task from the hierarchy but
> the sched entity of the cfs_rq will still be contributing the same
> amount of weight to the root making the CPU look busier than it
> actually is.
>
> The alternate is to ensure we don't migrate the tasks on throttled
> hierarchies and let them exit to userspace in-place on the same CPU
> but that too is less than ideal.
>

Yeah - if we don't update group se load then we shouldn't load balance
throttled-hierarchy because the amount of root load migrated in the
moment is always 0. Once we do all of that properly we should be fine to
migrate in/out of a throttled hierarchy.

Much like wakeup there's an argument for not migrating into a throttled
hierarchy, at least from an unthrottled one, where there's presumably a
high likelyhood of the thread just being preempted in userspace. (And my
gut feeling is that this case is probably even more common than wakeup,
that need_rescheds land in (return to) userspace more often than wakeups
go straight to userspace with no significant kernel work)
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Peter Zijlstra 4 weeks, 1 day ago
On Wed, Sep 03, 2025 at 10:42:01PM +0530, K Prateek Nayak wrote:
> Hello Peter,
> 
> On 9/3/2025 8:21 PM, Peter Zijlstra wrote:
> >>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >>  {
> >> +	if (task_is_throttled(p)) {
> >> +		dequeue_throttled_task(p, flags);
> >> +		return true;
> >> +	}
> >> +
> >>  	if (!p->se.sched_delayed)
> >>  		util_est_dequeue(&rq->cfs, p);
> >>  
> > 
> > OK, so this makes it so that either a task is fully enqueued (all
> > cfs_rq's) or full not. A group cfs_rq is only marked throttled when all
> > its tasks are gone, and unthrottled when a task gets added. Right?
> 
> cfs_rq (and the hierarchy below) is marked throttled when the quota
> has elapsed. Tasks on the throttled hierarchies will dequeue
> themselves completely via task work added during pick. When the last
> task leaves on a cfs_rq of throttled hierarchy, PELT is frozen for
> that cfs_rq.

Ah, right.

> When a new task is added on the hierarchy, the PELT is unfrozen and
> the task becomes runnable. The cfs_rq and the hierarchy is still
> marked throttled.
> 
> Unthrottling of hierarchy is only done at distribution.
> 
> > 
> > But propagate_entity_cfs_rq() is still doing the old thing, and has a
> > if (cfs_rq_throttled(cfs_rq)) break; inside the for_each_sched_entity()
> > iteration.
> > 
> > This seems somewhat inconsistent; or am I missing something ? 
> 
> Probably an oversight. But before that, what was the reason to have
> stopped this propagation at throttled_cfs_rq() before the changes?
> 
> Looking at commit 09a43ace1f98 ("sched/fair: Propagate load during
> synchronous attach/detach") was it because the update_load_avg() from
> enqueue_entity() loop previously in unthrottle_cfs_rq() would have
> propagated the PELT to root on unthrottle?
> 
> If that was the intention, perhaps something like:

So this is mostly tasks leaving/joining the class/cgroup. And its
purpose seems to be to remove/add the blocked load component.

Previously throttle/unthrottle would {de,en}queue the whole subtree from
PELT, see how {en,de}queue would also stop at throttle.

But now none of that is done; PELT is fully managed by the tasks
{de,en}queueing.

So I'm thinking that when a task joins fair (deboost from RT or
whatever), we add the blocking load and fully propagate it. If the task
is subject to throttling, that will then happen 'naturally' and it will
dequeue itself again.
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by K Prateek Nayak 4 weeks, 1 day ago
On 9/4/2025 1:57 AM, Peter Zijlstra wrote:
> So this is mostly tasks leaving/joining the class/cgroup. And its
> purpose seems to be to remove/add the blocked load component.
> 
> Previously throttle/unthrottle would {de,en}queue the whole subtree from
> PELT, see how {en,de}queue would also stop at throttle.
> 
> But now none of that is done; PELT is fully managed by the tasks
> {de,en}queueing.
> 
> So I'm thinking that when a task joins fair (deboost from RT or
> whatever), we add the blocking load and fully propagate it. If the task
> is subject to throttling, that will then happen 'naturally' and it will
> dequeue itself again.

That seems like the correct thing to do yes. Those throttled_cfs_rq()
checks in propagate_entity_cfs_rq() can be removed then.

-- 
Thanks and Regards,
Prateek
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 4 weeks, 1 day ago
On Thu, Sep 04, 2025 at 11:14:31AM +0530, K Prateek Nayak wrote:
> On 9/4/2025 1:57 AM, Peter Zijlstra wrote:
> > So this is mostly tasks leaving/joining the class/cgroup. And its
> > purpose seems to be to remove/add the blocked load component.
> > 
> > Previously throttle/unthrottle would {de,en}queue the whole subtree from
> > PELT, see how {en,de}queue would also stop at throttle.
> > 
> > But now none of that is done; PELT is fully managed by the tasks
> > {de,en}queueing.
> > 
> > So I'm thinking that when a task joins fair (deboost from RT or
> > whatever), we add the blocking load and fully propagate it. If the task
> > is subject to throttling, that will then happen 'naturally' and it will
> > dequeue itself again.
> 
> That seems like the correct thing to do yes. Those throttled_cfs_rq()
> checks in propagate_entity_cfs_rq() can be removed then.
>

Not sure if I understand correctly, I've come to the below code
according to your discussion:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e927b9b7eeb6..97ae561c60f5b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5234,6 +5234,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq);
 
 static void
 requeue_delayed_entity(struct sched_entity *se);
@@ -5729,6 +5730,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return cfs_bandwidth_used() && cfs_rq->throttled;
 }
 
+static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
+}
+
 /* check whether cfs_rq, or any parent, is throttled */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
@@ -6721,6 +6727,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return 0;
 }
 
+static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
 	return 0;
@@ -13154,10 +13165,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-	if (cfs_rq_throttled(cfs_rq))
-		return;
-
-	if (!throttled_hierarchy(cfs_rq))
+	if (!cfs_rq_pelt_clock_throttled(cfs_rq))
 		list_add_leaf_cfs_rq(cfs_rq);
 
 	/* Start to propagate at parent */
@@ -13168,10 +13176,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 
 		update_load_avg(cfs_rq, se, UPDATE_TG);
 
-		if (cfs_rq_throttled(cfs_rq))
-			break;
-
-		if (!throttled_hierarchy(cfs_rq))
+		if (!cfs_rq_pelt_clock_throttled(cfs_rq))
 			list_add_leaf_cfs_rq(cfs_rq);
 	}
 }

So this means when a task left/joined a cfs_rq, we will do propagate
immediately, no matter if the cfs_rq is throttled or has its pelt clock
stopped or not; if cfs_rq still has pelt clock running, it will be added
to leaf cfs_rq list to make sure its load can be decayed. If cfs_rq's
pelt clock is stopped, it will be added to leaf cfs_rq list if necessary
by enqueue_task_fair() or when it's unthrottled.
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 4 weeks ago
Hi Peter,

On Thu, Sep 04, 2025 at 03:04:07PM +0800, Aaron Lu wrote:
> On Thu, Sep 04, 2025 at 11:14:31AM +0530, K Prateek Nayak wrote:
> > On 9/4/2025 1:57 AM, Peter Zijlstra wrote:
> > > So this is mostly tasks leaving/joining the class/cgroup. And its
> > > purpose seems to be to remove/add the blocked load component.
> > > 
> > > Previously throttle/unthrottle would {de,en}queue the whole subtree from
> > > PELT, see how {en,de}queue would also stop at throttle.
> > > 
> > > But now none of that is done; PELT is fully managed by the tasks
> > > {de,en}queueing.
> > > 
> > > So I'm thinking that when a task joins fair (deboost from RT or
> > > whatever), we add the blocking load and fully propagate it. If the task
> > > is subject to throttling, that will then happen 'naturally' and it will
> > > dequeue itself again.
> > 
> > That seems like the correct thing to do yes. Those throttled_cfs_rq()
> > checks in propagate_entity_cfs_rq() can be removed then.
> >
> 
> Not sure if I understand correctly, I've come to the below code
> according to your discussion:
>

Does the below diff look sane to you? If so, shall I send a separate
patch on top or fold it in patch3 and send an updated patch3?

Thanks.

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3e927b9b7eeb6..97ae561c60f5b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5234,6 +5234,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  
>  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
>  static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
> +static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq);
>  
>  static void
>  requeue_delayed_entity(struct sched_entity *se);
> @@ -5729,6 +5730,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
>  	return cfs_bandwidth_used() && cfs_rq->throttled;
>  }
>  
> +static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
> +{
> +	return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
> +}
> +
>  /* check whether cfs_rq, or any parent, is throttled */
>  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
>  {
> @@ -6721,6 +6727,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
>  	return 0;
>  }
>  
> +static inline int cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
> +{
> +	return 0;
> +}
> +
>  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
>  {
>  	return 0;
> @@ -13154,10 +13165,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
>  {
>  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
>  
> -	if (cfs_rq_throttled(cfs_rq))
> -		return;
> -
> -	if (!throttled_hierarchy(cfs_rq))
> +	if (!cfs_rq_pelt_clock_throttled(cfs_rq))
>  		list_add_leaf_cfs_rq(cfs_rq);
>  
>  	/* Start to propagate at parent */
> @@ -13168,10 +13176,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
>  
>  		update_load_avg(cfs_rq, se, UPDATE_TG);
>  
> -		if (cfs_rq_throttled(cfs_rq))
> -			break;
> -
> -		if (!throttled_hierarchy(cfs_rq))
> +		if (!cfs_rq_pelt_clock_throttled(cfs_rq))
>  			list_add_leaf_cfs_rq(cfs_rq);
>  	}
>  }
> 
> So this means when a task left/joined a cfs_rq, we will do propagate
> immediately, no matter if the cfs_rq is throttled or has its pelt clock
> stopped or not; if cfs_rq still has pelt clock running, it will be added
> to leaf cfs_rq list to make sure its load can be decayed. If cfs_rq's
> pelt clock is stopped, it will be added to leaf cfs_rq list if necessary
> by enqueue_task_fair() or when it's unthrottled.
Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
Posted by Peter Zijlstra 4 weeks ago
On Fri, Sep 05, 2025 at 07:37:19PM +0800, Aaron Lu wrote:
> Hi Peter,
> 
> On Thu, Sep 04, 2025 at 03:04:07PM +0800, Aaron Lu wrote:
> > On Thu, Sep 04, 2025 at 11:14:31AM +0530, K Prateek Nayak wrote:
> > > On 9/4/2025 1:57 AM, Peter Zijlstra wrote:
> > > > So this is mostly tasks leaving/joining the class/cgroup. And its
> > > > purpose seems to be to remove/add the blocked load component.
> > > > 
> > > > Previously throttle/unthrottle would {de,en}queue the whole subtree from
> > > > PELT, see how {en,de}queue would also stop at throttle.
> > > > 
> > > > But now none of that is done; PELT is fully managed by the tasks
> > > > {de,en}queueing.
> > > > 
> > > > So I'm thinking that when a task joins fair (deboost from RT or
> > > > whatever), we add the blocking load and fully propagate it. If the task
> > > > is subject to throttling, that will then happen 'naturally' and it will
> > > > dequeue itself again.
> > > 
> > > That seems like the correct thing to do yes. Those throttled_cfs_rq()
> > > checks in propagate_entity_cfs_rq() can be removed then.
> > >
> > 
> > Not sure if I understand correctly, I've come to the below code
> > according to your discussion:
> >
> 
> Does the below diff look sane to you? If so, shall I send a separate
> patch on top or fold it in patch3 and send an updated patch3?

Yeah, I suppose that works. Please send a follow up patch. It would also
be good to have a comment that explains why we need that list_add_leaf
thing. I think I see, but I'm sure I'll have forgotten all next time I
see this code.
[PATCH] sched/fair: Propagate load for throttled cfs_rq
Posted by Aaron Lu 3 weeks, 4 days ago
On Fri, Sep 05, 2025 at 02:53:28PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 05, 2025 at 07:37:19PM +0800, Aaron Lu wrote:
> > Hi Peter,
> > 
> > On Thu, Sep 04, 2025 at 03:04:07PM +0800, Aaron Lu wrote:
> > > On Thu, Sep 04, 2025 at 11:14:31AM +0530, K Prateek Nayak wrote:
> > > > On 9/4/2025 1:57 AM, Peter Zijlstra wrote:
> > > > > So this is mostly tasks leaving/joining the class/cgroup. And its
> > > > > purpose seems to be to remove/add the blocked load component.
> > > > > 
> > > > > Previously throttle/unthrottle would {de,en}queue the whole subtree from
> > > > > PELT, see how {en,de}queue would also stop at throttle.
> > > > > 
> > > > > But now none of that is done; PELT is fully managed by the tasks
> > > > > {de,en}queueing.
> > > > > 
> > > > > So I'm thinking that when a task joins fair (deboost from RT or
> > > > > whatever), we add the blocking load and fully propagate it. If the task
> > > > > is subject to throttling, that will then happen 'naturally' and it will
> > > > > dequeue itself again.
> > > > 
> > > > That seems like the correct thing to do yes. Those throttled_cfs_rq()
> > > > checks in propagate_entity_cfs_rq() can be removed then.
> > > >
> > > 
> > > Not sure if I understand correctly, I've come to the below code
> > > according to your discussion:
> > >
> > 
> > Does the below diff look sane to you? If so, shall I send a separate
> > patch on top or fold it in patch3 and send an updated patch3?
> 
> Yeah, I suppose that works. Please send a follow up patch. It would also
> be good to have a comment that explains why we need that list_add_leaf
> thing. I think I see, but I'm sure I'll have forgotten all next time I
> see this code.

Here it is.

From d88c2d2be31bb3970708f9b78e1725b0e25824be Mon Sep 17 00:00:00 2001
From: Aaron Lu <ziqianlu@bytedance.com>
Date: Fri, 5 Sep 2025 14:21:50 +0800
Subject: [PATCH] sched/fair: Propagate load for throttled cfs_rq

Before task based throttle model, propagating load will stop at a
throttled cfs_rq and that propagate will happen on unthrottle time by
update_load_avg().

Now that there is no update_load_avg() on unthrottle for throttled
cfs_rq and all load tracking is done by task related operations, let the
propagate happen immediately.

While at it, add a comment to explain why cfs_rqs that are not affected
by throttle have to be added to leaf cfs_rq list in
propagate_entity_cfs_rq() per my understanding of commit 0258bdfaff5b
("sched/fair: Fix unfairness caused by missing load decay").

Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
 kernel/sched/fair.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df8dc389af8e1..03f16f70bff8a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5729,6 +5729,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return cfs_bandwidth_used() && cfs_rq->throttled;
 }
 
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
+}
+
 /* check whether cfs_rq, or any parent, is throttled */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
@@ -6721,6 +6726,11 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return 0;
 }
 
++static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+	return false;
+}
+
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
 	return 0;
@@ -13151,10 +13161,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-	if (cfs_rq_throttled(cfs_rq))
-		return;
-
-	if (!throttled_hierarchy(cfs_rq))
+	/*
+	 * If a task gets attached to this cfs_rq and before being queued,
+	 * it gets migrated to another CPU due to reasons like cpuset change,
+	 * we need to make sure this cfs_rq stays on leaf cfs_rq list to
+	 * have that removed load decayed or it can cause faireness problem.
+	 */
+	if(!cfs_rq_pelt_clock_throttled(cfs_rq))
 		list_add_leaf_cfs_rq(cfs_rq);
 
 	/* Start to propagate at parent */
@@ -13165,10 +13178,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 
 		update_load_avg(cfs_rq, se, UPDATE_TG);
 
-		if (cfs_rq_throttled(cfs_rq))
-			break;
-
-		if (!throttled_hierarchy(cfs_rq))
+		if (!cfs_rq_pelt_clock_throttled(cfs_rq))
 			list_add_leaf_cfs_rq(cfs_rq);
 	}
 }
-- 
2.39.5
Re: [PATCH] sched/fair: Propagate load for throttled cfs_rq
Posted by kernel test robot 3 weeks, 3 days ago
Hi Aaron,

kernel test robot noticed the following build errors:



url:    https://github.com/intel-lab-lkp/linux/commits/UPDATE-20250908-190724/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250829-161501
base:   the 3th patch of https://lore.kernel.org/r/20250829081120.806-4-ziqianlu%40bytedance.com
patch link:    https://lore.kernel.org/r/20250908110548.GA35%40bytedance
patch subject: [PATCH] sched/fair: Propagate load for throttled cfs_rq
config: i386-buildonly-randconfig-001-20250909 (https://download.01.org/0day-ci/archive/20250909/202509091233.f6nP3BVh-lkp@intel.com/config)
compiler: gcc-13 (Debian 13.3.0-16) 13.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250909/202509091233.f6nP3BVh-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509091233.f6nP3BVh-lkp@intel.com/

All errors (new ones prefixed by >>):

>> kernel/sched/fair.c:6747:1: error: expected identifier or '(' before '+' token
    6747 | +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
         | ^


vim +6747 kernel/sched/fair.c

  6746	
> 6747	+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
  6748	{
  6749		return false;
  6750	}
  6751	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH] sched/fair: Propagate load for throttled cfs_rq
Posted by Aaron Lu 3 weeks, 3 days ago
On Tue, Sep 09, 2025 at 12:20:47PM +0800, kernel test robot wrote:
> Hi Aaron,
> 
> kernel test robot noticed the following build errors:
> 
> 
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/UPDATE-20250908-190724/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250829-161501
> base:   the 3th patch of https://lore.kernel.org/r/20250829081120.806-4-ziqianlu%40bytedance.com
> patch link:    https://lore.kernel.org/r/20250908110548.GA35%40bytedance
> patch subject: [PATCH] sched/fair: Propagate load for throttled cfs_rq
> config: i386-buildonly-randconfig-001-20250909 (https://download.01.org/0day-ci/archive/20250909/202509091233.f6nP3BVh-lkp@intel.com/config)
> compiler: gcc-13 (Debian 13.3.0-16) 13.3.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250909/202509091233.f6nP3BVh-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202509091233.f6nP3BVh-lkp@intel.com/
> 
> All errors (new ones prefixed by >>):
> 
> >> kernel/sched/fair.c:6747:1: error: expected identifier or '(' before '+' token
>     6747 | +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
>          | ^

Sigh, I remembered I did a build test with !CFS_BANDWIDTH and now I went
to check that build directory and noticed I didn't have CFS_BANDWIDTH
disabled...

Sorry for the trouble, will send an updated patch later.

And thanks for the report, lkp.

> 
> 
> vim +6747 kernel/sched/fair.c
> 
>   6746	
> > 6747	+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
>   6748	{
>   6749		return false;
>   6750	}
>   6751	
> 
> -- 
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki
Re: [PATCH] sched/fair: Propagate load for throttled cfs_rq
Posted by K Prateek Nayak 3 weeks, 3 days ago
Hello Aaron,

On 9/9/2025 11:47 AM, Aaron Lu wrote:
>>>> kernel/sched/fair.c:6747:1: error: expected identifier or '(' before '+' token
>>     6747 | +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
>>          | ^
> 
> Sigh, I remembered I did a build test with !CFS_BANDWIDTH and now I went
> to check that build directory and noticed I didn't have CFS_BANDWIDTH
> disabled...
> 
> Sorry for the trouble, will send an updated patch later.

While at it, another nit.

On 9/8/2025 4:35 PM, Aaron Lu wrote:
> @@ -13151,10 +13161,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
>  {
>  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
>  
> -	if (cfs_rq_throttled(cfs_rq))
> -		return;
> -
> -	if (!throttled_hierarchy(cfs_rq))
> +	/*
> +	 * If a task gets attached to this cfs_rq and before being queued,
> +	 * it gets migrated to another CPU due to reasons like cpuset change,
> +	 * we need to make sure this cfs_rq stays on leaf cfs_rq list to
> +	 * have that removed load decayed or it can cause faireness problem.
> +	 */
> +	if(!cfs_rq_pelt_clock_throttled(cfs_rq))

          ^ Can you also add a space after the "if" here.

>  		list_add_leaf_cfs_rq(cfs_rq);

-- 
Thanks and Regards,
Prateek
Re: [PATCH] sched/fair: Propagate load for throttled cfs_rq
Posted by Aaron Lu 3 weeks, 3 days ago
On Tue, Sep 09, 2025 at 11:52:55AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
> 
> On 9/9/2025 11:47 AM, Aaron Lu wrote:
> >>>> kernel/sched/fair.c:6747:1: error: expected identifier or '(' before '+' token
> >>     6747 | +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
> >>          | ^
> > 
> > Sigh, I remembered I did a build test with !CFS_BANDWIDTH and now I went
> > to check that build directory and noticed I didn't have CFS_BANDWIDTH
> > disabled...
> > 
> > Sorry for the trouble, will send an updated patch later.
> 
> While at it, another nit.
> 
> On 9/8/2025 4:35 PM, Aaron Lu wrote:
> > @@ -13151,10 +13161,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
> >  {
> >  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
> >  
> > -	if (cfs_rq_throttled(cfs_rq))
> > -		return;
> > -
> > -	if (!throttled_hierarchy(cfs_rq))
> > +	/*
> > +	 * If a task gets attached to this cfs_rq and before being queued,
> > +	 * it gets migrated to another CPU due to reasons like cpuset change,
> > +	 * we need to make sure this cfs_rq stays on leaf cfs_rq list to
> > +	 * have that removed load decayed or it can cause faireness problem.
> > +	 */
> > +	if(!cfs_rq_pelt_clock_throttled(cfs_rq))
> 
>           ^ Can you also add a space after the "if" here.
>

Yeah, I definitely should do that, thanks for catching this.

> >  		list_add_leaf_cfs_rq(cfs_rq);
>
Re: [PATCH] sched/fair: Propagate load for throttled cfs_rq
Posted by Aaron Lu 3 weeks, 2 days ago
On Tue, Sep 09, 2025 at 02:27:15PM +0800, Aaron Lu wrote:
> On Tue, Sep 09, 2025 at 11:52:55AM +0530, K Prateek Nayak wrote:
> > Hello Aaron,
> > 
> > On 9/9/2025 11:47 AM, Aaron Lu wrote:
> > >>>> kernel/sched/fair.c:6747:1: error: expected identifier or '(' before '+' token
> > >>     6747 | +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
> > >>          | ^
> > > 
> > > Sigh, I remembered I did a build test with !CFS_BANDWIDTH and now I went
> > > to check that build directory and noticed I didn't have CFS_BANDWIDTH
> > > disabled...
> > > 
> > > Sorry for the trouble, will send an updated patch later.
> > 
> > While at it, another nit.
> > 
> > On 9/8/2025 4:35 PM, Aaron Lu wrote:
> > > @@ -13151,10 +13161,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
> > >  {
> > >  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
> > >  
> > > -	if (cfs_rq_throttled(cfs_rq))
> > > -		return;
> > > -
> > > -	if (!throttled_hierarchy(cfs_rq))
> > > +	/*
> > > +	 * If a task gets attached to this cfs_rq and before being queued,
> > > +	 * it gets migrated to another CPU due to reasons like cpuset change,
> > > +	 * we need to make sure this cfs_rq stays on leaf cfs_rq list to
> > > +	 * have that removed load decayed or it can cause faireness problem.
> > > +	 */
> > > +	if(!cfs_rq_pelt_clock_throttled(cfs_rq))
> > 
> >           ^ Can you also add a space after the "if" here.
> >
> 
> Yeah, I definitely should do that, thanks for catching this.
> 
> > >  		list_add_leaf_cfs_rq(cfs_rq);
> > 

Just a note that the updated patch is sent here:
https://lore.kernel.org/lkml/20250910095044.278-2-ziqianlu@bytedance.com/
[tip: sched/core] sched/fair: Switch to task based throttle model
Posted by tip-bot2 for Valentin Schneider 1 month ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     e1fad12dcb66b7f35573c52b665830a1538f9886
Gitweb:        https://git.kernel.org/tip/e1fad12dcb66b7f35573c52b665830a1538f9886
Author:        Valentin Schneider <vschneid@redhat.com>
AuthorDate:    Fri, 29 Aug 2025 16:11:18 +08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 03 Sep 2025 10:03:14 +02:00

sched/fair: Switch to task based throttle model

In current throttle model, when a cfs_rq is throttled, its entity will
be dequeued from cpu's rq, making tasks attached to it not able to run,
thus achiveing the throttle target.

This has a drawback though: assume a task is a reader of percpu_rwsem
and is waiting. When it gets woken, it can not run till its task group's
next period comes, which can be a relatively long time. Waiting writer
will have to wait longer due to this and it also makes further reader
build up and eventually trigger task hung.

To improve this situation, change the throttle model to task based, i.e.
when a cfs_rq is throttled, record its throttled status but do not remove
it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
they get picked, add a task work to them so that when they return
to user, they can be dequeued there. In this way, tasks throttled will
not hold any kernel resources. And on unthrottle, enqueue back those
tasks so they can continue to run.

Throttled cfs_rq's PELT clock is handled differently now: previously the
cfs_rq's PELT clock is stopped once it entered throttled state but since
now tasks(in kernel mode) can continue to run, change the behaviour to
stop PELT clock when the throttled cfs_rq has no tasks left.

Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20250829081120.806-4-ziqianlu@bytedance.com
---
 kernel/sched/fair.c  | 341 +++++++++++++++++++++---------------------
 kernel/sched/pelt.h  |   4 +-
 kernel/sched/sched.h |   3 +-
 3 files changed, 181 insertions(+), 167 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dab4ed8..25b1014 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5291,18 +5291,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_queued == 1) {
 		check_enqueue_throttle(cfs_rq);
-		if (!throttled_hierarchy(cfs_rq)) {
-			list_add_leaf_cfs_rq(cfs_rq);
-		} else {
+		list_add_leaf_cfs_rq(cfs_rq);
 #ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
 			struct rq *rq = rq_of(cfs_rq);
 
 			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
 				cfs_rq->throttled_clock = rq_clock(rq);
 			if (!cfs_rq->throttled_clock_self)
 				cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+
+			if (cfs_rq->pelt_clock_throttled) {
+				cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+					cfs_rq->throttled_clock_pelt;
+				cfs_rq->pelt_clock_throttled = 0;
+			}
 		}
+#endif
 	}
 }
 
@@ -5341,8 +5346,6 @@ static void set_delayed(struct sched_entity *se)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		cfs_rq->h_nr_runnable--;
-		if (cfs_rq_throttled(cfs_rq))
-			break;
 	}
 }
 
@@ -5363,8 +5366,6 @@ static void clear_delayed(struct sched_entity *se)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		cfs_rq->h_nr_runnable++;
-		if (cfs_rq_throttled(cfs_rq))
-			break;
 	}
 }
 
@@ -5450,8 +5451,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (flags & DEQUEUE_DELAYED)
 		finish_delayed_dequeue_entity(se);
 
-	if (cfs_rq->nr_queued == 0)
+	if (cfs_rq->nr_queued == 0) {
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
+			struct rq *rq = rq_of(cfs_rq);
+
+			list_del_leaf_cfs_rq(cfs_rq);
+			cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+			cfs_rq->pelt_clock_throttled = 1;
+		}
+#endif
+	}
 
 	return true;
 }
@@ -5790,6 +5801,10 @@ static void throttle_cfs_rq_work(struct callback_head *work)
 		WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
 		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
 		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+		/*
+		 * Must not set throttled before dequeue or dequeue will
+		 * mistakenly regard this task as an already throttled one.
+		 */
 		p->throttled = true;
 		resched_curr(rq);
 	}
@@ -5803,32 +5818,124 @@ void init_cfs_throttle_work(struct task_struct *p)
 	INIT_LIST_HEAD(&p->throttle_node);
 }
 
+/*
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
+ */
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+	WARN_ON_ONCE(p->se.on_rq);
+	list_del_init(&p->throttle_node);
+
+	/* task blocked after throttled */
+	if (flags & DEQUEUE_SLEEP) {
+		p->throttled = false;
+		return;
+	}
+
+	/*
+	 * task is migrating off its old cfs_rq, detach
+	 * the task's load from its old cfs_rq.
+	 */
+	if (task_on_rq_migrating(p))
+		detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+	/* @p should have gone through dequeue_throttled_task() first */
+	WARN_ON_ONCE(!list_empty(&p->throttle_node));
+
+	/*
+	 * If the throttled task @p is enqueued to a throttled cfs_rq,
+	 * take the fast path by directly putting the task on the
+	 * target cfs_rq's limbo list.
+	 *
+	 * Do not do that when @p is current because the following race can
+	 * cause @p's group_node to be incorectly re-insterted in its rq's
+	 * cfs_tasks list, despite being throttled:
+	 *
+	 *     cpuX                       cpuY
+	 *   p ret2user
+	 *  throttle_cfs_rq_work()  sched_move_task(p)
+	 *  LOCK task_rq_lock
+	 *  dequeue_task_fair(p)
+	 *  UNLOCK task_rq_lock
+	 *                          LOCK task_rq_lock
+	 *                          task_current_donor(p) == true
+	 *                          task_on_rq_queued(p) == true
+	 *                          dequeue_task(p)
+	 *                          put_prev_task(p)
+	 *                          sched_change_group()
+	 *                          enqueue_task(p) -> p's new cfs_rq
+	 *                                             is throttled, go
+	 *                                             fast path and skip
+	 *                                             actual enqueue
+	 *                          set_next_task(p)
+	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
+	 *  schedule()
+	 *
+	 * In the above race case, @p current cfs_rq is in the same rq as
+	 * its previous cfs_rq because sched_move_task() only moves a task
+	 * to a different group from the same rq, so we can use its current
+	 * cfs_rq to derive rq and test if the task is current.
+	 */
+	if (throttled_hierarchy(cfs_rq) &&
+	    !task_current_donor(rq_of(cfs_rq), p)) {
+		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+		return true;
+	}
+
+	/* we can't take the fast path, do an actual enqueue*/
+	p->throttled = false;
+	return false;
+}
+
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
 static int tg_unthrottle_up(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+	struct task_struct *p, *tmp;
+
+	if (--cfs_rq->throttle_count)
+		return 0;
 
-	cfs_rq->throttle_count--;
-	if (!cfs_rq->throttle_count) {
+	if (cfs_rq->pelt_clock_throttled) {
 		cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
 					     cfs_rq->throttled_clock_pelt;
+		cfs_rq->pelt_clock_throttled = 0;
+	}
 
-		/* Add cfs_rq with load or one or more already running entities to the list */
-		if (!cfs_rq_is_decayed(cfs_rq))
-			list_add_leaf_cfs_rq(cfs_rq);
+	if (cfs_rq->throttled_clock_self) {
+		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
 
-		if (cfs_rq->throttled_clock_self) {
-			u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+		cfs_rq->throttled_clock_self = 0;
 
-			cfs_rq->throttled_clock_self = 0;
+		if (WARN_ON_ONCE((s64)delta < 0))
+			delta = 0;
 
-			if (WARN_ON_ONCE((s64)delta < 0))
-				delta = 0;
+		cfs_rq->throttled_clock_self_time += delta;
+	}
 
-			cfs_rq->throttled_clock_self_time += delta;
-		}
+	/* Re-enqueue the tasks that have been throttled at this level. */
+	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+		list_del_init(&p->throttle_node);
+		p->throttled = false;
+		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
 	}
 
+	/* Add cfs_rq with load or one or more already running entities to the list */
+	if (!cfs_rq_is_decayed(cfs_rq))
+		list_add_leaf_cfs_rq(cfs_rq);
+
 	return 0;
 }
 
@@ -5857,17 +5964,25 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
+	if (cfs_rq->throttle_count++)
+		return 0;
+
+
 	/* group is entering throttled state, stop time */
-	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+	WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+	if (cfs_rq->nr_queued)
+		cfs_rq->throttled_clock_self = rq_clock(rq);
+	else {
+		/*
+		 * For cfs_rqs that still have entities enqueued, PELT clock
+		 * stop happens at dequeue time when all entities are dequeued.
+		 */
 		list_del_leaf_cfs_rq(cfs_rq);
-
-		WARN_ON_ONCE(cfs_rq->throttled_clock_self);
-		if (cfs_rq->nr_queued)
-			cfs_rq->throttled_clock_self = rq_clock(rq);
+		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+		cfs_rq->pelt_clock_throttled = 1;
 	}
-	cfs_rq->throttle_count++;
 
+	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
 	return 0;
 }
 
@@ -5875,8 +5990,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct sched_entity *se;
-	long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+	int dequeue = 1;
 
 	raw_spin_lock(&cfs_b->lock);
 	/* This will start the period timer if necessary */
@@ -5899,68 +6013,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!dequeue)
 		return false;  /* Throttle no longer required. */
 
-	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
 	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();
 
-	queued_delta = cfs_rq->h_nr_queued;
-	runnable_delta = cfs_rq->h_nr_runnable;
-	idle_delta = cfs_rq->h_nr_idle;
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		int flags;
-
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		/*
-		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
-		 * This avoids teaching dequeue_entities() about throttled
-		 * entities and keeps things relatively simple.
-		 */
-		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
-		if (se->sched_delayed)
-			flags |= DEQUEUE_DELAYED;
-		dequeue_entity(qcfs_rq, se, flags);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-
-		if (qcfs_rq->load.weight) {
-			/* Avoid re-evaluating load for this entity: */
-			se = parent_entity(se);
-			break;
-		}
-	}
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		update_load_avg(qcfs_rq, se, 0);
-		se_update_runnable(se);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-	}
-
-	/* At this point se is NULL and we are at root level*/
-	sub_nr_running(rq, queued_delta);
-done:
 	/*
 	 * Note: distribution will already see us throttled via the
 	 * throttled-list.  rq->lock protects completion.
@@ -5976,9 +6033,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct sched_entity *se;
-	long queued_delta, runnable_delta, idle_delta;
-	long rq_h_nr_queued = rq->cfs.h_nr_queued;
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+	/*
+	 * It's possible we are called with !runtime_remaining due to things
+	 * like user changed quota setting(see tg_set_cfs_bandwidth()) or async
+	 * unthrottled us with a positive runtime_remaining but other still
+	 * running entities consumed those runtime before we reached here.
+	 *
+	 * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
+	 * because any enqueue in tg_unthrottle_up() will immediately trigger a
+	 * throttle, which is not supposed to happen on unthrottle path.
+	 */
+	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+		return;
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -6008,62 +6076,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 			if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
 				break;
 		}
-		goto unthrottle_throttle;
-	}
-
-	queued_delta = cfs_rq->h_nr_queued;
-	runnable_delta = cfs_rq->h_nr_runnable;
-	idle_delta = cfs_rq->h_nr_idle;
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
-		/* Handle any unfinished DELAY_DEQUEUE business first. */
-		if (se->sched_delayed) {
-			int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
-			dequeue_entity(qcfs_rq, se, flags);
-		} else if (se->on_rq)
-			break;
-		enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued += queued_delta;
-		qcfs_rq->h_nr_runnable += runnable_delta;
-		qcfs_rq->h_nr_idle += idle_delta;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(qcfs_rq))
-			goto unthrottle_throttle;
 	}
 
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
-		update_load_avg(qcfs_rq, se, UPDATE_TG);
-		se_update_runnable(se);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued += queued_delta;
-		qcfs_rq->h_nr_runnable += runnable_delta;
-		qcfs_rq->h_nr_idle += idle_delta;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(qcfs_rq))
-			goto unthrottle_throttle;
-	}
-
-	/* Start the fair server if un-throttling resulted in new runnable tasks */
-	if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
-		dl_server_start(&rq->fair_server);
-
-	/* At this point se is NULL and we are at root level*/
-	add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
 	assert_list_leaf_cfs_rq(rq);
 
 	/* Determine whether we need to wake up potentially idle CPU: */
@@ -6717,6 +6731,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void task_throttle_setup_work(struct task_struct *p) {}
 static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
@@ -6909,6 +6925,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	u64 slice = 0;
 
+	if (task_is_throttled(p) && enqueue_throttled_task(p))
+		return;
+
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
 	 * the cfs_rq utilization to select a frequency.
@@ -6961,10 +6980,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
 
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			goto enqueue_throttle;
-
 		flags = ENQUEUE_WAKEUP;
 	}
 
@@ -6986,10 +7001,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			goto enqueue_throttle;
 	}
 
 	if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -7019,7 +7030,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!task_new)
 		check_update_overutilized_status(rq);
 
-enqueue_throttle:
 	assert_list_leaf_cfs_rq(rq);
 
 	hrtick_update(rq);
@@ -7074,10 +7084,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
 
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			return 0;
-
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			slice = cfs_rq_min_slice(cfs_rq);
@@ -7114,10 +7120,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			return 0;
 	}
 
 	sub_nr_running(rq, h_nr_queued);
@@ -7151,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
  */
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
+	if (task_is_throttled(p)) {
+		dequeue_throttled_task(p, flags);
+		return true;
+	}
+
 	if (!p->se.sched_delayed)
 		util_est_dequeue(&rq->cfs, p);
 
@@ -8819,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
+	struct task_struct *p;
+	bool throttled;
 
 again:
 	cfs_rq = &rq->cfs;
 	if (!cfs_rq->nr_queued)
 		return NULL;
 
+	throttled = false;
+
 	do {
 		/* Might not have done put_prev_entity() */
 		if (cfs_rq->curr && cfs_rq->curr->on_rq)
 			update_curr(cfs_rq);
 
-		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
-			goto again;
+		throttled |= check_cfs_rq_runtime(cfs_rq);
 
 		se = pick_next_entity(rq, cfs_rq);
 		if (!se)
@@ -8839,7 +8849,10 @@ again:
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
-	return task_of(se);
+	p = task_of(se);
+	if (unlikely(throttled))
+		task_throttle_setup_work(p);
+	return p;
 }
 
 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa5..f921302 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
 	u64 throttled;
 
-	if (unlikely(cfs_rq->throttle_count))
+	if (unlikely(cfs_rq->pelt_clock_throttled))
 		throttled = U64_MAX;
 	else
 		throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
-	if (unlikely(cfs_rq->throttle_count))
+	if (unlikely(cfs_rq->pelt_clock_throttled))
 		return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
 
 	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a6493d2..6e1b37b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -735,7 +735,8 @@ struct cfs_rq {
 	u64			throttled_clock_pelt_time;
 	u64			throttled_clock_self;
 	u64			throttled_clock_self_time;
-	int			throttled;
+	bool			throttled:1;
+	bool			pelt_clock_throttled:1;
 	int			throttle_count;
 	struct list_head	throttled_list;
 	struct list_head	throttled_csd_list;