[PATCH v3 3/5] sched/fair: Switch to task based throttle model

Aaron Lu posted 5 patches 2 months, 3 weeks ago
There is a newer version of this series
[PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 2 months, 3 weeks ago
From: Valentin Schneider <vschneid@redhat.com>

In the current throttle model, when a cfs_rq is throttled, its entity is
dequeued from the cpu's rq, making tasks attached to it unable to run,
thus achieving the throttle target.

This has a drawback though: assume a task is a reader of percpu_rwsem
and is waiting. When it gets woken, it cannot run until its task group's
next period comes, which can be a relatively long time. The waiting
writer has to wait longer because of this, and it also makes further
readers pile up, eventually triggering hung task reports.

To improve this situation, change the throttle model to a task-based
one, i.e. when a cfs_rq is throttled, record its throttled status but do
not remove it from the cpu's rq. Instead, for tasks that belong to this
cfs_rq, when they get picked, add a task work to them so that they get
dequeued when they return to user space. This way, throttled tasks do
not hold any kernel resources. On unthrottle, enqueue those tasks back
so they can continue to run.

A throttled cfs_rq's PELT clock is handled differently now: previously
the cfs_rq's PELT clock was stopped once it entered the throttled state,
but since tasks (in kernel mode) can now continue to run, change the
behaviour to stop the PELT clock only when the throttled cfs_rq has no
tasks left.
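
The overall flow, roughly (an illustrative sketch of the changes below,
not literal code):

  pick_task_fair():
      throttled |= check_cfs_rq_runtime(cfs_rq);  /* cfs_rq stays queued */
      ...
      if (throttled)
          task_throttle_setup_work(p);    /* arm return-to-user task work */

  throttle_cfs_rq_work():                 /* runs when p returns to user */
      dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
      list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);

  tg_unthrottle_up():                     /* on unthrottle */
      list_for_each_entry_safe(p, ..., &cfs_rq->throttled_limbo_list, ...)
          enqueue_task_fair(rq, p, ENQUEUE_WAKEUP);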

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
 kernel/sched/fair.c  | 336 ++++++++++++++++++++++---------------------
 kernel/sched/pelt.h  |   4 +-
 kernel/sched/sched.h |   3 +-
 3 files changed, 176 insertions(+), 167 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54c2a4df6a5d1..0eeea7f2e693d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5285,18 +5285,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_queued == 1) {
 		check_enqueue_throttle(cfs_rq);
-		if (!throttled_hierarchy(cfs_rq)) {
-			list_add_leaf_cfs_rq(cfs_rq);
-		} else {
+		list_add_leaf_cfs_rq(cfs_rq);
 #ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
 			struct rq *rq = rq_of(cfs_rq);
 
 			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
 				cfs_rq->throttled_clock = rq_clock(rq);
 			if (!cfs_rq->throttled_clock_self)
 				cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+
+			if (cfs_rq->pelt_clock_throttled) {
+				cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+					cfs_rq->throttled_clock_pelt;
+				cfs_rq->pelt_clock_throttled = 0;
+			}
 		}
+#endif
 	}
 }
 
@@ -5335,8 +5340,6 @@ static void set_delayed(struct sched_entity *se)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		cfs_rq->h_nr_runnable--;
-		if (cfs_rq_throttled(cfs_rq))
-			break;
 	}
 }
 
@@ -5357,8 +5360,6 @@ static void clear_delayed(struct sched_entity *se)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		cfs_rq->h_nr_runnable++;
-		if (cfs_rq_throttled(cfs_rq))
-			break;
 	}
 }
 
@@ -5444,8 +5445,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (flags & DEQUEUE_DELAYED)
 		finish_delayed_dequeue_entity(se);
 
-	if (cfs_rq->nr_queued == 0)
+	if (cfs_rq->nr_queued == 0) {
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
+			struct rq *rq = rq_of(cfs_rq);
+
+			list_del_leaf_cfs_rq(cfs_rq);
+			cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+			cfs_rq->pelt_clock_throttled = 1;
+		}
+#endif
+	}
 
 	return true;
 }
@@ -5784,6 +5795,10 @@ static void throttle_cfs_rq_work(struct callback_head *work)
 		WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
 		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
 		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+		/*
+		 * Must not set throttled before dequeue or dequeue will
+		 * mistakenly regard this task as an already throttled one.
+		 */
 		p->throttled = true;
 		resched_curr(rq);
 	}
@@ -5797,32 +5812,119 @@ void init_cfs_throttle_work(struct task_struct *p)
 	INIT_LIST_HEAD(&p->throttle_node);
 }
 
+/*
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
+ */
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+	WARN_ON_ONCE(p->se.on_rq);
+	list_del_init(&p->throttle_node);
+
+	/* task blocked after throttled */
+	if (flags & DEQUEUE_SLEEP) {
+		p->throttled = false;
+		return;
+	}
+
+	/*
+	 * task is migrating off its old cfs_rq, detach
+	 * the task's load from its old cfs_rq.
+	 */
+	if (task_on_rq_migrating(p))
+		detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+	/*
+	 * If the throttled task is enqueued to a throttled cfs_rq,
+	 * take the fast path by directly put the task on target
+	 * cfs_rq's limbo list, except when p is current because
+	 * the following race can cause p's group_node left in rq's
+	 * cfs_tasks list when it's throttled:
+	 *
+	 *        cpuX                       cpuY
+	 *   taskA ret2user
+	 *  throttle_cfs_rq_work()    sched_move_task(taskA)
+	 *  task_rq_lock acquired
+	 *  dequeue_task_fair(taskA)
+	 *  task_rq_lock released
+	 *                            task_rq_lock acquired
+	 *                            task_current_donor(taskA) == true
+	 *                            task_on_rq_queued(taskA) == true
+	 *                            dequeue_task(taskA)
+	 *                            put_prev_task(taskA)
+	 *                            sched_change_group()
+	 *                            enqueue_task(taskA) -> taskA's new cfs_rq
+	 *                                                   is throttled, go
+	 *                                                   fast path and skip
+	 *                                                   actual enqueue
+	 *                            set_next_task(taskA)
+	 *                          __set_next_task_fair(taskA)
+	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
+	 *  schedule()
+	 *
+	 * And in the above race case, the task's current cfs_rq is in the same
+	 * rq as its previous cfs_rq because sched_move_task() doesn't migrate
+	 * task so we can use its current cfs_rq to derive rq and test if the
+	 * task is current.
+	 */
+	if (throttled_hierarchy(cfs_rq) &&
+	    !task_current_donor(rq_of(cfs_rq), p)) {
+		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+		return true;
+	}
+
+	/* we can't take the fast path, do an actual enqueue*/
+	p->throttled = false;
+	return false;
+}
+
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
 static int tg_unthrottle_up(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+	struct task_struct *p, *tmp;
+
+	if (--cfs_rq->throttle_count)
+		return 0;
 
-	cfs_rq->throttle_count--;
-	if (!cfs_rq->throttle_count) {
+	if (cfs_rq->pelt_clock_throttled) {
 		cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
 					     cfs_rq->throttled_clock_pelt;
+		cfs_rq->pelt_clock_throttled = 0;
+	}
 
-		/* Add cfs_rq with load or one or more already running entities to the list */
-		if (!cfs_rq_is_decayed(cfs_rq))
-			list_add_leaf_cfs_rq(cfs_rq);
+	if (cfs_rq->throttled_clock_self) {
+		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
 
-		if (cfs_rq->throttled_clock_self) {
-			u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+		cfs_rq->throttled_clock_self = 0;
 
-			cfs_rq->throttled_clock_self = 0;
+		if (WARN_ON_ONCE((s64)delta < 0))
+			delta = 0;
 
-			if (WARN_ON_ONCE((s64)delta < 0))
-				delta = 0;
+		cfs_rq->throttled_clock_self_time += delta;
+	}
 
-			cfs_rq->throttled_clock_self_time += delta;
-		}
+	/* Re-enqueue the tasks that have been throttled at this level. */
+	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+		list_del_init(&p->throttle_node);
+		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
 	}
 
+	/* Add cfs_rq with load or one or more already running entities to the list */
+	if (!cfs_rq_is_decayed(cfs_rq))
+		list_add_leaf_cfs_rq(cfs_rq);
+
 	return 0;
 }
 
@@ -5851,17 +5953,25 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
+	if (cfs_rq->throttle_count++)
+		return 0;
+
+
 	/* group is entering throttled state, stop time */
-	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+	WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+	if (cfs_rq->nr_queued)
+		cfs_rq->throttled_clock_self = rq_clock(rq);
+	else {
+		/*
+		 * For cfs_rqs that still have entities enqueued, PELT clock
+		 * stop happens at dequeue time when all entities are dequeued.
+		 */
 		list_del_leaf_cfs_rq(cfs_rq);
-
-		WARN_ON_ONCE(cfs_rq->throttled_clock_self);
-		if (cfs_rq->nr_queued)
-			cfs_rq->throttled_clock_self = rq_clock(rq);
+		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+		cfs_rq->pelt_clock_throttled = 1;
 	}
-	cfs_rq->throttle_count++;
 
+	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
 	return 0;
 }
 
@@ -5869,8 +5979,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct sched_entity *se;
-	long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+	int dequeue = 1;
 
 	raw_spin_lock(&cfs_b->lock);
 	/* This will start the period timer if necessary */
@@ -5893,68 +6002,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!dequeue)
 		return false;  /* Throttle no longer required. */
 
-	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
 	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();
 
-	queued_delta = cfs_rq->h_nr_queued;
-	runnable_delta = cfs_rq->h_nr_runnable;
-	idle_delta = cfs_rq->h_nr_idle;
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		int flags;
-
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		/*
-		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
-		 * This avoids teaching dequeue_entities() about throttled
-		 * entities and keeps things relatively simple.
-		 */
-		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
-		if (se->sched_delayed)
-			flags |= DEQUEUE_DELAYED;
-		dequeue_entity(qcfs_rq, se, flags);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-
-		if (qcfs_rq->load.weight) {
-			/* Avoid re-evaluating load for this entity: */
-			se = parent_entity(se);
-			break;
-		}
-	}
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		update_load_avg(qcfs_rq, se, 0);
-		se_update_runnable(se);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-	}
-
-	/* At this point se is NULL and we are at root level*/
-	sub_nr_running(rq, queued_delta);
-done:
 	/*
 	 * Note: distribution will already see us throttled via the
 	 * throttled-list.  rq->lock protects completion.
@@ -5970,9 +6022,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct sched_entity *se;
-	long queued_delta, runnable_delta, idle_delta;
-	long rq_h_nr_queued = rq->cfs.h_nr_queued;
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+	/*
+	 * It's possible we are called with !runtime_remaining due to things
+	 * like user changed quota setting(see tg_set_cfs_bandwidth()) or async
+	 * unthrottled us with a positive runtime_remaining but other still
+	 * running entities consumed those runtime before we reached here.
+	 *
+	 * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
+	 * because any enqueue in tg_unthrottle_up() will immediately trigger a
+	 * throttle, which is not supposed to happen on unthrottle path.
+	 */
+	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+		return;
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -6002,62 +6065,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 			if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
 				break;
 		}
-		goto unthrottle_throttle;
 	}
 
-	queued_delta = cfs_rq->h_nr_queued;
-	runnable_delta = cfs_rq->h_nr_runnable;
-	idle_delta = cfs_rq->h_nr_idle;
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
-		/* Handle any unfinished DELAY_DEQUEUE business first. */
-		if (se->sched_delayed) {
-			int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
-			dequeue_entity(qcfs_rq, se, flags);
-		} else if (se->on_rq)
-			break;
-		enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued += queued_delta;
-		qcfs_rq->h_nr_runnable += runnable_delta;
-		qcfs_rq->h_nr_idle += idle_delta;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(qcfs_rq))
-			goto unthrottle_throttle;
-	}
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
-		update_load_avg(qcfs_rq, se, UPDATE_TG);
-		se_update_runnable(se);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued += queued_delta;
-		qcfs_rq->h_nr_runnable += runnable_delta;
-		qcfs_rq->h_nr_idle += idle_delta;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(qcfs_rq))
-			goto unthrottle_throttle;
-	}
-
-	/* Start the fair server if un-throttling resulted in new runnable tasks */
-	if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
-		dl_server_start(&rq->fair_server);
-
-	/* At this point se is NULL and we are at root level*/
-	add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
 	assert_list_leaf_cfs_rq(rq);
 
 	/* Determine whether we need to wake up potentially idle CPU: */
@@ -6711,6 +6720,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void task_throttle_setup_work(struct task_struct *p) {}
 static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
@@ -6903,6 +6914,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	u64 slice = 0;
 
+	if (unlikely(task_is_throttled(p) && enqueue_throttled_task(p)))
+		return;
+
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
 	 * the cfs_rq utilization to select a frequency.
@@ -6955,10 +6969,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
 
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			goto enqueue_throttle;
-
 		flags = ENQUEUE_WAKEUP;
 	}
 
@@ -6980,10 +6990,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			goto enqueue_throttle;
 	}
 
 	if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -7013,7 +7019,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!task_new)
 		check_update_overutilized_status(rq);
 
-enqueue_throttle:
 	assert_list_leaf_cfs_rq(rq);
 
 	hrtick_update(rq);
@@ -7068,10 +7073,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
 
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			return 0;
-
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			slice = cfs_rq_min_slice(cfs_rq);
@@ -7108,10 +7109,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			return 0;
 	}
 
 	sub_nr_running(rq, h_nr_queued);
@@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
  */
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
+	if (unlikely(task_is_throttled(p))) {
+		dequeue_throttled_task(p, flags);
+		return true;
+	}
+
 	if (!p->se.sched_delayed)
 		util_est_dequeue(&rq->cfs, p);
 
@@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
+	struct task_struct *p;
+	bool throttled;
 
 again:
 	cfs_rq = &rq->cfs;
 	if (!cfs_rq->nr_queued)
 		return NULL;
 
+	throttled = false;
+
 	do {
 		/* Might not have done put_prev_entity() */
 		if (cfs_rq->curr && cfs_rq->curr->on_rq)
 			update_curr(cfs_rq);
 
-		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
-			goto again;
+		throttled |= check_cfs_rq_runtime(cfs_rq);
 
 		se = pick_next_entity(rq, cfs_rq);
 		if (!se)
@@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
-	return task_of(se);
+	p = task_of(se);
+	if (unlikely(throttled))
+		task_throttle_setup_work(p);
+	return p;
 }
 
 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa543c0f2..f921302dc40fb 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
 	u64 throttled;
 
-	if (unlikely(cfs_rq->throttle_count))
+	if (unlikely(cfs_rq->pelt_clock_throttled))
 		throttled = U64_MAX;
 	else
 		throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
-	if (unlikely(cfs_rq->throttle_count))
+	if (unlikely(cfs_rq->pelt_clock_throttled))
 		return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
 
 	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b0c9559992d8a..fc697d4bf6685 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -737,7 +737,8 @@ struct cfs_rq {
 	u64			throttled_clock_pelt_time;
 	u64			throttled_clock_self;
 	u64			throttled_clock_self_time;
-	int			throttled;
+	int			throttled:1;
+	int			pelt_clock_throttled:1;
 	int			throttle_count;
 	struct list_head	throttled_list;
 	struct list_head	throttled_csd_list;
-- 
2.39.5
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Chen, Yu C 1 month, 2 weeks ago
On 7/15/2025 3:16 PM, Aaron Lu wrote:
> From: Valentin Schneider <vschneid@redhat.com>
> 
> In current throttle model, when a cfs_rq is throttled, its entity will
> be dequeued from cpu's rq, making tasks attached to it not able to run,
> thus achiveing the throttle target.
> 
> This has a drawback though: assume a task is a reader of percpu_rwsem
> and is waiting. When it gets woken, it can not run till its task group's
> next period comes, which can be a relatively long time. Waiting writer
> will have to wait longer due to this and it also makes further reader
> build up and eventually trigger task hung.
> 
> To improve this situation, change the throttle model to task based, i.e.
> when a cfs_rq is throttled, record its throttled status but do not remove
> it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
> they get picked, add a task work to them so that when they return
> to user, they can be dequeued there. In this way, tasks throttled will
> not hold any kernel resources. And on unthrottle, enqueue back those
> tasks so they can continue to run.
> 
> Throttled cfs_rq's PELT clock is handled differently now: previously the
> cfs_rq's PELT clock is stopped once it entered throttled state but since
> now tasks(in kernel mode) can continue to run, change the behaviour to
> stop PELT clock only when the throttled cfs_rq has no tasks left.
> 
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
> Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> ---

[snip]


> @@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
>   {
>   	struct sched_entity *se;
>   	struct cfs_rq *cfs_rq;
> +	struct task_struct *p;
> +	bool throttled;
>   
>   again:
>   	cfs_rq = &rq->cfs;
>   	if (!cfs_rq->nr_queued)
>   		return NULL;
>   
> +	throttled = false;
> +
>   	do {
>   		/* Might not have done put_prev_entity() */
>   		if (cfs_rq->curr && cfs_rq->curr->on_rq)
>   			update_curr(cfs_rq);
>   
> -		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
> -			goto again;
> +		throttled |= check_cfs_rq_runtime(cfs_rq);
>   
>   		se = pick_next_entity(rq, cfs_rq);
>   		if (!se)
> @@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
>   		cfs_rq = group_cfs_rq(se);
>   	} while (cfs_rq);
>   
> -	return task_of(se);
> +	p = task_of(se);
> +	if (unlikely(throttled))
> +		task_throttle_setup_work(p);
> +	return p;
>   }
>   

Previously, I was wondering if the above change might impact
wakeup latency in some corner cases: If there are many tasks
enqueued on a throttled cfs_rq, the above pick-up mechanism
might return an invalid p repeatedly (where p is dequeued,
and a reschedule is triggered in throttle_cfs_rq_work() to
pick the next p; then the new p is found again on a throttled
cfs_rq). Before the above change, the entire cfs_rq's corresponding
sched_entity was dequeued in throttle_cfs_rq(): se = cfs_rq->tg->se(cpu)

So I did some tests for this scenario on a Xeon with 6 NUMA nodes and
384 CPUs. I created 10 levels of cgroups and ran schbench on the leaf
cgroup. The results show that there is not much impact in terms of
wakeup latency (considering the standard deviation). Based on the data
and my understanding, for this series,

Tested-by: Chen Yu <yu.c.chen@intel.com>


The test script parameters are borrowed from the previously attached ones:
#!/bin/bash

if [ $# -ne 1 ]; then
         echo "please provide cgroup level"
         exit
fi

N=$1
current_path="/sys/fs/cgroup"

for ((i=1; i<=N; i++)); do
         new_dir="${current_path}/${i}"
         mkdir -p "$new_dir"
         if [ "$i" -ne "$N" ]; then
                 echo '+cpu +memory +pids' > ${new_dir}/cgroup.subtree_control
         fi
         current_path="$new_dir"
done

echo "current_path:${current_path}"
echo "1600000 100000" > ${current_path}/cpu.max
echo "34G" > ${current_path}/memory.max

echo $$ > ${current_path}/cgroup.procs
#./run-mmtests.sh --no-monitor --config config-schbench baseline
./run-mmtests.sh --no-monitor --config config-schbench sch_throt


pids=$(cat "${current_path}/cgroup.procs")
for pid in $pids; do
         echo $pid > "/sys/fs/cgroup/cgroup.procs" 2>/dev/null
done
for ((i=N; i>=1; i--)); do
         rmdir ${current_path}
         current_path=$(dirname "$current_path")
done


Results:

schbench thread = 1
Metric                         Base (mean±std)      Compare (mean±std)   Change
--------------------------------------------------------------------------------
//the baseline's std% is 35%, the change should not be a problem
Wakeup Latencies 99.0th        15.00(5.29)          17.00(1.00)          -13.33%
Request Latencies 99.0th       3830.67(33.31)       3854.67(25.72)       -0.63%
RPS 50.0th                     1598.00(4.00)        1606.00(4.00)        +0.50%
Average RPS                    1597.77(5.13)        1606.11(4.75)        +0.52%

schbench thread = 2
Metric                         Base (mean±std)      Compare (mean±std)   Change
--------------------------------------------------------------------------------
Wakeup Latencies 99.0th        18.33(0.58)          18.67(0.58)          -1.85%
Request Latencies 99.0th       3868.00(49.96)       3854.67(44.06)       +0.34%
RPS 50.0th                     3185.33(4.62)        3204.00(8.00)        +0.59%
Average RPS                    3186.49(2.70)        3204.21(11.25)       +0.56%

schbench thread = 4
Metric                         Base (mean±std)      Compare (mean±std)   Change
--------------------------------------------------------------------------------
Wakeup Latencies 99.0th        19.33(1.15)          19.33(0.58)          0.00%
Request Latencies 99.0th       35690.67(517.31)     35946.67(517.31)     -0.72%
RPS 50.0th                     4418.67(18.48)       4434.67(9.24)        +0.36%
Average RPS                    4414.38(16.94)       4436.02(8.77)        +0.49%

schbench thread = 8
Metric                         Base (mean±std)      Compare (mean±std)   Change
--------------------------------------------------------------------------------
Wakeup Latencies 99.0th        22.67(0.58)          22.33(0.58)          +1.50%
Request Latencies 99.0th       73002.67(147.80)     72661.33(147.80)     +0.47%
RPS 50.0th                     4376.00(16.00)       4392.00(0.00)        +0.37%
Average RPS                    4373.89(15.04)       4393.88(6.22)        +0.46%

schbench thread = 16
Metric                         Base (mean±std)      Compare (mean±std)   Change
--------------------------------------------------------------------------------
Wakeup Latencies 99.0th        29.00(2.65)          29.00(3.61)          0.00%
Request Latencies 99.0th       88704.00(0.00)       88704.00(0.00)       0.00%
RPS 50.0th                     4274.67(24.44)       4290.67(9.24)        +0.37%
Average RPS                    4277.27(24.80)       4287.97(9.80)        +0.25%

schbench thread = 32
Metric                         Base (mean±std)      Compare (mean±std)   Change
--------------------------------------------------------------------------------
Wakeup Latencies 99.0th        100.00(22.61)        82.00(16.46)         +18.00%
Request Latencies 99.0th       100138.67(295.60)    100053.33(147.80)    +0.09%
RPS 50.0th                     3942.67(20.13)       3916.00(42.33)       -0.68%
Average RPS                    3919.39(19.01)       3892.39(42.26)       -0.69%

schbench thread = 63
Metric                         Base (mean±std)      Compare (mean±std)   Change
--------------------------------------------------------------------------------
Wakeup Latencies 99.0th        94848.00(0.00)       94336.00(0.00)       +0.54%
//the baseline's std% is 19%, the change should not be a problem
Request Latencies 99.0th       264618.67(51582.78)  298154.67(591.21)    -12.67%
RPS 50.0th                     2641.33(4.62)        2628.00(8.00)        -0.50%
Average RPS                    2659.49(8.88)        2636.17(7.58)        -0.88%

thanks,
Chenyu


Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month, 2 weeks ago
On Sun, Aug 17, 2025 at 04:50:50PM +0800, Chen, Yu C wrote:
> On 7/15/2025 3:16 PM, Aaron Lu wrote:
> > From: Valentin Schneider <vschneid@redhat.com>
> > 
> > In current throttle model, when a cfs_rq is throttled, its entity will
> > be dequeued from cpu's rq, making tasks attached to it not able to run,
> > thus achiveing the throttle target.
> > 
> > This has a drawback though: assume a task is a reader of percpu_rwsem
> > and is waiting. When it gets woken, it can not run till its task group's
> > next period comes, which can be a relatively long time. Waiting writer
> > will have to wait longer due to this and it also makes further reader
> > build up and eventually trigger task hung.
> > 
> > To improve this situation, change the throttle model to task based, i.e.
> > when a cfs_rq is throttled, record its throttled status but do not remove
> > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
> > they get picked, add a task work to them so that when they return
> > to user, they can be dequeued there. In this way, tasks throttled will
> > not hold any kernel resources. And on unthrottle, enqueue back those
> > tasks so they can continue to run.
> > 
> > Throttled cfs_rq's PELT clock is handled differently now: previously the
> > cfs_rq's PELT clock is stopped once it entered throttled state but since
> > now tasks(in kernel mode) can continue to run, change the behaviour to
> > stop PELT clock only when the throttled cfs_rq has no tasks left.
> > 
> > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> > Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
> > Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> > ---
> 
> [snip]
> 
> 
> > @@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> >   {
> >   	struct sched_entity *se;
> >   	struct cfs_rq *cfs_rq;
> > +	struct task_struct *p;
> > +	bool throttled;
> >   again:
> >   	cfs_rq = &rq->cfs;
> >   	if (!cfs_rq->nr_queued)
> >   		return NULL;
> > +	throttled = false;
> > +
> >   	do {
> >   		/* Might not have done put_prev_entity() */
> >   		if (cfs_rq->curr && cfs_rq->curr->on_rq)
> >   			update_curr(cfs_rq);
> > -		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
> > -			goto again;
> > +		throttled |= check_cfs_rq_runtime(cfs_rq);
> >   		se = pick_next_entity(rq, cfs_rq);
> >   		if (!se)
> > @@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> >   		cfs_rq = group_cfs_rq(se);
> >   	} while (cfs_rq);
> > -	return task_of(se);
> > +	p = task_of(se);
> > +	if (unlikely(throttled))
> > +		task_throttle_setup_work(p);
> > +	return p;
> >   }
> 
> Previously, I was wondering if the above change might impact
> wakeup latency in some corner cases: If there are many tasks
> enqueued on a throttled cfs_rq, the above pick-up mechanism
> might return an invalid p repeatedly (where p is dequeued,

By invalid, do you mean a task that is in a throttled hierarchy?

> and a reschedule is triggered in throttle_cfs_rq_work() to
> pick the next p; then the new p is found again on a throttled
> cfs_rq). Before the above change, the entire cfs_rq's corresponding
> sched_entity was dequeued in throttle_cfs_rq(): se = cfs_rq->tg->se(cpu)
> 

Yes this is true and it sounds inefficient, but these newly woken tasks
may hold some kernel resources like a reader lock so we really want them
to finish their kernel jobs and release that resource before being
throttled or it can block/impact other tasks and even cause the whole
system to hang.

> So I did some tests for this scenario on a Xeon with 6 NUMA nodes and
> 384 CPUs. I created 10 levels of cgroups and ran schbench on the leaf
> cgroup. The results show that there is not much impact in terms of
> wakeup latency (considering the standard deviation). Based on the data
> and my understanding, for this series,
> 
> Tested-by: Chen Yu <yu.c.chen@intel.com>

Good to know this and thanks a lot for the test!
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month, 2 weeks ago
On Mon, Aug 18, 2025 at 10:50:14AM +0800, Aaron Lu wrote:
> On Sun, Aug 17, 2025 at 04:50:50PM +0800, Chen, Yu C wrote:
> > On 7/15/2025 3:16 PM, Aaron Lu wrote:
> > > From: Valentin Schneider <vschneid@redhat.com>
> > > 
> > > In current throttle model, when a cfs_rq is throttled, its entity will
> > > be dequeued from cpu's rq, making tasks attached to it not able to run,
> > > thus achiveing the throttle target.
> > > 
> > > This has a drawback though: assume a task is a reader of percpu_rwsem
> > > and is waiting. When it gets woken, it can not run till its task group's
> > > next period comes, which can be a relatively long time. Waiting writer
> > > will have to wait longer due to this and it also makes further reader
> > > build up and eventually trigger task hung.
> > > 
> > > To improve this situation, change the throttle model to task based, i.e.
> > > when a cfs_rq is throttled, record its throttled status but do not remove
> > > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
> > > they get picked, add a task work to them so that when they return
> > > to user, they can be dequeued there. In this way, tasks throttled will
> > > not hold any kernel resources. And on unthrottle, enqueue back those
> > > tasks so they can continue to run.
> > > 
> > > Throttled cfs_rq's PELT clock is handled differently now: previously the
> > > cfs_rq's PELT clock is stopped once it entered throttled state but since
> > > now tasks(in kernel mode) can continue to run, change the behaviour to
> > > stop PELT clock only when the throttled cfs_rq has no tasks left.
> > > 
> > > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> > > Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
> > > Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> > > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> > > ---
> > 
> > [snip]
> > 
> > 
> > > @@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > >   {
> > >   	struct sched_entity *se;
> > >   	struct cfs_rq *cfs_rq;
> > > +	struct task_struct *p;
> > > +	bool throttled;
> > >   again:
> > >   	cfs_rq = &rq->cfs;
> > >   	if (!cfs_rq->nr_queued)
> > >   		return NULL;
> > > +	throttled = false;
> > > +
> > >   	do {
> > >   		/* Might not have done put_prev_entity() */
> > >   		if (cfs_rq->curr && cfs_rq->curr->on_rq)
> > >   			update_curr(cfs_rq);
> > > -		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
> > > -			goto again;
> > > +		throttled |= check_cfs_rq_runtime(cfs_rq);
> > >   		se = pick_next_entity(rq, cfs_rq);
> > >   		if (!se)
> > > @@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
> > >   		cfs_rq = group_cfs_rq(se);
> > >   	} while (cfs_rq);
> > > -	return task_of(se);
> > > +	p = task_of(se);
> > > +	if (unlikely(throttled))
> > > +		task_throttle_setup_work(p);
> > > +	return p;
> > >   }
> > 
> > Previously, I was wondering if the above change might impact
> > wakeup latency in some corner cases: If there are many tasks
> > enqueued on a throttled cfs_rq, the above pick-up mechanism
> > might return an invalid p repeatedly (where p is dequeued,
> 
> By invalid, do you mean task that is in a throttled hierarchy?
> 
> > and a reschedule is triggered in throttle_cfs_rq_work() to
> > pick the next p; then the new p is found again on a throttled
> > cfs_rq). Before the above change, the entire cfs_rq's corresponding
> > sched_entity was dequeued in throttle_cfs_rq(): se = cfs_rq->tg->se(cpu)
> > 
> 
> Yes this is true and it sounds inefficient, but these newly woken tasks
> may hold some kernel resources like a reader lock so we really want them
                                               ~~~~

Sorry, I meant reader semaphore.

> to finish their kernel jobs and release that resource before being
> throttled or it can block/impact other tasks and even cause the whole
> system to hung.
> 
> > So I did some tests for this scenario on a Xeon with 6 NUMA nodes and
> > 384 CPUs. I created 10 levels of cgroups and ran schbench on the leaf
> > cgroup. The results show that there is not much impact in terms of
> > wakeup latency (considering the standard deviation). Based on the data
> > and my understanding, for this series,
> > 
> > Tested-by: Chen Yu <yu.c.chen@intel.com>
> 
> Good to know this and thanks a lot for the test!
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Chen, Yu C 1 month, 2 weeks ago
On 8/18/2025 10:50 AM, Aaron Lu wrote:
> On Sun, Aug 17, 2025 at 04:50:50PM +0800, Chen, Yu C wrote:
>> On 7/15/2025 3:16 PM, Aaron Lu wrote:
>>> From: Valentin Schneider <vschneid@redhat.com>
>>>
>>> In current throttle model, when a cfs_rq is throttled, its entity will
>>> be dequeued from cpu's rq, making tasks attached to it not able to run,
>>> thus achiveing the throttle target.
>>>
>>> This has a drawback though: assume a task is a reader of percpu_rwsem
>>> and is waiting. When it gets woken, it can not run till its task group's
>>> next period comes, which can be a relatively long time. Waiting writer
>>> will have to wait longer due to this and it also makes further reader
>>> build up and eventually trigger task hung.
>>>
>>> To improve this situation, change the throttle model to task based, i.e.
>>> when a cfs_rq is throttled, record its throttled status but do not remove
>>> it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
>>> they get picked, add a task work to them so that when they return
>>> to user, they can be dequeued there. In this way, tasks throttled will
>>> not hold any kernel resources. And on unthrottle, enqueue back those
>>> tasks so they can continue to run.
>>>
>>> Throttled cfs_rq's PELT clock is handled differently now: previously the
>>> cfs_rq's PELT clock is stopped once it entered throttled state but since
>>> now tasks(in kernel mode) can continue to run, change the behaviour to
>>> stop PELT clock only when the throttled cfs_rq has no tasks left.
>>>
>>> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
>>> Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
>>> Signed-off-by: Valentin Schneider <vschneid@redhat.com>
>>> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
>>> ---
>>
>> [snip]
>>
>>
>>> @@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
>>>    {
>>>    	struct sched_entity *se;
>>>    	struct cfs_rq *cfs_rq;
>>> +	struct task_struct *p;
>>> +	bool throttled;
>>>    again:
>>>    	cfs_rq = &rq->cfs;
>>>    	if (!cfs_rq->nr_queued)
>>>    		return NULL;
>>> +	throttled = false;
>>> +
>>>    	do {
>>>    		/* Might not have done put_prev_entity() */
>>>    		if (cfs_rq->curr && cfs_rq->curr->on_rq)
>>>    			update_curr(cfs_rq);
>>> -		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
>>> -			goto again;
>>> +		throttled |= check_cfs_rq_runtime(cfs_rq);
>>>    		se = pick_next_entity(rq, cfs_rq);
>>>    		if (!se)
>>> @@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
>>>    		cfs_rq = group_cfs_rq(se);
>>>    	} while (cfs_rq);
>>> -	return task_of(se);
>>> +	p = task_of(se);
>>> +	if (unlikely(throttled))
>>> +		task_throttle_setup_work(p);
>>> +	return p;
>>>    }
>>
>> Previously, I was wondering if the above change might impact
>> wakeup latency in some corner cases: If there are many tasks
>> enqueued on a throttled cfs_rq, the above pick-up mechanism
>> might return an invalid p repeatedly (where p is dequeued,
> 
> By invalid, do you mean task that is in a throttled hierarchy?
> 

Yes.

>> and a reschedule is triggered in throttle_cfs_rq_work() to
>> pick the next p; then the new p is found again on a throttled
>> cfs_rq). Before the above change, the entire cfs_rq's corresponding
>> sched_entity was dequeued in throttle_cfs_rq(): se = cfs_rq->tg->se(cpu)
>>
> 
> Yes this is true and it sounds inefficient, but these newly woken tasks
> may hold some kernel resources like a reader lock so we really want them
> to finish their kernel jobs and release that resource before being
> throttled or it can block/impact other tasks and even cause the whole
> system to hung.
> 

I see. Always dequeuing each task during its ret2user phase would be safer.

thanks,
Chenyu
>> So I did some tests for this scenario on a Xeon with 6 NUMA nodes and
>> 384 CPUs. I created 10 levels of cgroups and ran schbench on the leaf
>> cgroup. The results show that there is not much impact in terms of
>> wakeup latency (considering the standard deviation). Based on the data
>> and my understanding, for this series,
>>
>> Tested-by: Chen Yu <yu.c.chen@intel.com>
> 
> Good to know this and thanks a lot for the test!
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Valentin Schneider 1 month, 4 weeks ago
On 15/07/25 15:16, Aaron Lu wrote:
> From: Valentin Schneider <vschneid@redhat.com>
>
> In current throttle model, when a cfs_rq is throttled, its entity will
> be dequeued from cpu's rq, making tasks attached to it not able to run,
> thus achiveing the throttle target.
>
> This has a drawback though: assume a task is a reader of percpu_rwsem
> and is waiting. When it gets woken, it can not run till its task group's
> next period comes, which can be a relatively long time. Waiting writer
> will have to wait longer due to this and it also makes further reader
> build up and eventually trigger task hung.
>
> To improve this situation, change the throttle model to task based, i.e.
> when a cfs_rq is throttled, record its throttled status but do not remove
> it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
> they get picked, add a task work to them so that when they return
> to user, they can be dequeued there. In this way, tasks throttled will
> not hold any kernel resources. And on unthrottle, enqueue back those
> tasks so they can continue to run.
>

Moving the actual throttle work to pick time is clever. In my previous
versions I tried really hard to stay out of the enqueue/dequeue/pick paths,
but this makes the code a lot more palatable. I'd like to see how this
impacts performance though.

> Throttled cfs_rq's PELT clock is handled differently now: previously the
> cfs_rq's PELT clock is stopped once it entered throttled state but since
> now tasks(in kernel mode) can continue to run, change the behaviour to
> stop PELT clock only when the throttled cfs_rq has no tasks left.
>

I haven't spent much time looking at the PELT stuff yet; I'll do that next
week. Thanks for all the work you've been putting into this, and sorry it
took me this long to get a proper look at it!

> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
> Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>

> +static bool enqueue_throttled_task(struct task_struct *p)
> +{
> +	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
> +
> +	/*
> +	 * If the throttled task is enqueued to a throttled cfs_rq,
> +	 * take the fast path by directly put the task on target
> +	 * cfs_rq's limbo list, except when p is current because
> +	 * the following race can cause p's group_node left in rq's
> +	 * cfs_tasks list when it's throttled:
> +	 *
> +	 *        cpuX                       cpuY
> +	 *   taskA ret2user
> +	 *  throttle_cfs_rq_work()    sched_move_task(taskA)
> +	 *  task_rq_lock acquired
> +	 *  dequeue_task_fair(taskA)
> +	 *  task_rq_lock released
> +	 *                            task_rq_lock acquired
> +	 *                            task_current_donor(taskA) == true
> +	 *                            task_on_rq_queued(taskA) == true
> +	 *                            dequeue_task(taskA)
> +	 *                            put_prev_task(taskA)
> +	 *                            sched_change_group()
> +	 *                            enqueue_task(taskA) -> taskA's new cfs_rq
> +	 *                                                   is throttled, go
> +	 *                                                   fast path and skip
> +	 *                                                   actual enqueue
> +	 *                            set_next_task(taskA)
> +	 *                          __set_next_task_fair(taskA)
> +	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
> +	 *  schedule()
> +	 *
> +	 * And in the above race case, the task's current cfs_rq is in the same
> +	 * rq as its previous cfs_rq because sched_move_task() doesn't migrate
> +	 * task so we can use its current cfs_rq to derive rq and test if the
> +	 * task is current.
> +	 */

OK I think I got this; I slightly rephrased things to match similar
comments in the sched code:

--->8---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3a7c86c5b8a2b..8566ee0399527 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5827,37 +5827,38 @@ static bool enqueue_throttled_task(struct task_struct *p)
 	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
 
 	/*
-	 * If the throttled task is enqueued to a throttled cfs_rq,
-	 * take the fast path by directly put the task on target
-	 * cfs_rq's limbo list, except when p is current because
-	 * the following race can cause p's group_node left in rq's
-	 * cfs_tasks list when it's throttled:
+	 * If the throttled task @p is enqueued to a throttled cfs_rq,
+	 * take the fast path by directly putting the task on the
+	 * target cfs_rq's limbo list.
+	 *
+	 * Do not do that when @p is current because the following race can
+	 * cause @p's group_node to be incorrectly re-inserted in its rq's
+	 * cfs_tasks list, despite being throttled:
 	 *
 	 *        cpuX                       cpuY
-	 *   taskA ret2user
-	 *  throttle_cfs_rq_work()    sched_move_task(taskA)
-	 *  task_rq_lock acquired
-	 *  dequeue_task_fair(taskA)
-	 *  task_rq_lock released
-	 *                            task_rq_lock acquired
-	 *                            task_current_donor(taskA) == true
-	 *                            task_on_rq_queued(taskA) == true
-	 *                            dequeue_task(taskA)
-	 *                            put_prev_task(taskA)
-	 *                            sched_change_group()
-	 *                            enqueue_task(taskA) -> taskA's new cfs_rq
-	 *                                                   is throttled, go
-	 *                                                   fast path and skip
-	 *                                                   actual enqueue
-	 *                            set_next_task(taskA)
-	 *                          __set_next_task_fair(taskA)
-	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
+	 *  p ret2user
+	 *    throttle_cfs_rq_work()  sched_move_task(p)
+	 *    LOCK task_rq_lock
+	 *    dequeue_task_fair(p)
+	 *    UNLOCK task_rq_lock
+	 *                              LOCK task_rq_lock
+	 *                              task_current_donor(p) == true
+	 *                              task_on_rq_queued(p) == true
+	 *                              dequeue_task(p)
+	 *                              put_prev_task(p)
+	 *                              sched_change_group()
+	 *                              enqueue_task(p) -> p's new cfs_rq
+	 *                                                 is throttled, go
+	 *                                                 fast path and skip
+	 *                                                 actual enqueue
+	 *                              set_next_task(p)
+	 *                                list_move(&se->group_node, &rq->cfs_tasks); // bug
 	 *  schedule()
 	 *
-	 * And in the above race case, the task's current cfs_rq is in the same
-	 * rq as its previous cfs_rq because sched_move_task() doesn't migrate
-	 * task so we can use its current cfs_rq to derive rq and test if the
-	 * task is current.
+	 * In the above race case, @p's current cfs_rq is in the same
+	 * rq as its previous cfs_rq because sched_move_task() only moves a task
+	 * to a different group on the same rq, so we can use its current
+	 * cfs_rq to derive the rq and test if the task is current.
 	 */
 	if (throttled_hierarchy(cfs_rq) &&
 	    !task_current_donor(rq_of(cfs_rq), p)) {
---8<---

> +	if (throttled_hierarchy(cfs_rq) &&
> +	    !task_current_donor(rq_of(cfs_rq), p)) {
> +		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> +		return true;
> +	}
> +
> +	/* we can't take the fast path, do an actual enqueue*/
> +	p->throttled = false;

So we clear p->throttled but not p->throttle_node? Won't that cause issues
when @p's previous cfs_rq gets unthrottled?

> +	return false;
> +}
> +

> @@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>   */
>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  {
> +	if (unlikely(task_is_throttled(p))) {
> +		dequeue_throttled_task(p, flags);
> +		return true;
> +	}
> +

Handling a throttled task's move pattern at dequeue does simplify things,
however that's quite a hot path. I think this wants at the very least a

  if (cfs_bandwidth_used())

since that has a static key underneath. Some heavy EQ/DQ benchmark wouldn't
hurt, but we can probably find some people who really care about that to
run it for us ;)
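
i.e. something like this (untested sketch of the hunk quoted below):

	if (cfs_bandwidth_used() && unlikely(task_is_throttled(p))) {
		dequeue_throttled_task(p, flags);
		return true;
	}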

>  	if (!p->se.sched_delayed)
>  		util_est_dequeue(&rq->cfs, p);
>
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month, 4 weeks ago
On Fri, Aug 08, 2025 at 11:12:48AM +0200, Valentin Schneider wrote:
> On 15/07/25 15:16, Aaron Lu wrote:
> > From: Valentin Schneider <vschneid@redhat.com>
> >
> > In current throttle model, when a cfs_rq is throttled, its entity will
> > be dequeued from cpu's rq, making tasks attached to it not able to run,
> > thus achiveing the throttle target.
> >
> > This has a drawback though: assume a task is a reader of percpu_rwsem
> > and is waiting. When it gets woken, it can not run till its task group's
> > next period comes, which can be a relatively long time. Waiting writer
> > will have to wait longer due to this and it also makes further reader
> > build up and eventually trigger task hung.
> >
> > To improve this situation, change the throttle model to task based, i.e.
> > when a cfs_rq is throttled, record its throttled status but do not remove
> > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
> > they get picked, add a task work to them so that when they return
> > to user, they can be dequeued there. In this way, tasks throttled will
> > not hold any kernel resources. And on unthrottle, enqueue back those
> > tasks so they can continue to run.
> >
> 
> Moving the actual throttle work to pick time is clever. In my previous
> versions I tried really hard to stay out of the enqueue/dequeue/pick paths,
> but this makes the code a lot more palatable. I'd like to see how this
> impacts performance though.
> 

Let me run some scheduler benchmarks to see how it impacts performance.

I'm thinking of running something like hackbench on server platforms,
first with no quota set to see if performance changes, then also with
quota set to see how performance changes.
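
For the quota-set case, something along these lines (the cgroup path and
numbers are only an example):

	mkdir /sys/fs/cgroup/test
	# cpu.max is "$QUOTA $PERIOD" in us: 4 CPUs worth per 100ms period
	echo "400000 100000" > /sys/fs/cgroup/test/cpu.max
	echo $$ > /sys/fs/cgroup/test/cgroup.procs
	hackbench -g 16 -l 30000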

Does this sound good to you? Or do you have any specific benchmark and
test methodology in mind?

> > Throttled cfs_rq's PELT clock is handled differently now: previously the
> > cfs_rq's PELT clock is stopped once it entered throttled state but since
> > now tasks(in kernel mode) can continue to run, change the behaviour to
> > stop PELT clock only when the throttled cfs_rq has no tasks left.
> >
> 
> I haven't spent much time looking at the PELT stuff yet; I'll do that next
> week. Thanks for all the work you've been putting into this, and sorry it
> got me this long to get a proper look at it!
> 

Never mind and thanks for taking a look now :)

> > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> > Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
> > Signed-off-by: Valentin Schneider <vschneid@redhat.com>
> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> 
> > +static bool enqueue_throttled_task(struct task_struct *p)
> > +{
> > +	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
> > +
> > +	/*
> > +	 * If the throttled task is enqueued to a throttled cfs_rq,
> > +	 * take the fast path by directly put the task on target
> > +	 * cfs_rq's limbo list, except when p is current because
> > +	 * the following race can cause p's group_node left in rq's
> > +	 * cfs_tasks list when it's throttled:
> > +	 *
> > +	 *        cpuX                       cpuY
> > +	 *   taskA ret2user
> > +	 *  throttle_cfs_rq_work()    sched_move_task(taskA)
> > +	 *  task_rq_lock acquired
> > +	 *  dequeue_task_fair(taskA)
> > +	 *  task_rq_lock released
> > +	 *                            task_rq_lock acquired
> > +	 *                            task_current_donor(taskA) == true
> > +	 *                            task_on_rq_queued(taskA) == true
> > +	 *                            dequeue_task(taskA)
> > +	 *                            put_prev_task(taskA)
> > +	 *                            sched_change_group()
> > +	 *                            enqueue_task(taskA) -> taskA's new cfs_rq
> > +	 *                                                   is throttled, go
> > +	 *                                                   fast path and skip
> > +	 *                                                   actual enqueue
> > +	 *                            set_next_task(taskA)
> > +	 *                          __set_next_task_fair(taskA)
> > +	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
> > +	 *  schedule()
> > +	 *
> > +	 * And in the above race case, the task's current cfs_rq is in the same
> > +	 * rq as its previous cfs_rq because sched_move_task() doesn't migrate
> > +	 * task so we can use its current cfs_rq to derive rq and test if the
> > +	 * task is current.
> > +	 */
> 
> OK I think I got this; I slightly rephrased things to match similar
> comments in the sched code:
> 
> --->8---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3a7c86c5b8a2b..8566ee0399527 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5827,37 +5827,38 @@ static bool enqueue_throttled_task(struct task_struct *p)
>  	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
>  
>  	/*
> -	 * If the throttled task is enqueued to a throttled cfs_rq,
> -	 * take the fast path by directly put the task on target
> -	 * cfs_rq's limbo list, except when p is current because
> -	 * the following race can cause p's group_node left in rq's
> -	 * cfs_tasks list when it's throttled:
> +	 * If the throttled task @p is enqueued to a throttled cfs_rq,
> +	 * take the fast path by directly putting the task on the
> +	 * target cfs_rq's limbo list.
> +
> +	 * Do not do that when @p is current because the following race can
> +	 * cause @p's group_node to be incorectly re-insterted in its in rq's
> +	 * cfs_tasks list, despite being throttled:
>  	 *
>  	 *        cpuX                       cpuY
> -	 *   taskA ret2user
> -	 *  throttle_cfs_rq_work()    sched_move_task(taskA)
> -	 *  task_rq_lock acquired
> -	 *  dequeue_task_fair(taskA)
> -	 *  task_rq_lock released
> -	 *                            task_rq_lock acquired
> -	 *                            task_current_donor(taskA) == true
> -	 *                            task_on_rq_queued(taskA) == true
> -	 *                            dequeue_task(taskA)
> -	 *                            put_prev_task(taskA)
> -	 *                            sched_change_group()
> -	 *                            enqueue_task(taskA) -> taskA's new cfs_rq
> -	 *                                                   is throttled, go
> -	 *                                                   fast path and skip
> -	 *                                                   actual enqueue
> -	 *                            set_next_task(taskA)
> -	 *                          __set_next_task_fair(taskA)
> -	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
> +	 *  p ret2user
> +	 *    throttle_cfs_rq_work()  sched_move_task(p)
> +	 *    LOCK task_rq_lock
> +	 *    dequeue_task_fair(p)
> +	 *    UNLOCK task_rq_lock
> +	 *                              LOCK task_rq_lock
> +	 *                              task_current_donor(p) == true
> +	 *                              task_on_rq_queued(p) == true
> +	 *                              dequeue_task(p)
> +	 *                              put_prev_task(p)
> +	 *                              sched_change_group()
> +	 *                              enqueue_task(p) -> p's new cfs_rq
> +	 *                                                 is throttled, go
> +	 *                                                 fast path and skip
> +	 *                                                 actual enqueue
> +	 *                              set_next_task(p)
> +	 *                                list_move(&se->group_node, &rq->cfs_tasks); // bug
>  	 *  schedule()
>  	 *
> -	 * And in the above race case, the task's current cfs_rq is in the same
> -	 * rq as its previous cfs_rq because sched_move_task() doesn't migrate
> -	 * task so we can use its current cfs_rq to derive rq and test if the
> -	 * task is current.
> +	 * In the above race case, @p's current cfs_rq is in the same
> +	 * rq as its previous cfs_rq because sched_move_task() only moves a
> +	 * task to a different group on the same rq, so we can use its
> +	 * current cfs_rq to derive the rq and test if the task is current.
>  	 */
>  	if (throttled_hierarchy(cfs_rq) &&
>  	    !task_current_donor(rq_of(cfs_rq), p)) {
> ---8<---
>

Will follow your suggestion in next version.

> > +	if (throttled_hierarchy(cfs_rq) &&
> > +	    !task_current_donor(rq_of(cfs_rq), p)) {
> > +		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > +		return true;
> > +	}
> > +
> > +	/* we can't take the fast path, do an actual enqueue*/
> > +	p->throttled = false;
> 
> So we clear p->throttled but not p->throttle_node? Won't that cause issues
> when @p's previous cfs_rq gets unthrottled?
> 

p->throttle_node is already removed from its previous cfs_rq at dequeue
time, in dequeue_throttled_task().

This is done because at enqueue time we may not hold the previous rq's
lock, so we can't touch the previous cfs_rq's limbo list, e.g. when
dealing with affinity changes.
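
To illustrate, a rough sketch of the idea (not the exact hunk from this
series):

  /*
   * Sketch: an already throttled task is being dequeued, e.g. because
   * it is about to be moved to another cpu due to an affinity change.
   * Drop it from its current cfs_rq's limbo list while this rq's lock
   * is still held, so the later enqueue never needs the old rq's lock.
   */
  static void dequeue_throttled_task(struct task_struct *p, int flags)
  {
  	list_del_init(&p->throttle_node);
  }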

> > +	return false;
> > +}
> > +
> 
> > @@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> >   */
> >  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >  {
> > +	if (unlikely(task_is_throttled(p))) {
> > +		dequeue_throttled_task(p, flags);
> > +		return true;
> > +	}
> > +
> 
> Handling a throttled task's move pattern at dequeue does simplify things,
> however that's quite a hot path. I think this wants at the very least a
> 
>   if (cfs_bandwidth_used())
> 
> since that has a static key underneath. Some heavy EQ/DQ benchmark wouldn't
> hurt, but we can probably find some people who really care about that to
> run it for us ;)
> 

No problem.

I'm thinking maybe I can add this cfs_bandwidth_used() check inside
task_is_throttled()? That way, other callsites of task_is_throttled()
also benefit from the lower cost when cfs bandwidth is not enabled.
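
Roughly something like this (just a sketch; it assumes task_is_throttled()
simply checks the p->throttled flag added by this series):

  static inline bool task_is_throttled(struct task_struct *p)
  {
  	/* static key: effectively free when cfs bandwidth is unused */
  	if (!cfs_bandwidth_used())
  		return false;

  	return p->throttled;
  }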

> >  	if (!p->se.sched_delayed)
> >  		util_est_dequeue(&rq->cfs, p);
> >  
>
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Valentin Schneider 1 month, 4 weeks ago
On 08/08/25 18:13, Aaron Lu wrote:
> On Fri, Aug 08, 2025 at 11:12:48AM +0200, Valentin Schneider wrote:
>> On 15/07/25 15:16, Aaron Lu wrote:
>> > From: Valentin Schneider <vschneid@redhat.com>
>> >
>> > In current throttle model, when a cfs_rq is throttled, its entity will
>> > be dequeued from cpu's rq, making tasks attached to it not able to run,
>> > thus achiveing the throttle target.
>> >
>> > This has a drawback though: assume a task is a reader of percpu_rwsem
>> > and is waiting. When it gets woken, it can not run till its task group's
>> > next period comes, which can be a relatively long time. Waiting writer
>> > will have to wait longer due to this and it also makes further reader
>> > build up and eventually trigger task hung.
>> >
>> > To improve this situation, change the throttle model to task based, i.e.
>> > when a cfs_rq is throttled, record its throttled status but do not remove
>> > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
>> > they get picked, add a task work to them so that when they return
>> > to user, they can be dequeued there. In this way, tasks throttled will
>> > not hold any kernel resources. And on unthrottle, enqueue back those
>> > tasks so they can continue to run.
>> >
>> 
>> Moving the actual throttle work to pick time is clever. In my previous
>> versions I tried really hard to stay out of the enqueue/dequeue/pick paths,
>> but this makes the code a lot more palatable. I'd like to see how this
>> impacts performance though.
>> 
>
> Let me run some scheduler benchmark to see how it impacts performance.
>
> I'm thinking maybe running something like hackbench on server platforms,
> first with quota not set and see if performance changes; then also test
> with quota set and see how performance changes.
>
> Does this sound good to you? Or do you have any specific benchmark and
> test methodology in mind?
>

Yeah hackbench is pretty good for stressing the EQ/DQ paths.

>> > +	if (throttled_hierarchy(cfs_rq) &&
>> > +	    !task_current_donor(rq_of(cfs_rq), p)) {
>> > +		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
>> > +		return true;
>> > +	}
>> > +
>> > +	/* we can't take the fast path, do an actual enqueue*/
>> > +	p->throttled = false;
>> 
>> So we clear p->throttled but not p->throttle_node? Won't that cause issues
>> when @p's previous cfs_rq gets unthrottled?
>> 
>
> p->throttle_node is already removed from its previous cfs_rq at dequeue
> time in dequeue_throttled_task().
>
> This is done so because in enqueue time, we may not hold its previous
> rq's lock so can't touch its previous cfs_rq's limbo list, like when
> dealing with affinity changes.
>

Ah right, the DQ/EQ_throttled_task() functions are for when DQ/EQ is applied
to an already-throttled task, and they do the right thing.

Does this mean we want this as enqueue_throttled_task()'s prologue?

  /* @p should have gone through dequeue_throttled_task() first */
  WARN_ON_ONCE(!list_empty(&p->throttle_node));

>> > @@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>> >   */
>> >  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>> >  {
>> > +	if (unlikely(task_is_throttled(p))) {
>> > +		dequeue_throttled_task(p, flags);
>> > +		return true;
>> > +	}
>> > +
>> 
>> Handling a throttled task's move pattern at dequeue does simplify things,
>> however that's quite a hot path. I think this wants at the very least a
>> 
>>   if (cfs_bandwidth_used())
>> 
>> since that has a static key underneath. Some heavy EQ/DQ benchmark wouldn't
>> hurt, but we can probably find some people who really care about that to
>> run it for us ;)
>> 
>
> No problem.
>
> I'm thinking maybe I can add this cfs_bandwidth_used() in
> task_is_throttled()? So that other callsites of task_is_throttled() can
> also get the benefit of paying less cost when cfs bandwidth is not
> enabled.
>

Sounds good to me; just drop the unlikely and let the static key do its
thing :)
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month, 1 week ago
On Fri, Aug 08, 2025 at 01:45:11PM +0200, Valentin Schneider wrote:
> On 08/08/25 18:13, Aaron Lu wrote:
> > On Fri, Aug 08, 2025 at 11:12:48AM +0200, Valentin Schneider wrote:
... ...
> >> > +	if (throttled_hierarchy(cfs_rq) &&
> >> > +	    !task_current_donor(rq_of(cfs_rq), p)) {
> >> > +		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> >> > +		return true;
> >> > +	}
> >> > +
> >> > +	/* we can't take the fast path, do an actual enqueue*/
> >> > +	p->throttled = false;
> >> 
> >> So we clear p->throttled but not p->throttle_node? Won't that cause issues
> >> when @p's previous cfs_rq gets unthrottled?
> >> 
> >
> > p->throttle_node is already removed from its previous cfs_rq at dequeue
> > time in dequeue_throttled_task().
> >
> > This is done so because in enqueue time, we may not hold its previous
> > rq's lock so can't touch its previous cfs_rq's limbo list, like when
> > dealing with affinity changes.
> >
> 
> Ah right, the DQ/EQ_throttled_task() functions are when DQ/EQ is applied to an
> already-throttled task and it does the right thing.
> 
> Does this mean we want this as enqueue_throttled_task()'s prologue?
> 
>   /* @p should have gone through dequeue_throttled_task() first */
>   WARN_ON_ONCE(!list_empty(&p->throttle_node));
>

While adding this change to the new version, I remembered that
enqueue_throttled_task() also gets called for tasks that are about to be
unthrottled on the unthrottle path, i.e.

unthrottle_cfs_rq() -> tg_unthrottle_up() -> enqueue_task_fair()

because the task's throttled flag is not cleared yet (but throttle_node is
removed from the limbo list, so the above warning still works as expected).

I didn't clear p->throttled in tg_unthrottle_up() before calling
enqueue_task_fair() because enqueue_throttled_task() will take care of
that, but now that I look at it, I think it is better to clear p->throttled
before calling enqueue_task_fair(): this saves some cycles by skipping
enqueue_throttled_task() for these unthrottled tasks, and
enqueue_throttled_task() then only has to deal with migrated throttled
tasks. This feels cleaner and more efficient. I remember Prateek also
suggested this before, but I couldn't find his email now, so I'm not sure
if I remember correctly.

Anyway, just a note that I'm going to make the below change in the next
version; let me know if this doesn't look right, thanks.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 785a15caffbcc..df8dc389af8e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5904,6 +5904,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 	/* Re-enqueue the tasks that have been throttled at this level. */
 	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
 		list_del_init(&p->throttle_node);
+		p->throttled = false;
 		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
 	}
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month, 3 weeks ago
On Fri, Aug 08, 2025 at 01:45:11PM +0200, Valentin Schneider wrote:
> On 08/08/25 18:13, Aaron Lu wrote:
> > On Fri, Aug 08, 2025 at 11:12:48AM +0200, Valentin Schneider wrote:
> >> On 15/07/25 15:16, Aaron Lu wrote:
> >> > From: Valentin Schneider <vschneid@redhat.com>
> >> >
> >> > In current throttle model, when a cfs_rq is throttled, its entity will
> >> > be dequeued from cpu's rq, making tasks attached to it not able to run,
> >> > thus achiveing the throttle target.
> >> >
> >> > This has a drawback though: assume a task is a reader of percpu_rwsem
> >> > and is waiting. When it gets woken, it can not run till its task group's
> >> > next period comes, which can be a relatively long time. Waiting writer
> >> > will have to wait longer due to this and it also makes further reader
> >> > build up and eventually trigger task hung.
> >> >
> >> > To improve this situation, change the throttle model to task based, i.e.
> >> > when a cfs_rq is throttled, record its throttled status but do not remove
> >> > it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
> >> > they get picked, add a task work to them so that when they return
> >> > to user, they can be dequeued there. In this way, tasks throttled will
> >> > not hold any kernel resources. And on unthrottle, enqueue back those
> >> > tasks so they can continue to run.
> >> >
> >> 
> >> Moving the actual throttle work to pick time is clever. In my previous
> >> versions I tried really hard to stay out of the enqueue/dequeue/pick paths,
> >> but this makes the code a lot more palatable. I'd like to see how this
> >> impacts performance though.
> >> 
> >
> > Let me run some scheduler benchmark to see how it impacts performance.
> >
> > I'm thinking maybe running something like hackbench on server platforms,
> > first with quota not set and see if performance changes; then also test
> > with quota set and see how performance changes.
> >
> > Does this sound good to you? Or do you have any specific benchmark and
> > test methodology in mind?
> >
> 
> Yeah hackbench is pretty good for stressing the EQ/DQ paths.
> 

Tested hackbench/pipe and netperf/UDP_RR on Intel EMR (2 sockets/240
cpus) and AMD Genoa (2 sockets/384 cpus). The tl;dr is: there is no clear
performance change between base and this patchset (head). Detailed test
data below:
(turbo/boost disabled, cpuidle disabled, cpufreq set to performance)

hackbench/pipe/loops=150000
(seconds, smaller is better)

On Intel EMR:

nr_group          base             head          change
 1              3.62±2.99%      3.61±10.42%      +0.28%
 8              8.06±1.58%      7.88±5.82%       +2.23%
16             11.40±2.57%     11.25±3.72%       +1.32%

For nr_group=16 case, configure a cgroup and set quota to half cpu and
then let hackbench run in this cgroup:

                 base             head           change
quota=50%      18.35±2.40%     18.78±1.97%       -2.34%

On AMD Genoa:

nr_group          base             head          change
 1             17.05±1.92%     16.99±2.81%       +0.35%
 8             16.54±0.71%     16.73±1.18%       -1.15%
16             27.04±0.39%     26.72±2.37%       +1.18%

For nr_group=16 case, configure a cgroup and set quota to half cpu and
then let hackbench run in this cgroup:

                  base             head          change
quota=50%      43.79±1.10%     44.65±0.37%       -1.96%

Netperf/UDP_RR/testlen=30s
(throughput, higher is better)

25% means nr_clients set to 1/4 nr_cpu, 50% means nr_clients is 1/2
nr_cpu, etc.

On Intel EMR:

nr_clients     base                 head             change
  25%       83,567±0.06%         84,298±0.23%        +0.87%
  50%       61,336±1.49%         60,816±0.63%        -0.85%
  75%       40,592±0.97%         40,461±0.14%        -0.32%
 100%       31,277±2.11%         30,948±1.84%        -1.05%

For nr_clients=100% case, configure a cgroup and set quota to half cpu
and then let netperf run in this cgroup:

nr_clients     base                 head             change
 100%       25,532±0.56%         26,772±3.05%        +4.86%

On AMD Genoa:

nr_clients     base                 head             change
 25%        12,443±0.40%         12,525±0.06%        +0.66%
 50%        11,403±0.35%         11,472±0.50%        +0.61%
 75%        10,070±0.19%         10,071±0.95%         0.00%
100%         9,947±0.80%          9,881±0.58%        -0.66%

For nr_clients=100% case, configure a cgroup and set quota to half cpu
and then let netperf run in this cgroup:

nr_clients     base                 head             change
100%         4,954±0.24%          4,952±0.14%         0.00%

> >> > +	if (throttled_hierarchy(cfs_rq) &&
> >> > +	    !task_current_donor(rq_of(cfs_rq), p)) {
> >> > +		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> >> > +		return true;
> >> > +	}
> >> > +
> >> > +	/* we can't take the fast path, do an actual enqueue*/
> >> > +	p->throttled = false;
> >> 
> >> So we clear p->throttled but not p->throttle_node? Won't that cause issues
> >> when @p's previous cfs_rq gets unthrottled?
> >> 
> >
> > p->throttle_node is already removed from its previous cfs_rq at dequeue
> > time in dequeue_throttled_task().
> >
> > This is done so because in enqueue time, we may not hold its previous
> > rq's lock so can't touch its previous cfs_rq's limbo list, like when
> > dealing with affinity changes.
> >
> 
> Ah right, the DQ/EQ_throttled_task() functions are when DQ/EQ is applied to an
> already-throttled task and it does the right thing.
> 
> Does this mean we want this as enqueue_throttled_task()'s prologue?
> 
>   /* @p should have gone through dequeue_throttled_task() first */
>   WARN_ON_ONCE(!list_empty(&p->throttle_node));
>

Sure, will add it in next version.

> >> > @@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> >> >   */
> >> >  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >> >  {
> >> > +	if (unlikely(task_is_throttled(p))) {
> >> > +		dequeue_throttled_task(p, flags);
> >> > +		return true;
> >> > +	}
> >> > +
> >> 
> >> Handling a throttled task's move pattern at dequeue does simplify things,
> >> however that's quite a hot path. I think this wants at the very least a
> >> 
> >>   if (cfs_bandwidth_used())
> >> 
> >> since that has a static key underneath. Some heavy EQ/DQ benchmark wouldn't
> >> hurt, but we can probably find some people who really care about that to
> >> run it for us ;)
> >> 
> >
> > No problem.
> >
> > I'm thinking maybe I can add this cfs_bandwidth_used() in
> > task_is_throttled()? So that other callsites of task_is_throttled() can
> > also get the benefit of paying less cost when cfs bandwidth is not
> > enabled.
> >
> 
> Sounds good to me; just drop the unlikely and let the static key do its
> thing :)

Got it, thanks for the suggestion.
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Valentin Schneider 1 month, 3 weeks ago
On 12/08/25 16:48, Aaron Lu wrote:
> On Fri, Aug 08, 2025 at 01:45:11PM +0200, Valentin Schneider wrote:
>> On 08/08/25 18:13, Aaron Lu wrote:
>> > Let me run some scheduler benchmark to see how it impacts performance.
>> >
>> > I'm thinking maybe running something like hackbench on server platforms,
>> > first with quota not set and see if performance changes; then also test
>> > with quota set and see how performance changes.
>> >
>> > Does this sound good to you? Or do you have any specific benchmark and
>> > test methodology in mind?
>> >
>>
>> Yeah hackbench is pretty good for stressing the EQ/DQ paths.
>>
>
> Tested hackbench/pipe and netperf/UDP_RR on Intel EMR(2 sockets/240
> cpus) and AMD Genoa(2 sockets/384 cpus), the tldr is: there is no clear
> performance change between base and this patchset(head). Below is
> detailed test data:
> (turbo/boost disabled, cpuidle disabled, cpufreq set to performance)
>
> hackbench/pipe/loops=150000
> (seconds, smaller is better)
>
> On Intel EMR:
>
> nr_group          base             head          change
>  1              3.62±2.99%      3.61±10.42%      +0.28%
>  8              8.06±1.58%      7.88±5.82%       +2.23%
> 16             11.40±2.57%     11.25±3.72%       +1.32%
>
> For nr_group=16 case, configure a cgroup and set quota to half cpu and
> then let hackbench run in this cgroup:
>
>                  base             head           change
> quota=50%      18.35±2.40%     18.78±1.97%       -2.34%
>
> On AMD Genoa:
>
> nr_group          base             head          change
>  1             17.05±1.92%     16.99±2.81%       +0.35%
>  8             16.54±0.71%     16.73±1.18%       -1.15%
> 16             27.04±0.39%     26.72±2.37%       +1.18%
>
> For nr_group=16 case, configure a cgroup and set quota to half cpu and
> then let hackbench run in this cgroup:
>
>                   base             head          change
> quota=50%      43.79±1.10%     44.65±0.37%       -1.96%
>
> Netperf/UDP_RR/testlen=30s
> (throughput, higher is better)
>
> 25% means nr_clients set to 1/4 nr_cpu, 50% means nr_clients is 1/2
> nr_cpu, etc.
>
> On Intel EMR:
>
> nr_clients     base                 head             change
>   25%       83,567±0.06%         84,298±0.23%        +0.87%
>   50%       61,336±1.49%         60,816±0.63%        -0.85%
>   75%       40,592±0.97%         40,461±0.14%        -0.32%
>  100%       31,277±2.11%         30,948±1.84%        -1.05%
>
> For nr_clients=100% case, configure a cgroup and set quota to half cpu
> and then let netperf run in this cgroup:
>
> nr_clients     base                 head             change
>  100%       25,532±0.56%         26,772±3.05%        +4.86%
>
> On AMD Genoa:
>
> nr_clients     base                 head             change
>  25%        12,443±0.40%         12,525±0.06%        +0.66%
>  50%        11,403±0.35%         11,472±0.50%        +0.61%
>  75%        10,070±0.19%         10,071±0.95%         0.00%
> 100%         9,947±0.80%          9,881±0.58%        -0.66%
>
> For nr_clients=100% case, configure a cgroup and set quota to half cpu
> and then let netperf run in this cgroup:
>
> nr_clients     base                 head             change
> 100%         4,954±0.24%          4,952±0.14%         0.00%

Thank you for running these, looks like mostly slightly bigger variance on
a few of these but that's about it.

I would also suggest running similar benchmarks but with deeper
hierarchies, to get an idea of how much worse unthrottle_cfs_rq() can get
when tg_unthrottle_up() goes up a bigger tree.
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month, 3 weeks ago
On Thu, Aug 14, 2025 at 05:54:34PM +0200, Valentin Schneider wrote:
> On 12/08/25 16:48, Aaron Lu wrote:
> > On Fri, Aug 08, 2025 at 01:45:11PM +0200, Valentin Schneider wrote:
> >> On 08/08/25 18:13, Aaron Lu wrote:
> >> > Let me run some scheduler benchmark to see how it impacts performance.
> >> >
> >> > I'm thinking maybe running something like hackbench on server platforms,
> >> > first with quota not set and see if performance changes; then also test
> >> > with quota set and see how performance changes.
> >> >
> >> > Does this sound good to you? Or do you have any specific benchmark and
> >> > test methodology in mind?
> >> >
> >>
> >> Yeah hackbench is pretty good for stressing the EQ/DQ paths.
> >>
> >
> > Tested hackbench/pipe and netperf/UDP_RR on Intel EMR(2 sockets/240
> > cpus) and AMD Genoa(2 sockets/384 cpus), the tldr is: there is no clear
> > performance change between base and this patchset(head). Below is
> > detailed test data:
> > (turbo/boost disabled, cpuidle disabled, cpufreq set to performance)
> >
> > hackbench/pipe/loops=150000
> > (seconds, smaller is better)
> >
> > On Intel EMR:
> >
> > nr_group          base             head          change
> >  1              3.62±2.99%      3.61±10.42%      +0.28%
> >  8              8.06±1.58%      7.88±5.82%       +2.23%
> > 16             11.40±2.57%     11.25±3.72%       +1.32%
> >
> > For nr_group=16 case, configure a cgroup and set quota to half cpu and
> > then let hackbench run in this cgroup:
> >
> >                  base             head           change
> > quota=50%      18.35±2.40%     18.78±1.97%       -2.34%
> >
> > On AMD Genoa:
> >
> > nr_group          base             head          change
> >  1             17.05±1.92%     16.99±2.81%       +0.35%
> >  8             16.54±0.71%     16.73±1.18%       -1.15%
> > 16             27.04±0.39%     26.72±2.37%       +1.18%
> >
> > For nr_group=16 case, configure a cgroup and set quota to half cpu and
> > then let hackbench run in this cgroup:
> >
> >                   base             head          change
> > quota=50%      43.79±1.10%     44.65±0.37%       -1.96%
> >
> > Netperf/UDP_RR/testlen=30s
> > (throughput, higher is better)
> >
> > 25% means nr_clients set to 1/4 nr_cpu, 50% means nr_clients is 1/2
> > nr_cpu, etc.
> >
> > On Intel EMR:
> >
> > nr_clients     base                 head             change
> >   25%       83,567±0.06%         84,298±0.23%        +0.87%
> >   50%       61,336±1.49%         60,816±0.63%        -0.85%
> >   75%       40,592±0.97%         40,461±0.14%        -0.32%
> >  100%       31,277±2.11%         30,948±1.84%        -1.05%
> >
> > For nr_clients=100% case, configure a cgroup and set quota to half cpu
> > and then let netperf run in this cgroup:
> >
> > nr_clients     base                 head             change
> >  100%       25,532±0.56%         26,772±3.05%        +4.86%
> >
> > On AMD Genoa:
> >
> > nr_clients     base                 head             change
> >  25%        12,443±0.40%         12,525±0.06%        +0.66%
> >  50%        11,403±0.35%         11,472±0.50%        +0.61%
> >  75%        10,070±0.19%         10,071±0.95%         0.00%
> > 100%         9,947±0.80%          9,881±0.58%        -0.66%
> >
> > For nr_clients=100% case, configure a cgroup and set quota to half cpu
> > and then let netperf run in this cgroup:
> >
> > nr_clients     base                 head             change
> > 100%         4,954±0.24%          4,952±0.14%         0.00%
> 
> Thank you for running these, looks like mostly slightly bigger variance on
> a few of these but that's about it.
> 
> I would also suggest running similar benchmarks but with deeper
> hierarchies, to get an idea of how much worse unthrottle_cfs_rq() can get
> when tg_unthrottle_up() goes up a bigger tree.

No problem.

I suppose I can reuse the previous shared test script:
https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/

There I used:
nr_level1=2
nr_level2=100
nr_level3=10

But I can tweak these numbers for this performance evaluation. I can make
the hierarchy 5 levels deep, place tasks in the leaf level cgroups and
configure quota on the 1st level cgroups.

I'll get back to you once I've finished collecting data; feel free to let
me know if you have other ideas for testing this :)
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month, 2 weeks ago
On Fri, Aug 15, 2025 at 05:30:08PM +0800, Aaron Lu wrote:
> On Thu, Aug 14, 2025 at 05:54:34PM +0200, Valentin Schneider wrote:
... ...
> > I would also suggest running similar benchmarks but with deeper
> > hierarchies, to get an idea of how much worse unthrottle_cfs_rq() can get
> > when tg_unthrottle_up() goes up a bigger tree.
> 
> No problem.
> 
> I suppose I can reuse the previous shared test script:
> https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/
> 
> There I used:
> nr_level1=2
> nr_level2=100
> nr_level3=10
> 
> But I can tweak these numbers for this performance evaluation. I can make
> the leaf level to be 5 level deep and place tasks in leaf level cgroups
> and configure quota on 1st level cgroups.

Tested on Intel EMR (2 sockets, 120 cores, 240 cpus) and AMD Genoa (2
sockets, 192 cores, 384 cpus), with turbo/boost disabled, cpufreq set to
performance and cpuidle states all disabled.

cgroup hierarchy:
nr_level1=2
nr_level2=2
nr_level3=2
nr_level4=5
nr_level5=5
i.e. two cgroups in the root level, with each level1 cgroup having 2
child cgroups, each level2 cgroup having 2 child cgroups, etc. This
creates a 5 level deep hierarchy with 200 leaf cgroups. Tasks are placed
in the leaf cgroups. Quota is set on the two level1 cgroups.

The tl;dr is: with a very large number of tasks (like 8000 tasks), task
based throttle saw a 10-20% performance drop on AMD Genoa; otherwise, no
obvious performance change was observed. Detailed test results below.

Netperf: measured in throughput, more is better
- quota set to 50 cpu for each level1 cgroup;
- each leaf cgroup runs a pair of netserver and netperf with the
  following cmdlines:
  netserver -p $port_for_this_cgroup
  netperf -p $port_for_this_cgroup -H 127.0.0.1 -t UDP_RR -c -C -l 30
  i.e. each cgroup has 2 tasks, total task number is 2 * 200 = 400
  tasks.

On Intel EMR:
              base            head         diff
throughput    33305±8.40%     33995±7.84%  noise

On AMD Genoa:
              base            head         diff
throughput    5013±1.16%      4967±1.82%   noise


Hackbench, measured in seconds, less is better:
- quota set to 50cpu for each level1 cgroup;
- each cgroup runs with the following cmdline:
  hackbench -p -g 1 -l $see_below
i.e. each leaf cgroup has 20 sender tasks and 20 receiver tasks, total
task number is 40 * 200 = 8000 tasks.

On Intel EMR(loops set to 100000):

         base             head              diff
Time     85.45±3.98%      86.41±3.98%       noise

On AMD Genoa(loops set to 20000):

         base             head              diff
Time     104±4.33%        116±7.71%        -11.54%

So for this test case, task based throttle suffered a ~10% performance
drop. I also tested on another AMD Genoa (same cpu spec) to make sure
it's not a machine problem, and performance dropped there too:

On 2nd AMD Genoa(loops set to 50000)

        base             head               diff
Time    81±3.13%         101±7.05%         -24.69%

According to perf, __schedule() in head takes 7.29% of cycles while in
base it takes 4.61%. I suppose with task based throttle, __schedule() is
more frequent since tasks in a throttled cfs_rq have to be dequeued one by
one, while with the current behaviour the cfs_rq can be dequeued off the
rq in one go. This is most visible when there are multiple tasks in a
single cfs_rq; if there is only 1 task per cfs_rq, things should be
roughly the same for the two throttling models.

With this said, I reduced the task number and retested on this 2nd AMD
Genoa:
- quota set to 50 cpu for each level1 cgroup;
- using only 1 fd pair, i.e. 2 tasks for each cgroup:
  hackbench -p -g 1 -f 1 -l 50000000
  i.e. each leaf cgroup has 1 sender task and 1 receiver task, total
  task number is 2 * 200 = 400 tasks.

        base             head               diff
Time    127.77±2.60%     127.49±2.63%       noise

In this setup, performance is about the same.

Now I'm wondering why on Intel EMR, running that extreme setup (8000
tasks), task based throttle didn't see a noticeable performance drop...
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month ago
On Fri, Aug 22, 2025 at 07:07:01PM +0800, Aaron Lu wrote:
> On Fri, Aug 15, 2025 at 05:30:08PM +0800, Aaron Lu wrote:
> > On Thu, Aug 14, 2025 at 05:54:34PM +0200, Valentin Schneider wrote:
> ... ...
> > > I would also suggest running similar benchmarks but with deeper
> > > hierarchies, to get an idea of how much worse unthrottle_cfs_rq() can get
> > > when tg_unthrottle_up() goes up a bigger tree.
> > 
> > No problem.
> > 
> > I suppose I can reuse the previous shared test script:
> > https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/
> > 
> > There I used:
> > nr_level1=2
> > nr_level2=100
> > nr_level3=10
> > 
> > But I can tweak these numbers for this performance evaluation. I can make
> > the leaf level to be 5 level deep and place tasks in leaf level cgroups
> > and configure quota on 1st level cgroups.
> 
> Tested on Intel EMR(2 sockets, 120cores, 240cpus) and AMD Genoa(2
> sockets, 192cores, 384cpus), with turbo/boost disabled, cpufreq set to
> performance and cpuidle states all disabled.
> 
> cgroup hierarchy:
> nr_level1=2
> nr_level2=2
> nr_level3=2
> nr_level4=5
> nr_level5=5
> i.e. two cgroups in the root level, with each level1 cgroup having 2
> child cgroups, and each level2 cgroup having 2 child cgroups, etc. This
> creates a 5 level deep, 200 leaf cgroups setup. Tasks are placed in leaf
> cgroups. Quota are set on the two level1 cgroups.
> 
> The TLDR is, when there is a very large number of tasks(like 8000 tasks),
> task based throttle saw 10-20% performance drop on AMD Genoa; otherwise,
> no obvious performance change is observed. Detailed test results below.
> 
> Netperf: measured in throughput, more is better
> - quota set to 50 cpu for each level1 cgroup;
> - each leaf cgroup run a pair of netserver and netperf with following
>   cmdline:
>   netserver -p $port_for_this_cgroup
>   netperf -p $port_for_this_cgroup -H 127.0.0.1 -t UDP_RR -c -C -l 30
>   i.e. each cgroup has 2 tasks, total task number is 2 * 200 = 400
>   tasks.
> 
> On Intel EMR:
>               base            head         diff
> throughput    33305±8.40%     33995±7.84%  noise
> 
> On AMD Genoa:
>               base            head         diff
> throughput    5013±1.16%      4967±1.82    noise
> 
> 
> Hackbench, measured in seconds, less is better:
> - quota set to 50cpu for each level1 cgroup;
> - each cgroup runs with the following cmdline:
>   hackbench -p -g 1 -l $see_below
> i.e. each leaf cgroup has 20 sender tasks and 20 receiver tasks, total
> task number is 40 * 200 = 8000 tasks.
> 
> On Intel EMR(loops set to 100000):
> 
>          base             head              diff
> Time     85.45±3.98%      86.41±3.98%       noise
> 
> On AMD Genoa(loops set to 20000):
> 
>          base             head              diff
> Time     104±4.33%        116±7.71%        -11.54%
> 
> So for this test case, task based throttle suffered ~10% performance
> drop. I also tested on another AMD Genoa(same cpu spec) to make sure
> it's not a machine problem and performance dropped there too:
> 
> On 2nd AMD Genoa(loops set to 50000)
> 
>         base             head               diff
> Time    81±3.13%         101±7.05%         -24.69%
> 
> According to perf, __schedule() in head takes 7.29% cycles while in base
> it takes 4.61% cycles. I suppose with task based throttle, __schedule()
> is more frequent since tasks in a throttled cfs_rq have to be dequeued
> one by one while in current behaviour, the cfs_rq can be dequeued off rq
> in one go. This is most obvious when there are multiple tasks in a single
> cfs_rq; if there is only 1 task per cfs_rq, things should be roughly the
> same for the two throttling model.
> 
> With this said, I reduced the task number and retested on this 2nd AMD
> Genoa:
> - quota set to 50 cpu for each level1 cgroup;
> - using only 1 fd pair, i.e. 2 task for each cgroup:
>   hackbench -p -g 1 -f 1 -l 50000000
>   i.e. each leaf cgroup has 1 sender task and 1 receiver task, total
>   task number is 2 * 200 = 400 tasks.
> 
>         base             head               diff
> Time    127.77±2.60%     127.49±2.63%       noise
> 
> In this setup, performance is about the same.
> 
> Now I'm wondering why on Intel EMR, running that extreme setup(8000
> tasks), performance of task based throttle didn't see noticeable drop...

Looks like hackbench doesn't like task migration on this AMD system
(domain0 SMT; domain1 MC; domain2 PKG; domain3 NUMA).

If I revert patch5 and run this 40 * 200 = 8000 task hackbench workload
again, performance is roughly the same now (head~1 is slightly worse, but
given the 4+% stddev in base, it can be considered within the noise range):

         base              head~1(patch1-4)    diff     head(patch1-5)
Time     82.55±4.82%       84.45±2.70%         -2.3%    99.69±6.71%

According to /proc/schedstat, the lb_gained for domain2 is:

         NOT_IDLE IDLE  NEWLY_IDLE
base        0     8052    81791    
head~1      0     7197   175096
head        1    14818   793065

Other domains have similar numbers: base has the smallest migration count,
head has the most, and head~1 reduces the number a lot. I suppose this is
expected, because we removed the throttled_lb_pair() restriction in patch5,
which can cause runnable tasks in a throttled hierarchy to be balanced to
other cpus, while in base this cannot happen.

I think patch5 still makes sense and is correct, it's just that this
specific workload doesn't like task migrations. Intel EMR doesn't suffer
from this; I suppose that's because EMR has a much larger LLC while AMD
Genoa has relatively small LLCs, and task migrations across the LLC
boundary hurt hackbench's performance.

I also tried applying the below hack to prove this "task migration across
the LLC boundary hurts hackbench" theory on both base and head:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b173a059315c2..34c5f6b75e53d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9297,6 +9297,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
 		return 0;
 
+	if (!(env->sd->flags & SD_SHARE_LLC))
+		return 0;
+
 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 		return 0;
 
With this diff applied, the result is:


         base'              head'              diff
Time     74.78±8.2%         78.87±15.4%        -5.47%

base': base + above diff
head': head + above diff

So both perform better now, but with much larger variance; I guess that's
because there is no load balancing on domain2 and above now. head' is
still worse than base, but not as much as before.

To conclude: hackbench doesn't like task migration, especially when a
task is migrated across the LLC boundary. patch5 removed the restriction
on balancing throttled tasks, which caused more balancing to happen, and
hackbench doesn't like this. But balancing has its own merits and could
still benefit other workloads, so I think patch5 should stay, especially
considering that when throttled tasks are eventually dequeued, they will
not stay on the rq's cfs_tasks list, so no special care is needed for them
when doing load balance.

On a side note: should we increase the cost of balancing tasks across the
LLC boundary? I tried enlarging sysctl_sched_migration_cost 100 times for
domains without SD_SHARE_LLC in task_hot(), but that didn't help.
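
For reference, the kind of tweak I tried looks roughly like this (sketch
only, not the exact diff that was tested):

  /* tail of task_hot(), with the experimental tweak marked */
  	delta = rq_clock_task(env->src_rq) - p->se.exec_start;

  	/* added: treat cross-LLC migrations as 100x more costly */
  	if (!(env->sd->flags & SD_SHARE_LLC))
  		return delta < (s64)sysctl_sched_migration_cost * 100;

  	return delta < (s64)sysctl_sched_migration_cost;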
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by K Prateek Nayak 1 month ago
Hello Aaron,

On 9/3/2025 12:44 PM, Aaron Lu wrote:
> On Fri, Aug 22, 2025 at 07:07:01PM +0800, Aaron Lu wrote:
>> With this said, I reduced the task number and retested on this 2nd AMD
>> Genoa:
>> - quota set to 50 cpu for each level1 cgroup;

What exactly is the quota and period when you say 50cpu?

>> - using only 1 fd pair, i.e. 2 task for each cgroup:
>>   hackbench -p -g 1 -f 1 -l 50000000
>>   i.e. each leaf cgroup has 1 sender task and 1 receiver task, total
>>   task number is 2 * 200 = 400 tasks.
>>
>>         base             head               diff
>> Time    127.77±2.60%     127.49±2.63%       noise
>>
>> In this setup, performance is about the same.
>>
>> Now I'm wondering why on Intel EMR, running that extreme setup(8000
>> tasks), performance of task based throttle didn't see noticeable drop...
> 
> Looks like hackbench doesn't like task migration on this AMD system
> (domain0 SMT; domain1 MC; domain2 PKG; domain3 NUMA).
> 
> If I revert patch5, running this 40 * 200 = 8000 hackbench workload
> again, performance is roughly the same now(head~1 is slightly worse but
> given the 4+% stddev in base, it can be considered in noise range):
> 
>          base              head~1(patch1-4)    diff     head(patch1-5)
> Time     82.55±4.82%       84.45±2.70%         -2.3%    99.69±6.71%
> 
> According to /proc/schedstat, the lb_gained for domain2 is:
> 
>          NOT_IDLE IDLE  NEWLY_IDLE
> base        0     8052    81791    
> head~1      0     7197   175096
> head        1    14818   793065

Since these are mostly idle and newidle balance, I wonder if we can run
into a scenario where,

1. All the tasks are throttled.
2. CPU turning idle does a newidle balance.
3. CPU pulls a tasks from throttled hierarchy and selects it.
4. The task exits to user space and is dequeued.
5. Goto 1.

and when the CPU is unthrottled, it has a large number of tasks on it
that'll again require load balancing to even things out.

> 
> Other domains have similar number: base has smallest migration number
> while head has the most and head~1 reduce the number a lot. I suppose
> this is expected, because we removed the throttled_lb_pair() restriction
> in patch5 and that can cause runnable tasks in throttled hierarchy to be
> balanced to other cpus while in base, this can not happen.
> 
> I think patch5 still makes sense and is correct, it's just this specific
> workload doesn't like task migrations. Intel EMR doesn't suffer from
> this, I suppose that's because EMR has a much larger LLC while AMD Genoa
> has a relatively small LLC and task migrations across LLC boundary hurts
> hackbench's performance.

I think we can leave the throttled_lb_pair() condition as is and revisit
it later if this is visible in real world workloads. I cannot think of
any easy way to avoid the potential pileup case without accounting for
the throttled tasks in limbo, except for something like the below at
head~1:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bdc9bfa0b9ef..3dc807af21ba 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9385,7 +9385,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) delayed dequeued unless we migrate load, or
-	 * 2) throttled_lb_pair, or
+	 * 2) throttled_lb_pair unless we migrate load, or
 	 * 3) cannot be migrated to this CPU due to cpus_ptr, or
 	 * 4) running (obviously), or
 	 * 5) are cache-hot on their current CPU, or
@@ -9394,7 +9394,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
 		return 0;
 
-	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu) &&
+	    env->migration_type != migrate_load)
 		return 0;
 
 	/*
---

Since load_avg moves slowly, it might be enough to avoid a pileup of
tasks. This is similar to the condition for migrating delayed tasks
above, but unlike the hierarchies of delayed tasks, the weight of a
throttled hierarchy does change when throttled tasks are transitioned to
limbo, so this needs some more staring at.

> 
> I also tried to apply below hack to prove this "task migration across
> LLC boundary hurts hackbench" theory on both base and head:
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b173a059315c2..34c5f6b75e53d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9297,6 +9297,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
>  	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
>  		return 0;
>  
> +	if (!(env->sd->flags & SD_SHARE_LLC))
> +		return 0;
> +
>  	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
>  		return 0;
>  
> With this diff applied, the result is:
> 
> 
>          base'              head'              diff
> Time     74.78±8.2%         78.87±15.4%        -5.47%
> 
> base': base + above diff
> head': head + above diff
> 
> So both performs better now, but with much larger variance, I guess
> that's because no load balance on domain2 and above now. head' is still
> worse than base, but not as much as before.
> 
> To conclude this: hackbench doesn't like task migration, especially when
> task is migrated across LLC boundary. patch5 removed the restriction of
> no balancing throttled tasks, this caused more balance to happen and
> hackbench doesn't like this. But balancing has its own merit and could
> still benefit other workloads so I think patch5 should stay, especially
> considering that when throttled tasks are eventually dequeued, they will
> not stay on rq's cfs_tasks list so no need to take special care for them
> when doing load balance.

Mathieu ran some experiments a couple of years ago where he too
discovered that reducing the number of migrations helps hackbench, but it
wasn't clear if these strategies would benefit real-world workloads:

https://lore.kernel.org/lkml/20230905171105.1005672-1-mathieu.desnoyers@efficios.com/ 
https://lore.kernel.org/lkml/20231018204511.1563390-1-mathieu.desnoyers@efficios.com/

> 
> On a side note: should we increase the cost of balancing tasks out of LLC
> boundary? I tried to enlarge sysctl_sched_migration_cost 100 times for
> domains without SD_SHARE_LLC in task_hot() but that didn't help.

I'll take a look at sd->imbalance_pct and see if there is any room for
improvements there. Thank you again for the detailed analysis.

-- 
Thanks and Regards,
Prateek

Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month ago
Hi Prateek,

On Wed, Sep 03, 2025 at 02:41:55PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
> 
> On 9/3/2025 12:44 PM, Aaron Lu wrote:
> > On Fri, Aug 22, 2025 at 07:07:01PM +0800, Aaron Lu wrote:
> >> With this said, I reduced the task number and retested on this 2nd AMD
> >> Genoa:
> >> - quota set to 50 cpu for each level1 cgroup;
> 
> What exactly is the quota and period when you say 50cpu?

period is the default 100000us and quota is set to 5000000us (i.e. 50 * 100000us).

> 
> >> - using only 1 fd pair, i.e. 2 task for each cgroup:
> >>   hackbench -p -g 1 -f 1 -l 50000000
> >>   i.e. each leaf cgroup has 1 sender task and 1 receiver task, total
> >>   task number is 2 * 200 = 400 tasks.
> >>
> >>         base             head               diff
> >> Time    127.77±2.60%     127.49±2.63%       noise
> >>
> >> In this setup, performance is about the same.
> >>
> >> Now I'm wondering why on Intel EMR, running that extreme setup(8000
> >> tasks), performance of task based throttle didn't see noticeable drop...
> > 
> > Looks like hackbench doesn't like task migration on this AMD system
> > (domain0 SMT; domain1 MC; domain2 PKG; domain3 NUMA).
> > 
> > If I revert patch5, running this 40 * 200 = 8000 hackbench workload
> > again, performance is roughly the same now(head~1 is slightly worse but
> > given the 4+% stddev in base, it can be considered in noise range):
> > 
> >          base              head~1(patch1-4)    diff     head(patch1-5)
> > Time     82.55±4.82%       84.45±2.70%         -2.3%    99.69±6.71%
> > 
> > According to /proc/schedstat, the lb_gained for domain2 is:
> > 
> >          NOT_IDLE IDLE  NEWLY_IDLE
> > base        0     8052    81791    
> > head~1      0     7197   175096
> > head        1    14818   793065
> 
> Since these are mostly idle and newidle balance, I wonder if we can run
> into a scenario where,
> 
> 1. All the tasks are throttled.
> 2. CPU turning idle does a newidle balance.
> 3. CPU pulls a tasks from throttled hierarchy and selects it.
> 4. The task exits to user space and is dequeued.
> 5. Goto 1.
> 
> and when the CPU is unthrottled, it has a large number of tasks on it
> that'll again require a load balance to even stuff out.
> 

I think it is because we allow balancing tasks under a throttled
hierarchy that the balance numbers became much larger.

> > 
> > Other domains have similar number: base has smallest migration number
> > while head has the most and head~1 reduce the number a lot. I suppose
> > this is expected, because we removed the throttled_lb_pair() restriction
> > in patch5 and that can cause runnable tasks in throttled hierarchy to be
> > balanced to other cpus while in base, this can not happen.
> > 
> > I think patch5 still makes sense and is correct, it's just this specific
> > workload doesn't like task migrations. Intel EMR doesn't suffer from
> > this, I suppose that's because EMR has a much larger LLC while AMD Genoa
> > has a relatively small LLC and task migrations across LLC boundary hurts
> > hackbench's performance.
> 
> I think we can leave the throttled_lb_pair() condition as is and revisit
> it later if this is visible in real world workloads. I cannot think of
> any easy way to avoid the case for potential pileup without accounting
> for the throttled tasks in limbo except for something like below at
> head~1:
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bdc9bfa0b9ef..3dc807af21ba 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9385,7 +9385,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
>  	/*
>  	 * We do not migrate tasks that are:
>  	 * 1) delayed dequeued unless we migrate load, or
> -	 * 2) throttled_lb_pair, or
> +	 * 2) throttled_lb_pair unless we migrate load, or
>  	 * 3) cannot be migrated to this CPU due to cpus_ptr, or
>  	 * 4) running (obviously), or
>  	 * 5) are cache-hot on their current CPU, or
> @@ -9394,7 +9394,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
>  	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
>  		return 0;
>  
> -	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
> +	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu) &&
> +	    env->migration_type != migrate_load)
>  		return 0;
>  
>  	/*
> ---
> 
> Since load_avg moves slowly, it might be enough to avoid pileup of
> tasks. This is similar to the condition for migrating delayed tasks
> above but unlike the hierarchies of delayed tasks, the weight of
> throttled hierarchy does change when throttled tasks are transitioned to
> limbo so this needs some more staring at.
>

I was thinking: should we disallow balancing a task to a throttled target
cfs_rq? With the task based throttle model, if a task is on the rq's
cfs_tasks list, it is allowed to run, so we should not check the src
cfs_rq's throttle status; but we should check whether the target cfs_rq is
throttled, and if it is, it's probably not very useful to do the balance.
I tried the below diff and the performance is restored:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df8dc389af8e1..3e927b9b7eeb6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9370,6 +9370,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
 		return 0;
 
+	if (throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu]))
+		return 0;
+
 	/*
 	 * We want to prioritize the migration of eligible tasks.
 	 * For ineligible tasks we soft-limit them and only allow

         base              head'               diff     head(patch1-5)
Time     82.55±4.82%       83.81±2.89%         -1.5%    99.69±6.71%

head': head + above diff

I also tested netperf on this AMD system as well as hackbench and
netperf on Intel EMR, no obvious performance difference observed
after applying the above diff, i.e. base and head' performance is
roughly the same.

Does the above diff make sense? One thing I'm slightly concerned about:
there may be one case where balancing a task to a throttled target cfs_rq
makes sense: if the task holds some kernel resource and is running inside
the kernel, then even if its target cfs_rq is throttled, we still want
this task to go there and finish its job in kernel mode sooner, which
could help other resource waiters. But this may not be a big deal, and
most of the time balancing a task to a throttled cfs_rq doesn't look like
a meaningful thing to do.

Best regards,
Aaron
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by K Prateek Nayak 1 month ago
Hello Aaron,

On 9/3/2025 3:41 PM, Aaron Lu wrote:
> Hi Prateek,
> 
> On Wed, Sep 03, 2025 at 02:41:55PM +0530, K Prateek Nayak wrote:
>> Hello Aaron,
>>
>> On 9/3/2025 12:44 PM, Aaron Lu wrote:
>>> On Fri, Aug 22, 2025 at 07:07:01PM +0800, Aaron Lu wrote:
>>>> With this said, I reduced the task number and retested on this 2nd AMD
>>>> Genoa:
>>>> - quota set to 50 cpu for each level1 cgroup;
>>
>> What exactly is the quota and period when you say 50cpu?
> 
> period is the default 100000 and quota is set to 5000000.

Thank you! I'll do some tests on my end as well.

[..snip..]
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index df8dc389af8e1..3e927b9b7eeb6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9370,6 +9370,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
>  	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
>  		return 0;
>  
> +	if (throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu]))
> +		return 0;
> +

This makes sense instead of the full throttled_lb_pair(). You'll still
need to put it behind CONFIG_CGROUP_SCHED (or better yet
CONFIG_CFS_BANDWIDTH) since task_group() can return NULL if GROUP_SCHED
is not enabled.

>  	/*
>  	 * We want to prioritize the migration of eligible tasks.
>  	 * For ineligible tasks we soft-limit them and only allow
> 
>          base              head'               diff     head(patch1-5)
> Time     82.55±4.82%       83.81±2.89%         -1.5%    99.69±6.71%
> 
> head': head + above diff
> 
> I also tested netperf on this AMD system as well as hackbench and
> netperf on Intel EMR, no obvious performance difference observed
> after applying the above diff, i.e. base and head' performance is
> roughly the same.
> 
> Does the above diff make sense? One thing I'm slightly concerned is,
> there may be one case when balancing a task to a throttled target
> cfs_rq makes sense: if the task holds some kernel resource and is
> running inside kernel, even its target cfs_rq is throttled, we still
> want this task to go there and finish its job in kernel mode sooner,
> this could help other resource waiters. But, this may not be a big deal

I think it is still an improvement over per-cfs_rq throttling from a
tail latency perspective.

> and in most of the time, balancing a task to a throttled cfs_rq doesn't
> look like a meaningful thing to do.

Ack.

-- 
Thanks and Regards,
Prateek

Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month ago
On Wed, Sep 03, 2025 at 04:01:03PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
> 
> On 9/3/2025 3:41 PM, Aaron Lu wrote:
> > Hi Prateek,
> > 
> > On Wed, Sep 03, 2025 at 02:41:55PM +0530, K Prateek Nayak wrote:
> >> Hello Aaron,
> >>
> >> On 9/3/2025 12:44 PM, Aaron Lu wrote:
> >>> On Fri, Aug 22, 2025 at 07:07:01PM +0800, Aaron Lu wrote:
> >>>> With this said, I reduced the task number and retested on this 2nd AMD
> >>>> Genoa:
> >>>> - quota set to 50 cpu for each level1 cgroup;
> >>
> >> What exactly is the quota and period when you say 50cpu?
> > 
> > period is the default 100000 and quota is set to 5000000.
> 
> Thank you! I'll do some tests on my end as well.
>

I've attached test scripts I've used for your reference.

> [..snip..]
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index df8dc389af8e1..3e927b9b7eeb6 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -9370,6 +9370,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
> >  	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
> >  		return 0;
> >  
> > +	if (throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu]))
> > +		return 0;
> > +
> 
> This makes sense instead of the full throttled_lb_pair(). You'll still
> need to put it behind CONFIG_CGROUP_SCHED (or better yet
> CONFIG_CFS_BANDWIDTH) since task_group() can return NULL if GROUP_SCHED
> is not enabled.
> 

Got it, thanks for the reminder. Maybe I can avoid adding new wrappers
and just check task_group() first, something like this:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df8dc389af8e1..d9abde5e631b8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9362,6 +9362,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) delayed dequeued unless we migrate load, or
+	 * 2) target cfs_rq is in throttled hierarchy, or
 	 * 2) cannot be migrated to this CPU due to cpus_ptr, or
 	 * 3) running (obviously), or
 	 * 4) are cache-hot on their current CPU, or
@@ -9370,6 +9371,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
 		return 0;
 
+	if (task_group(p) &&
+	    throttled_hierarchy(task_group(p)->cfs_rq[env->dst_cpu]))
+		return 0;
+
 	/*
 	 * We want to prioritize the migration of eligible tasks.
 	 * For ineligible tasks we soft-limit them and only allow


> >  	/*
> >  	 * We want to prioritize the migration of eligible tasks.
> >  	 * For ineligible tasks we soft-limit them and only allow
> > 
> >          base              head'               diff     head(patch1-5)
> > Time     82.55±4.82%       83.81±2.89%         -1.5%    99.69±6.71%
> > 
> > head': head + above diff
> > 
> > I also tested netperf on this AMD system as well as hackbench and
> > netperf on Intel EMR, no obvious performance difference observed
> > after applying the above diff, i.e. base and head' performance is
> > roughly the same.
> > 
> > Does the above diff make sense? One thing I'm slightly concerned is,
> > there may be one case when balancing a task to a throttled target
> > cfs_rq makes sense: if the task holds some kernel resource and is
> > running inside kernel, even its target cfs_rq is throttled, we still
> > want this task to go there and finish its job in kernel mode sooner,
> > this could help other resource waiters. But, this may not be a big deal
> 
> I think it is still an improvement over per-cfs_rq throttling from a
> tail latency perspective.
> 
> > and in most of the time, balancing a task to a throttled cfs_rq doesn't
> > look like a meaningful thing to do.
> 
> Ack.

Just want to add that with the above diff applied, I also tested
songtang's stress test[0] and Jan's rt deadlock reproducer[1] and didn't
see any problem.

[0]: https://lore.kernel.org/lkml/20250715072218.GA304@bytedance/
[1]: https://lore.kernel.org/all/7483d3ae-5846-4067-b9f7-390a614ba408@siemens.com/
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Bezdeka, Florian 1 month ago
Hi Aaron,

On Wed, 2025-09-03 at 19:35 +0800, Aaron Lu wrote:
> 
> > [..snip..]
> > > 
> 
> Just want to add that with the above diff applied, I also tested
> songtang's stress test[0] and Jan's rt deadlock reproducer[1] and didn't
> see any problem.

Thanks a lot for taking the reproducer into account. But: to trigger
the issue, PREEMPT_RT needs to be enabled, otherwise the rwlock in the
epoll infrastructure (which we stress heavily) won't sleep. Just to be
sure: was PREEMPT_RT enabled on your end?

> 
> [0]: https://lore.kernel.org/lkml/20250715072218.GA304@bytedance/
> [1]: https://lore.kernel.org/all/7483d3ae-5846-4067-b9f7-390a614ba408@siemens.com/
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 1 month ago
Hi Florian,

On Thu, Sep 04, 2025 at 07:33:03AM +0000, Bezdeka, Florian wrote:
> Hi Aaron,
> 
> On Wed, 2025-09-03 at 19:35 +0800, Aaron Lu wrote:
> > 
> > > [..snip..]
> > > > 
> > 
> > Just want to add that with the above diff applied, I also tested
> > songtang's stress test[0] and Jan's rt deadlock reproducer[1] and didn't
> > see any problem.
> 
> Thanks a lot for taking the reproducer into account. But: To trigger
> PREEMPT_RT needs to be enabled, otherwise the rwlock in the epoll
> infrastructure (that we highly stress) won't sleep. Just to be sure:
> PREEMPT_RT was enabled on your end?
>

Yes, understood.

I had PREEMPT_RT enabled when running Jan's rt deadlock reproducer. I
also ran the base kernel first and made sure the hang happened there
before testing this series. Normally, base would hang within 2 minutes;
when testing this series, I gave it 5 minutes to see if anything
abnormal happened.

> > 
> > [0]: https://lore.kernel.org/lkml/20250715072218.GA304@bytedance/
> > [1]: https://lore.kernel.org/all/7483d3ae-5846-4067-b9f7-390a614ba408@siemens.com/
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by K Prateek Nayak 1 month ago
Hello Florian,

On 9/4/2025 1:03 PM, Bezdeka, Florian wrote:
> Hi Aaron,
> 
> On Wed, 2025-09-03 at 19:35 +0800, Aaron Lu wrote:
>>
>>> [..snip..]
>>>>
>>
>> Just want to add that with the above diff applied, I also tested
>> songtang's stress test[0] and Jan's rt deadlock reproducer[1] and didn't
>> see any problem.
> 
> Thanks a lot for taking the reproducer into account. But: To trigger
> PREEMPT_RT needs to be enabled, otherwise the rwlock in the epoll
> infrastructure (that we highly stress) won't sleep. Just to be sure:
> PREEMPT_RT was enabled on your end?

I have also tested this series with Jan's reproducer on PREEMPT_RT and
can confirm no hang was triggered over a full weekend run lasting more
than 72 hours, with Aaron's changes applied on top of a v6.16-rc6 based
kernel.

> 
>>
>> [0]: https://lore.kernel.org/lkml/20250715072218.GA304@bytedance/
>> [1]: https://lore.kernel.org/all/7483d3ae-5846-4067-b9f7-390a614ba408@siemens.com/

I used the same reproducer from [1] with a small modification to
run.sh to periodically move epoll-stall and epoll-stall-writer to one
particular CPU, changing this CPU every minute.
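
For illustration, the run.sh tweak is roughly equivalent to the below
standalone program (the actual modification was to the reproducer's
shell script; the pids of epoll-stall and epoll-stall-writer are
assumed to be passed on the command line):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Pin @pid to @cpu. */
static void pin_to_cpu(pid_t pid, int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(pid, sizeof(set), &set))
		perror("sched_setaffinity");
}

int main(int argc, char **argv)
{
	long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	int cpu = 0;

	for (;;) {
		/* keep all given tasks on the same CPU for one minute */
		for (int i = 1; i < argc; i++)
			pin_to_cpu(atoi(argv[i]), cpu);
		sleep(60);
		/* then move them to the next CPU */
		cpu = (cpu + 1) % nr_cpus;
	}
	return 0;
}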

-- 
Thanks and Regards,
Prateek
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by kernel test robot 2 months, 3 weeks ago
Hi Aaron,

kernel test robot noticed the following build warnings:

[auto build test WARNING on tip/sched/core]
[also build test WARNING on next-20250716]
[cannot apply to linus/master v6.16-rc6]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307
base:   tip/sched/core
patch link:    https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com
patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
config: xtensa-randconfig-r121-20250716 (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/config)
compiler: xtensa-linux-gcc (GCC) 8.5.0
reproduce: (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507162238.qiw7kyu0-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
   kernel/sched/core.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/thread_info.h, ...):
   arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
   arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
   kernel/sched/core.c: note: in included file:
>> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/sched.h:742:55: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2429:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct *
   kernel/sched/core.c:2131:38: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/core.c:2131:38: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/core.c:2131:38: sparse:    struct task_struct const *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2452:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2452:9: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2452:9: sparse:    struct task_struct *
   kernel/sched/sched.h:2452:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2452:9: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2452:9: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2429:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct *
--
   kernel/sched/fair.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/kernel.h, ...):
   arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
   arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
   kernel/sched/fair.c: note: in included file:
>> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/sched.h:742:55: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/fair.c:6073:22: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/fair.c:6073:22: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/fair.c:6073:22: sparse:    struct task_struct *
   kernel/sched/fair.c:10625:22: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/fair.c:10625:22: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/fair.c:10625:22: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2452:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2452:9: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2452:9: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
--
   kernel/sched/build_policy.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/kernel.h, ...):
   arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
   arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
   kernel/sched/build_policy.c: note: in included file:
>> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/sched.h:742:55: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/build_policy.c: note: in included file:
   kernel/sched/rt.c:2289:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/rt.c:2289:25: sparse:    struct task_struct *
   kernel/sched/rt.c:2289:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/rt.c:1994:13: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/rt.c:1994:13: sparse:    struct task_struct *
   kernel/sched/rt.c:1994:13: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/build_policy.c: note: in included file:
   kernel/sched/deadline.c:2675:13: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/deadline.c:2675:13: sparse:    struct task_struct *
   kernel/sched/deadline.c:2675:13: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/deadline.c:2781:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/deadline.c:2781:25: sparse:    struct task_struct *
   kernel/sched/deadline.c:2781:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/deadline.c:3024:23: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/deadline.c:3024:23: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/deadline.c:3024:23: sparse:    struct task_struct *
   kernel/sched/build_policy.c: note: in included file:
   kernel/sched/syscalls.c:206:22: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/syscalls.c:206:22: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/syscalls.c:206:22: sparse:    struct task_struct *
   kernel/sched/build_policy.c: note: in included file:
   kernel/sched/sched.h:2241:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct *
   kernel/sched/sched.h:2241:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2241:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2241:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct *
   kernel/sched/sched.h:2241:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2429:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct *
   kernel/sched/sched.h:2252:26: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2252:26: sparse:    struct task_struct *
   kernel/sched/sched.h:2429:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2429:9: sparse:    struct task_struct *
--
   kernel/sched/build_utility.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/kernel.h, ...):
   arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
   arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
   kernel/sched/build_utility.c: note: in included file:
>> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/sched.h:742:55: sparse: sparse: dubious one-bit signed bitfield
   kernel/sched/sched.h:2241:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct [noderef] __rcu *
   kernel/sched/sched.h:2241:25: sparse:    struct task_struct *

vim +741 kernel/sched/sched.h

   709	
   710	#ifdef CONFIG_FAIR_GROUP_SCHED
   711		struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */
   712	
   713		/*
   714		 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
   715		 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
   716		 * (like users, containers etc.)
   717		 *
   718		 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
   719		 * This list is used during load balance.
   720		 */
   721		int			on_list;
   722		struct list_head	leaf_cfs_rq_list;
   723		struct task_group	*tg;	/* group that "owns" this runqueue */
   724	
   725		/* Locally cached copy of our task_group's idle value */
   726		int			idle;
   727	
   728	#ifdef CONFIG_CFS_BANDWIDTH
   729		int			runtime_enabled;
   730		s64			runtime_remaining;
   731	
   732		u64			throttled_pelt_idle;
   733	#ifndef CONFIG_64BIT
   734		u64                     throttled_pelt_idle_copy;
   735	#endif
   736		u64			throttled_clock;
   737		u64			throttled_clock_pelt;
   738		u64			throttled_clock_pelt_time;
   739		u64			throttled_clock_self;
   740		u64			throttled_clock_self_time;
 > 741		int			throttled:1;
   742		int			pelt_clock_throttled:1;
   743		int			throttle_count;
   744		struct list_head	throttled_list;
   745		struct list_head	throttled_csd_list;
   746		struct list_head        throttled_limbo_list;
   747	#endif /* CONFIG_CFS_BANDWIDTH */
   748	#endif /* CONFIG_FAIR_GROUP_SCHED */
   749	};
   750	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 2 months, 3 weeks ago
On Wed, Jul 16, 2025 at 11:20:55PM +0800, kernel test robot wrote:
> Hi Aaron,
> 
> kernel test robot noticed the following build warnings:
> 
> [auto build test WARNING on tip/sched/core]
> [also build test WARNING on next-20250716]
> [cannot apply to linus/master v6.16-rc6]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307
> base:   tip/sched/core
> patch link:    https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com
> patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
> config: xtensa-randconfig-r121-20250716 (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/config)
> compiler: xtensa-linux-gcc (GCC) 8.5.0
> reproduce: (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202507162238.qiw7kyu0-lkp@intel.com/
> 
> sparse warnings: (new ones prefixed by >>)
>    kernel/sched/core.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/thread_info.h, ...):
>    arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
>    arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
>    kernel/sched/core.c: note: in included file:
> >> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield

Same problem as last report.

I've downloaded this compiler from kernel.org and confirmed there are
no such warnings after using bool.

Thanks,
Aaron
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Oliver Sang 2 months, 2 weeks ago
hi, Aaron,

On Thu, Jul 17, 2025 at 11:52:43AM +0800, Aaron Lu wrote:
> On Wed, Jul 16, 2025 at 11:20:55PM +0800, kernel test robot wrote:
> > Hi Aaron,
> > 
> > kernel test robot noticed the following build warnings:
> > 
> > [auto build test WARNING on tip/sched/core]
> > [also build test WARNING on next-20250716]
> > [cannot apply to linus/master v6.16-rc6]
> > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > And when submitting patch, we suggest to use '--base' as documented in
> > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> > 
> > url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307
> > base:   tip/sched/core
> > patch link:    https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com
> > patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
> > config: xtensa-randconfig-r121-20250716 (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/config)
> > compiler: xtensa-linux-gcc (GCC) 8.5.0
> > reproduce: (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/reproduce)
> > 
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202507162238.qiw7kyu0-lkp@intel.com/
> > 
> > sparse warnings: (new ones prefixed by >>)
> >    kernel/sched/core.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/thread_info.h, ...):
> >    arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
> >    arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
> >    kernel/sched/core.c: note: in included file:
> > >> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
> 
> Same problem as last report.
> 
> I've downloaded this compiler from kernel.org and confirmed there is no
> such warnings after using bool.


I want to confirm: do you mean you can reproduce the sparse build warning
> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield


then, after doing the below change:

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3c3ea0089b0b5..6eb15b00edccd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -738,7 +738,7 @@ struct cfs_rq {
        u64                     throttled_clock_pelt_time;
        u64                     throttled_clock_self;
        u64                     throttled_clock_self_time;
-       int                     throttled:1;
+       bool                    throttled:1;
        int                     pelt_clock_throttled:1;
        int                     throttle_count;
        struct list_head        throttled_list;


will the issue disappear?


> 
> Thanks,
> Aaron
>
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 2 months, 2 weeks ago
Hi Oliver,

On Wed, Jul 23, 2025 at 04:21:59PM +0800, Oliver Sang wrote:
> hi, Aaron,
> 
> On Thu, Jul 17, 2025 at 11:52:43AM +0800, Aaron Lu wrote:
> > On Wed, Jul 16, 2025 at 11:20:55PM +0800, kernel test robot wrote:
> > > Hi Aaron,
> > > 
> > > kernel test robot noticed the following build warnings:
> > > 
> > > [auto build test WARNING on tip/sched/core]
> > > [also build test WARNING on next-20250716]
> > > [cannot apply to linus/master v6.16-rc6]
> > > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > > And when submitting patch, we suggest to use '--base' as documented in
> > > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> > > 
> > > url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307
> > > base:   tip/sched/core
> > > patch link:    https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com
> > > patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
> > > config: xtensa-randconfig-r121-20250716 (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/config)
> > > compiler: xtensa-linux-gcc (GCC) 8.5.0
> > > reproduce: (https://download.01.org/0day-ci/archive/20250716/202507162238.qiw7kyu0-lkp@intel.com/reproduce)
> > > 
> > > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > > the same patch/commit), kindly add following tags
> > > | Reported-by: kernel test robot <lkp@intel.com>
> > > | Closes: https://lore.kernel.org/oe-kbuild-all/202507162238.qiw7kyu0-lkp@intel.com/
> > > 
> > > sparse warnings: (new ones prefixed by >>)
> > >    kernel/sched/core.c: note: in included file (through arch/xtensa/include/asm/bitops.h, include/linux/bitops.h, include/linux/thread_info.h, ...):
> > >    arch/xtensa/include/asm/processor.h:105:2: sparse: sparse: Unsupported xtensa ABI
> > >    arch/xtensa/include/asm/processor.h:135:2: sparse: sparse: Unsupported Xtensa ABI
> > >    kernel/sched/core.c: note: in included file:
> > > >> kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
> > 
> > Same problem as last report.
> > 
> > I've downloaded this compiler from kernel.org and confirmed there is no
> > such warnings after using bool.
> 
> 
> want to confirm, do you mean you can reproduce the build sparse error
> > kernel/sched/sched.h:741:44: sparse: sparse: dubious one-bit signed bitfield
>

Right.

> 
> then after doing below change:
> 
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 3c3ea0089b0b5..6eb15b00edccd 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -738,7 +738,7 @@ struct cfs_rq {
>         u64                     throttled_clock_pelt_time;
>         u64                     throttled_clock_self;
>         u64                     throttled_clock_self_time;
> -       int                     throttled:1;
> +       bool                    throttled:1;
>         int                     pelt_clock_throttled:1;
>         int                     throttle_count;
>         struct list_head        throttled_list;
> 
> 
> the issue will disappear?
> 

Yes, that's correct.
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by kernel test robot 2 months, 3 weeks ago
Hi Aaron,

kernel test robot noticed the following build warnings:

[auto build test WARNING on tip/sched/core]
[also build test WARNING on next-20250715]
[cannot apply to linus/master v6.16-rc6]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307
base:   tip/sched/core
patch link:    https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com
patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
config: i386-buildonly-randconfig-006-20250716 (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507160730.0cXkgs0S-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> kernel/sched/fair.c:5456:33: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
    5456 |                         cfs_rq->pelt_clock_throttled = 1;
         |                                                      ^ ~
   kernel/sched/fair.c:5971:32: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
    5971 |                 cfs_rq->pelt_clock_throttled = 1;
         |                                              ^ ~
   kernel/sched/fair.c:6014:20: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
    6014 |         cfs_rq->throttled = 1;
         |                           ^ ~
   3 warnings generated.


vim +/int +5456 kernel/sched/fair.c

  5372	
  5373	static bool
  5374	dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  5375	{
  5376		bool sleep = flags & DEQUEUE_SLEEP;
  5377		int action = UPDATE_TG;
  5378	
  5379		update_curr(cfs_rq);
  5380		clear_buddies(cfs_rq, se);
  5381	
  5382		if (flags & DEQUEUE_DELAYED) {
  5383			WARN_ON_ONCE(!se->sched_delayed);
  5384		} else {
  5385			bool delay = sleep;
  5386			/*
  5387			 * DELAY_DEQUEUE relies on spurious wakeups, special task
  5388			 * states must not suffer spurious wakeups, excempt them.
  5389			 */
  5390			if (flags & DEQUEUE_SPECIAL)
  5391				delay = false;
  5392	
  5393			WARN_ON_ONCE(delay && se->sched_delayed);
  5394	
  5395			if (sched_feat(DELAY_DEQUEUE) && delay &&
  5396			    !entity_eligible(cfs_rq, se)) {
  5397				update_load_avg(cfs_rq, se, 0);
  5398				set_delayed(se);
  5399				return false;
  5400			}
  5401		}
  5402	
  5403		if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
  5404			action |= DO_DETACH;
  5405	
  5406		/*
  5407		 * When dequeuing a sched_entity, we must:
  5408		 *   - Update loads to have both entity and cfs_rq synced with now.
  5409		 *   - For group_entity, update its runnable_weight to reflect the new
  5410		 *     h_nr_runnable of its group cfs_rq.
  5411		 *   - Subtract its previous weight from cfs_rq->load.weight.
  5412		 *   - For group entity, update its weight to reflect the new share
  5413		 *     of its group cfs_rq.
  5414		 */
  5415		update_load_avg(cfs_rq, se, action);
  5416		se_update_runnable(se);
  5417	
  5418		update_stats_dequeue_fair(cfs_rq, se, flags);
  5419	
  5420		update_entity_lag(cfs_rq, se);
  5421		if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
  5422			se->deadline -= se->vruntime;
  5423			se->rel_deadline = 1;
  5424		}
  5425	
  5426		if (se != cfs_rq->curr)
  5427			__dequeue_entity(cfs_rq, se);
  5428		se->on_rq = 0;
  5429		account_entity_dequeue(cfs_rq, se);
  5430	
  5431		/* return excess runtime on last dequeue */
  5432		return_cfs_rq_runtime(cfs_rq);
  5433	
  5434		update_cfs_group(se);
  5435	
  5436		/*
  5437		 * Now advance min_vruntime if @se was the entity holding it back,
  5438		 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
  5439		 * put back on, and if we advance min_vruntime, we'll be placed back
  5440		 * further than we started -- i.e. we'll be penalized.
  5441		 */
  5442		if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
  5443			update_min_vruntime(cfs_rq);
  5444	
  5445		if (flags & DEQUEUE_DELAYED)
  5446			finish_delayed_dequeue_entity(se);
  5447	
  5448		if (cfs_rq->nr_queued == 0) {
  5449			update_idle_cfs_rq_clock_pelt(cfs_rq);
  5450	#ifdef CONFIG_CFS_BANDWIDTH
  5451			if (throttled_hierarchy(cfs_rq)) {
  5452				struct rq *rq = rq_of(cfs_rq);
  5453	
  5454				list_del_leaf_cfs_rq(cfs_rq);
  5455				cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> 5456				cfs_rq->pelt_clock_throttled = 1;
  5457			}
  5458	#endif
  5459		}
  5460	
  5461		return true;
  5462	}
  5463	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 2 months, 3 weeks ago
On Wed, Jul 16, 2025 at 07:29:37AM +0800, kernel test robot wrote:
> Hi Aaron,
> 
> kernel test robot noticed the following build warnings:
> 
> [auto build test WARNING on tip/sched/core]
> [also build test WARNING on next-20250715]
> [cannot apply to linus/master v6.16-rc6]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307
> base:   tip/sched/core
> patch link:    https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com
> patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
> config: i386-buildonly-randconfig-006-20250716 (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/config)
> compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202507160730.0cXkgs0S-lkp@intel.com/
> 
> All warnings (new ones prefixed by >>):
> 
> >> kernel/sched/fair.c:5456:33: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
>     5456 |                         cfs_rq->pelt_clock_throttled = 1;
>          |                                                      ^ ~

Thanks for the report.

I don't think this will affect correctness since both cfs_rq's throttled
and pelt_clock_throttled fields are only ever tested as true (non-zero)
or false (0). I used bitfields for them to save some space.

Changing their types to either unsigned int or bool should cure this
warning; I suppose bool looks clearer?

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dbe52e18b93a0..434f816a56701 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -737,8 +737,8 @@ struct cfs_rq {
 	u64			throttled_clock_pelt_time;
 	u64			throttled_clock_self;
 	u64			throttled_clock_self_time;
-	int			throttled:1;
-	int			pelt_clock_throttled:1;
+	bool			throttled:1;
+	bool			pelt_clock_throttled:1;
 	int			throttle_count;
 	struct list_head	throttled_list;
 	struct list_head	throttled_csd_list;
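
For reference, here is a minimal standalone example of what the warning
is about (with GCC and Clang a plain int bitfield is signed, so a
one-bit field can only hold 0 or -1; assigning 1 stores -1, but truth
tests still behave the same, which is why correctness is unaffected):

#include <stdbool.h>
#include <stdio.h>

struct flags {
	int	int_bit:1;	/* signed with GCC/Clang: holds 0 or -1 */
	bool	bool_bit:1;	/* holds 0 or 1 */
};

int main(void)
{
	struct flags f = { 0, 0 };

	f.int_bit  = 1;		/* truncated: stored as -1 */
	f.bool_bit = 1;

	/* prints "-1 1 truthy" */
	printf("%d %d %s\n", f.int_bit, f.bool_bit,
	       f.int_bit ? "truthy" : "falsy");
	return 0;
}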

Hi LKP,

I tried using clang-19 but couldn't reproduce this warning and I don't
have clang-20 at hand. Can you please apply the above hunk on top of
this series and see if that warning is gone? Thanks.

Best regards,
Aaron

>    kernel/sched/fair.c:5971:32: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
>     5971 |                 cfs_rq->pelt_clock_throttled = 1;
>          |                                              ^ ~
>    kernel/sched/fair.c:6014:20: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
>     6014 |         cfs_rq->throttled = 1;
>          |                           ^ ~
>    3 warnings generated.
> 
> 
> vim +/int +5456 kernel/sched/fair.c
> 
>   5372	
>   5373	static bool
>   5374	dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>   5375	{
>   5376		bool sleep = flags & DEQUEUE_SLEEP;
>   5377		int action = UPDATE_TG;
>   5378	
>   5379		update_curr(cfs_rq);
>   5380		clear_buddies(cfs_rq, se);
>   5381	
>   5382		if (flags & DEQUEUE_DELAYED) {
>   5383			WARN_ON_ONCE(!se->sched_delayed);
>   5384		} else {
>   5385			bool delay = sleep;
>   5386			/*
>   5387			 * DELAY_DEQUEUE relies on spurious wakeups, special task
>   5388			 * states must not suffer spurious wakeups, excempt them.
>   5389			 */
>   5390			if (flags & DEQUEUE_SPECIAL)
>   5391				delay = false;
>   5392	
>   5393			WARN_ON_ONCE(delay && se->sched_delayed);
>   5394	
>   5395			if (sched_feat(DELAY_DEQUEUE) && delay &&
>   5396			    !entity_eligible(cfs_rq, se)) {
>   5397				update_load_avg(cfs_rq, se, 0);
>   5398				set_delayed(se);
>   5399				return false;
>   5400			}
>   5401		}
>   5402	
>   5403		if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
>   5404			action |= DO_DETACH;
>   5405	
>   5406		/*
>   5407		 * When dequeuing a sched_entity, we must:
>   5408		 *   - Update loads to have both entity and cfs_rq synced with now.
>   5409		 *   - For group_entity, update its runnable_weight to reflect the new
>   5410		 *     h_nr_runnable of its group cfs_rq.
>   5411		 *   - Subtract its previous weight from cfs_rq->load.weight.
>   5412		 *   - For group entity, update its weight to reflect the new share
>   5413		 *     of its group cfs_rq.
>   5414		 */
>   5415		update_load_avg(cfs_rq, se, action);
>   5416		se_update_runnable(se);
>   5417	
>   5418		update_stats_dequeue_fair(cfs_rq, se, flags);
>   5419	
>   5420		update_entity_lag(cfs_rq, se);
>   5421		if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
>   5422			se->deadline -= se->vruntime;
>   5423			se->rel_deadline = 1;
>   5424		}
>   5425	
>   5426		if (se != cfs_rq->curr)
>   5427			__dequeue_entity(cfs_rq, se);
>   5428		se->on_rq = 0;
>   5429		account_entity_dequeue(cfs_rq, se);
>   5430	
>   5431		/* return excess runtime on last dequeue */
>   5432		return_cfs_rq_runtime(cfs_rq);
>   5433	
>   5434		update_cfs_group(se);
>   5435	
>   5436		/*
>   5437		 * Now advance min_vruntime if @se was the entity holding it back,
>   5438		 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
>   5439		 * put back on, and if we advance min_vruntime, we'll be placed back
>   5440		 * further than we started -- i.e. we'll be penalized.
>   5441		 */
>   5442		if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
>   5443			update_min_vruntime(cfs_rq);
>   5444	
>   5445		if (flags & DEQUEUE_DELAYED)
>   5446			finish_delayed_dequeue_entity(se);
>   5447	
>   5448		if (cfs_rq->nr_queued == 0) {
>   5449			update_idle_cfs_rq_clock_pelt(cfs_rq);
>   5450	#ifdef CONFIG_CFS_BANDWIDTH
>   5451			if (throttled_hierarchy(cfs_rq)) {
>   5452				struct rq *rq = rq_of(cfs_rq);
>   5453	
>   5454				list_del_leaf_cfs_rq(cfs_rq);
>   5455				cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > 5456				cfs_rq->pelt_clock_throttled = 1;
>   5457			}
>   5458	#endif
>   5459		}
>   5460	
>   5461		return true;
>   5462	}
>   5463	
> 
> -- 
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Peter Zijlstra 2 months, 3 weeks ago
On Wed, Jul 16, 2025 at 02:57:07PM +0800, Aaron Lu wrote:
> On Wed, Jul 16, 2025 at 07:29:37AM +0800, kernel test robot wrote:
> > Hi Aaron,
> > 
> > kernel test robot noticed the following build warnings:
> > 
> > [auto build test WARNING on tip/sched/core]
> > [also build test WARNING on next-20250715]
> > [cannot apply to linus/master v6.16-rc6]
> > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > And when submitting patch, we suggest to use '--base' as documented in
> > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> > 
> > url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307
> > base:   tip/sched/core
> > patch link:    https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com
> > patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
> > config: i386-buildonly-randconfig-006-20250716 (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/config)
> > compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
> > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/reproduce)
> > 
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202507160730.0cXkgs0S-lkp@intel.com/
> > 
> > All warnings (new ones prefixed by >>):
> > 
> > >> kernel/sched/fair.c:5456:33: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
> >     5456 |                         cfs_rq->pelt_clock_throttled = 1;
> >          |                                                      ^ ~

Nice warning from clang.

> 
> Thanks for the report.
> 
> I don't think this will affect correctness since both cfs_rq's throttled
> and pelt_clock_throttled fields are used as true(not 0) or false(0). I
> used bitfield for them to save some space.
> 
> Change their types to either unsigned int or bool should cure this
> warning, I suppose bool looks more clear?
> 
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index dbe52e18b93a0..434f816a56701 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -737,8 +737,8 @@ struct cfs_rq {
>  	u64			throttled_clock_pelt_time;
>  	u64			throttled_clock_self;
>  	u64			throttled_clock_self_time;
> -	int			throttled:1;
> -	int			pelt_clock_throttled:1;
> +	bool			throttled:1;
> +	bool			pelt_clock_throttled:1;
>  	int			throttle_count;
>  	struct list_head	throttled_list;
>  	struct list_head	throttled_csd_list;

Yeah, either this or any unsigned type will do.
Re: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
Posted by Philip Li 2 months, 3 weeks ago
On Wed, Jul 16, 2025 at 02:57:07PM +0800, Aaron Lu wrote:
> On Wed, Jul 16, 2025 at 07:29:37AM +0800, kernel test robot wrote:
> > Hi Aaron,
> > 
> > kernel test robot noticed the following build warnings:
> > 
> > [auto build test WARNING on tip/sched/core]
> > [also build test WARNING on next-20250715]
> > [cannot apply to linus/master v6.16-rc6]
> > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > And when submitting patch, we suggest to use '--base' as documented in
> > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> > 
> > url:    https://github.com/intel-lab-lkp/linux/commits/Aaron-Lu/sched-fair-Add-related-data-structure-for-task-based-throttle/20250715-152307
> > base:   tip/sched/core
> > patch link:    https://lore.kernel.org/r/20250715071658.267-4-ziqianlu%40bytedance.com
> > patch subject: [PATCH v3 3/5] sched/fair: Switch to task based throttle model
> > config: i386-buildonly-randconfig-006-20250716 (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/config)
> > compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
> > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/reproduce)
> > 
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202507160730.0cXkgs0S-lkp@intel.com/
> > 
> > All warnings (new ones prefixed by >>):
> > 
> > >> kernel/sched/fair.c:5456:33: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
> >     5456 |                         cfs_rq->pelt_clock_throttled = 1;
> >          |                                                      ^ ~
> 
> Thanks for the report.
> 
> I don't think this will affect correctness since both cfs_rq's throttled
> and pelt_clock_throttled fields are used as true(not 0) or false(0). I
> used bitfield for them to save some space.
> 
> Change their types to either unsigned int or bool should cure this
> warning, I suppose bool looks more clear?
> 
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index dbe52e18b93a0..434f816a56701 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -737,8 +737,8 @@ struct cfs_rq {
>  	u64			throttled_clock_pelt_time;
>  	u64			throttled_clock_self;
>  	u64			throttled_clock_self_time;
> -	int			throttled:1;
> -	int			pelt_clock_throttled:1;
> +	bool			throttled:1;
> +	bool			pelt_clock_throttled:1;
>  	int			throttle_count;
>  	struct list_head	throttled_list;
>  	struct list_head	throttled_csd_list;
> 
> Hi LKP,
> 
> I tried using clang-19 but couldn't reproduce this warning and I don't
> have clang-20 at hand. Can you please apply the above hunk on top of
> this series and see if that warning is gone? Thanks.

Got it. Is it possible to give the reproduce step [1] a try? It can
download clang-20 to a local dir. If there is any issue, we will follow
up and check as early as possible.

[1] https://download.01.org/0day-ci/archive/20250716/202507160730.0cXkgs0S-lkp@intel.com/reproduce

> 
> Best regards,
> Aaron
> 
> >    kernel/sched/fair.c:5971:32: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
> >     5971 |                 cfs_rq->pelt_clock_throttled = 1;
> >          |                                              ^ ~
> >    kernel/sched/fair.c:6014:20: warning: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 [-Wsingle-bit-bitfield-constant-conversion]
> >     6014 |         cfs_rq->throttled = 1;
> >          |                           ^ ~
> >    3 warnings generated.
> > 
> > 
> > vim +/int +5456 kernel/sched/fair.c
> > 
> >   5372	
> >   5373	static bool
> >   5374	dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> >   5375	{
> >   5376		bool sleep = flags & DEQUEUE_SLEEP;
> >   5377		int action = UPDATE_TG;
> >   5378	
> >   5379		update_curr(cfs_rq);
> >   5380		clear_buddies(cfs_rq, se);
> >   5381	
> >   5382		if (flags & DEQUEUE_DELAYED) {
> >   5383			WARN_ON_ONCE(!se->sched_delayed);
> >   5384		} else {
> >   5385			bool delay = sleep;
> >   5386			/*
> >   5387			 * DELAY_DEQUEUE relies on spurious wakeups, special task
> >   5388			 * states must not suffer spurious wakeups, excempt them.
> >   5389			 */
> >   5390			if (flags & DEQUEUE_SPECIAL)
> >   5391				delay = false;
> >   5392	
> >   5393			WARN_ON_ONCE(delay && se->sched_delayed);
> >   5394	
> >   5395			if (sched_feat(DELAY_DEQUEUE) && delay &&
> >   5396			    !entity_eligible(cfs_rq, se)) {
> >   5397				update_load_avg(cfs_rq, se, 0);
> >   5398				set_delayed(se);
> >   5399				return false;
> >   5400			}
> >   5401		}
> >   5402	
> >   5403		if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
> >   5404			action |= DO_DETACH;
> >   5405	
> >   5406		/*
> >   5407		 * When dequeuing a sched_entity, we must:
> >   5408		 *   - Update loads to have both entity and cfs_rq synced with now.
> >   5409		 *   - For group_entity, update its runnable_weight to reflect the new
> >   5410		 *     h_nr_runnable of its group cfs_rq.
> >   5411		 *   - Subtract its previous weight from cfs_rq->load.weight.
> >   5412		 *   - For group entity, update its weight to reflect the new share
> >   5413		 *     of its group cfs_rq.
> >   5414		 */
> >   5415		update_load_avg(cfs_rq, se, action);
> >   5416		se_update_runnable(se);
> >   5417	
> >   5418		update_stats_dequeue_fair(cfs_rq, se, flags);
> >   5419	
> >   5420		update_entity_lag(cfs_rq, se);
> >   5421		if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
> >   5422			se->deadline -= se->vruntime;
> >   5423			se->rel_deadline = 1;
> >   5424		}
> >   5425	
> >   5426		if (se != cfs_rq->curr)
> >   5427			__dequeue_entity(cfs_rq, se);
> >   5428		se->on_rq = 0;
> >   5429		account_entity_dequeue(cfs_rq, se);
> >   5430	
> >   5431		/* return excess runtime on last dequeue */
> >   5432		return_cfs_rq_runtime(cfs_rq);
> >   5433	
> >   5434		update_cfs_group(se);
> >   5435	
> >   5436		/*
> >   5437		 * Now advance min_vruntime if @se was the entity holding it back,
> >   5438		 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
> >   5439		 * put back on, and if we advance min_vruntime, we'll be placed back
> >   5440		 * further than we started -- i.e. we'll be penalized.
> >   5441		 */
> >   5442		if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
> >   5443			update_min_vruntime(cfs_rq);
> >   5444	
> >   5445		if (flags & DEQUEUE_DELAYED)
> >   5446			finish_delayed_dequeue_entity(se);
> >   5447	
> >   5448		if (cfs_rq->nr_queued == 0) {
> >   5449			update_idle_cfs_rq_clock_pelt(cfs_rq);
> >   5450	#ifdef CONFIG_CFS_BANDWIDTH
> >   5451			if (throttled_hierarchy(cfs_rq)) {
> >   5452				struct rq *rq = rq_of(cfs_rq);
> >   5453	
> >   5454				list_del_leaf_cfs_rq(cfs_rq);
> >   5455				cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > > 5456				cfs_rq->pelt_clock_throttled = 1;
> >   5457			}
> >   5458	#endif
> >   5459		}
> >   5460	
> >   5461		return true;
> >   5462	}
> >   5463	
> > 
> > -- 
> > 0-DAY CI Kernel Test Service
> > https://github.com/intel/lkp-tests/wiki
>
[PATCH v3 update 3/5] sched/fair: Switch to task based throttle model
Posted by Aaron Lu 2 months, 3 weeks ago
On Wed, Jul 16, 2025 at 03:40:26PM +0800, Philip Li wrote:
> On Wed, Jul 16, 2025 at 02:57:07PM +0800, Aaron Lu wrote:
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index dbe52e18b93a0..434f816a56701 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -737,8 +737,8 @@ struct cfs_rq {
> >  	u64			throttled_clock_pelt_time;
> >  	u64			throttled_clock_self;
> >  	u64			throttled_clock_self_time;
> > -	int			throttled:1;
> > -	int			pelt_clock_throttled:1;
> > +	bool			throttled:1;
> > +	bool			pelt_clock_throttled:1;
> >  	int			throttle_count;
> >  	struct list_head	throttled_list;
> >  	struct list_head	throttled_csd_list;
> > 
> > Hi LKP,
> > 
> > I tried using clang-19 but couldn't reproduce this warning and I don't
> > have clang-20 at hand. Can you please apply the above hunk on top of
> > this series and see if that warning is gone? Thanks.
> 
> Got it, is it possible to give a try for the reproduce step [1], which can
> download clang-20 to local dir? If it has issue, we will follow up to check
> as early as possible.

I managed to install clang-20 and can see this warning. With the above
hunk applied, the warning is gone.

Here is the updated patch3:

From: Valentin Schneider <vschneid@redhat.com>
Date: Fri, 23 May 2025 15:30:15 +0800
Subject: [PATCH v3 update 3/5] sched/fair: Switch to task based throttle model

In current throttle model, when a cfs_rq is throttled, its entity will
be dequeued from cpu's rq, making tasks attached to it not able to run,
thus achieving the throttle target.

This has a drawback though: assume a task is a reader of percpu_rwsem
and is waiting. When it gets woken, it can not run till its task group's
next period comes, which can be a relatively long time. Waiting writer
will have to wait longer due to this, and it also makes further readers
build up and eventually trigger task hangs.

To improve this situation, change the throttle model to task based, i.e.
when a cfs_rq is throttled, record its throttled status but do not remove
it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
they get picked, add a task work to them so that when they return
to user, they can be dequeued there. In this way, tasks throttled will
not hold any kernel resources. And on unthrottle, enqueue back those
tasks so they can continue to run.

Throttled cfs_rq's PELT clock is handled differently now: previously the
cfs_rq's PELT clock is stopped once it entered throttled state but since
now tasks (in kernel mode) can continue to run, change the behaviour to
stop PELT clock when the throttled cfs_rq has no tasks left.

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
v3 update:
- fix compiler warning about int bit-field.

 kernel/sched/fair.c  | 336 ++++++++++++++++++++++---------------------
 kernel/sched/pelt.h  |   4 +-
 kernel/sched/sched.h |   3 +-
 3 files changed, 176 insertions(+), 167 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54c2a4df6a5d1..0eeea7f2e693d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5285,18 +5285,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_queued == 1) {
 		check_enqueue_throttle(cfs_rq);
-		if (!throttled_hierarchy(cfs_rq)) {
-			list_add_leaf_cfs_rq(cfs_rq);
-		} else {
+		list_add_leaf_cfs_rq(cfs_rq);
 #ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
 			struct rq *rq = rq_of(cfs_rq);
 
 			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
 				cfs_rq->throttled_clock = rq_clock(rq);
 			if (!cfs_rq->throttled_clock_self)
 				cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+
+			if (cfs_rq->pelt_clock_throttled) {
+				cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+					cfs_rq->throttled_clock_pelt;
+				cfs_rq->pelt_clock_throttled = 0;
+			}
 		}
+#endif
 	}
 }
 
@@ -5335,8 +5340,6 @@ static void set_delayed(struct sched_entity *se)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		cfs_rq->h_nr_runnable--;
-		if (cfs_rq_throttled(cfs_rq))
-			break;
 	}
 }
 
@@ -5357,8 +5360,6 @@ static void clear_delayed(struct sched_entity *se)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		cfs_rq->h_nr_runnable++;
-		if (cfs_rq_throttled(cfs_rq))
-			break;
 	}
 }
 
@@ -5444,8 +5445,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (flags & DEQUEUE_DELAYED)
 		finish_delayed_dequeue_entity(se);
 
-	if (cfs_rq->nr_queued == 0)
+	if (cfs_rq->nr_queued == 0) {
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
+			struct rq *rq = rq_of(cfs_rq);
+
+			list_del_leaf_cfs_rq(cfs_rq);
+			cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+			cfs_rq->pelt_clock_throttled = 1;
+		}
+#endif
+	}
 
 	return true;
 }
@@ -5784,6 +5795,10 @@ static void throttle_cfs_rq_work(struct callback_head *work)
 		WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
 		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
 		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+		/*
+		 * Must not set throttled before dequeue or dequeue will
+		 * mistakenly regard this task as an already throttled one.
+		 */
 		p->throttled = true;
 		resched_curr(rq);
 	}
@@ -5797,32 +5812,119 @@ void init_cfs_throttle_work(struct task_struct *p)
 	INIT_LIST_HEAD(&p->throttle_node);
 }
 
+/*
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
+ */
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+	WARN_ON_ONCE(p->se.on_rq);
+	list_del_init(&p->throttle_node);
+
+	/* task blocked after throttled */
+	if (flags & DEQUEUE_SLEEP) {
+		p->throttled = false;
+		return;
+	}
+
+	/*
+	 * task is migrating off its old cfs_rq, detach
+	 * the task's load from its old cfs_rq.
+	 */
+	if (task_on_rq_migrating(p))
+		detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+	/*
+	 * If the throttled task is enqueued to a throttled cfs_rq,
+	 * take the fast path by directly put the task on target
+	 * cfs_rq's limbo list, except when p is current because
+	 * the following race can cause p's group_node left in rq's
+	 * cfs_tasks list when it's throttled:
+	 *
+	 *        cpuX                       cpuY
+	 *   taskA ret2user
+	 *  throttle_cfs_rq_work()    sched_move_task(taskA)
+	 *  task_rq_lock acquired
+	 *  dequeue_task_fair(taskA)
+	 *  task_rq_lock released
+	 *                            task_rq_lock acquired
+	 *                            task_current_donor(taskA) == true
+	 *                            task_on_rq_queued(taskA) == true
+	 *                            dequeue_task(taskA)
+	 *                            put_prev_task(taskA)
+	 *                            sched_change_group()
+	 *                            enqueue_task(taskA) -> taskA's new cfs_rq
+	 *                                                   is throttled, go
+	 *                                                   fast path and skip
+	 *                                                   actual enqueue
+	 *                            set_next_task(taskA)
+	 *                          __set_next_task_fair(taskA)
+	 *                    list_move(&se->group_node, &rq->cfs_tasks); // bug
+	 *  schedule()
+	 *
+	 * In the above race, the task's current cfs_rq is on the same
+	 * rq as its previous cfs_rq because sched_move_task() does not
+	 * migrate the task, so we can use its current cfs_rq to derive
+	 * the rq and test whether the task is current.
+	 */
+	if (throttled_hierarchy(cfs_rq) &&
+	    !task_current_donor(rq_of(cfs_rq), p)) {
+		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+		return true;
+	}
+
+	/* we can't take the fast path, do an actual enqueue */
+	p->throttled = false;
+	return false;
+}
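
The return contract of the fast path above can be shown with a tiny standalone
model: a true return means "the task was parked on the target cfs_rq's limbo
list, skip the real enqueue", false means "p->throttled was cleared, fall
through to a normal enqueue". The toy_* names and the singly linked limbo list
below are made up for illustration only; this is not kernel code:

/* Toy model of enqueue_throttled_task()'s return contract; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct toy_task {
        bool throttled;
        struct toy_task *limbo_next;    /* stands in for p->throttle_node */
};

struct toy_cfs_rq {
        bool throttled_hierarchy;
        bool task_is_rq_current;        /* stands in for task_current_donor() */
        struct toy_task *limbo;         /* stands in for throttled_limbo_list */
};

/* true: handled via the fast path; false: caller must do a real enqueue */
static bool toy_enqueue_throttled_task(struct toy_cfs_rq *cfs_rq, struct toy_task *p)
{
        if (cfs_rq->throttled_hierarchy && !cfs_rq->task_is_rq_current) {
                p->limbo_next = cfs_rq->limbo;
                cfs_rq->limbo = p;
                return true;
        }

        p->throttled = false;
        return false;
}

int main(void)
{
        struct toy_cfs_rq throttled = { .throttled_hierarchy = true };
        struct toy_cfs_rq running = { 0 };
        struct toy_task a = { .throttled = true };
        struct toy_task b = { .throttled = true };
        bool handled;

        /* prints 1: A is parked on the limbo list, the real enqueue is skipped */
        printf("%d\n", toy_enqueue_throttled_task(&throttled, &a));

        handled = toy_enqueue_throttled_task(&running, &b);
        /* prints 0 0: B lost its throttled mark and needs a real enqueue */
        printf("%d %d\n", handled, b.throttled);
        return 0;
}

This is the same contract enqueue_task_fair() relies on further down: it
returns early only when both task_is_throttled(p) and enqueue_throttled_task(p)
are true.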
+
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
 static int tg_unthrottle_up(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+	struct task_struct *p, *tmp;
+
+	if (--cfs_rq->throttle_count)
+		return 0;
 
-	cfs_rq->throttle_count--;
-	if (!cfs_rq->throttle_count) {
+	if (cfs_rq->pelt_clock_throttled) {
 		cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
 					     cfs_rq->throttled_clock_pelt;
+		cfs_rq->pelt_clock_throttled = 0;
+	}
 
-		/* Add cfs_rq with load or one or more already running entities to the list */
-		if (!cfs_rq_is_decayed(cfs_rq))
-			list_add_leaf_cfs_rq(cfs_rq);
+	if (cfs_rq->throttled_clock_self) {
+		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
 
-		if (cfs_rq->throttled_clock_self) {
-			u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+		cfs_rq->throttled_clock_self = 0;
 
-			cfs_rq->throttled_clock_self = 0;
+		if (WARN_ON_ONCE((s64)delta < 0))
+			delta = 0;
 
-			if (WARN_ON_ONCE((s64)delta < 0))
-				delta = 0;
+		cfs_rq->throttled_clock_self_time += delta;
+	}
 
-			cfs_rq->throttled_clock_self_time += delta;
-		}
+	/* Re-enqueue the tasks that have been throttled at this level. */
+	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+		list_del_init(&p->throttle_node);
+		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
 	}
 
+	/* Add cfs_rq with load or one or more already running entities to the list */
+	if (!cfs_rq_is_decayed(cfs_rq))
+		list_add_leaf_cfs_rq(cfs_rq);
+
 	return 0;
 }
 
@@ -5851,17 +5953,25 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
+	if (cfs_rq->throttle_count++)
+		return 0;
+
 	/* group is entering throttled state, stop time */
-	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+	WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+	if (cfs_rq->nr_queued)
+		cfs_rq->throttled_clock_self = rq_clock(rq);
+	else {
+		/*
+		 * Nothing is queued: stop the PELT clock now. For cfs_rqs that
+		 * still have entities, PELT clock stop happens at dequeue time.
+		 */
 		list_del_leaf_cfs_rq(cfs_rq);
-
-		WARN_ON_ONCE(cfs_rq->throttled_clock_self);
-		if (cfs_rq->nr_queued)
-			cfs_rq->throttled_clock_self = rq_clock(rq);
+		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+		cfs_rq->pelt_clock_throttled = 1;
 	}
-	cfs_rq->throttle_count++;
 
+	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
 	return 0;
 }
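
The throttle_count handling above and in tg_unthrottle_up() is a plain nesting
counter: only the 0 -> 1 transition does the actual freeze work and only the
1 -> 0 transition undoes it, so a cfs_rq throttled by both its own quota and an
ancestor's quota goes through the expensive path exactly once in each
direction. A minimal standalone illustration of the pattern (no kernel API is
used here):

/* Standalone illustration of the throttle_count nesting pattern above. */
#include <stdio.h>

static int throttle_count;

static void toy_throttle_down(void)
{
        if (throttle_count++)
                return;         /* already in throttled state via an ancestor */
        printf("enter throttled state (stop clocks as needed)\n");
}

static void toy_unthrottle_up(void)
{
        if (--throttle_count)
                return;         /* an ancestor is still throttled */
        printf("leave throttled state (re-enqueue limbo tasks)\n");
}

int main(void)
{
        toy_throttle_down();    /* own quota exhausted: does the real work */
        toy_throttle_down();    /* an ancestor throttles too: nothing to do */
        toy_unthrottle_up();    /* the ancestor unthrottles: still throttled */
        toy_unthrottle_up();    /* count drops to zero: does the real work */
        return 0;
}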
 
@@ -5869,8 +5979,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct sched_entity *se;
-	long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+	int dequeue = 1;
 
 	raw_spin_lock(&cfs_b->lock);
 	/* This will start the period timer if necessary */
@@ -5893,68 +6002,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!dequeue)
 		return false;  /* Throttle no longer required. */
 
-	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
 	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();
 
-	queued_delta = cfs_rq->h_nr_queued;
-	runnable_delta = cfs_rq->h_nr_runnable;
-	idle_delta = cfs_rq->h_nr_idle;
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		int flags;
-
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		/*
-		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
-		 * This avoids teaching dequeue_entities() about throttled
-		 * entities and keeps things relatively simple.
-		 */
-		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
-		if (se->sched_delayed)
-			flags |= DEQUEUE_DELAYED;
-		dequeue_entity(qcfs_rq, se, flags);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-
-		if (qcfs_rq->load.weight) {
-			/* Avoid re-evaluating load for this entity: */
-			se = parent_entity(se);
-			break;
-		}
-	}
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		update_load_avg(qcfs_rq, se, 0);
-		se_update_runnable(se);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-	}
-
-	/* At this point se is NULL and we are at root level*/
-	sub_nr_running(rq, queued_delta);
-done:
 	/*
 	 * Note: distribution will already see us throttled via the
 	 * throttled-list.  rq->lock protects completion.
@@ -5970,9 +6022,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct sched_entity *se;
-	long queued_delta, runnable_delta, idle_delta;
-	long rq_h_nr_queued = rq->cfs.h_nr_queued;
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+	/*
+	 * It's possible we are called with !runtime_remaining, e.g. because the
+	 * user changed the quota setting (see tg_set_cfs_bandwidth()), or because
+	 * async unthrottle gave us positive runtime_remaining but other
+	 * still-running entities consumed that runtime before we got here.
+	 *
+	 * Either way, we must not unthrottle this cfs_rq without any runtime
+	 * remaining: any enqueue in tg_unthrottle_up() would immediately trigger
+	 * a throttle, which is not supposed to happen on the unthrottle path.
+	 */
+	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+		return;
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -6002,62 +6065,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 			if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
 				break;
 		}
-		goto unthrottle_throttle;
 	}
 
-	queued_delta = cfs_rq->h_nr_queued;
-	runnable_delta = cfs_rq->h_nr_runnable;
-	idle_delta = cfs_rq->h_nr_idle;
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
-		/* Handle any unfinished DELAY_DEQUEUE business first. */
-		if (se->sched_delayed) {
-			int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
-			dequeue_entity(qcfs_rq, se, flags);
-		} else if (se->on_rq)
-			break;
-		enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued += queued_delta;
-		qcfs_rq->h_nr_runnable += runnable_delta;
-		qcfs_rq->h_nr_idle += idle_delta;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(qcfs_rq))
-			goto unthrottle_throttle;
-	}
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
-		update_load_avg(qcfs_rq, se, UPDATE_TG);
-		se_update_runnable(se);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued += queued_delta;
-		qcfs_rq->h_nr_runnable += runnable_delta;
-		qcfs_rq->h_nr_idle += idle_delta;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(qcfs_rq))
-			goto unthrottle_throttle;
-	}
-
-	/* Start the fair server if un-throttling resulted in new runnable tasks */
-	if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
-		dl_server_start(&rq->fair_server);
-
-	/* At this point se is NULL and we are at root level*/
-	add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
 	assert_list_leaf_cfs_rq(rq);
 
 	/* Determine whether we need to wake up potentially idle CPU: */
@@ -6711,6 +6720,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void task_throttle_setup_work(struct task_struct *p) {}
 static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
@@ -6903,6 +6914,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	u64 slice = 0;
 
+	if (unlikely(task_is_throttled(p) && enqueue_throttled_task(p)))
+		return;
+
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
 	 * the cfs_rq utilization to select a frequency.
@@ -6955,10 +6969,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
 
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			goto enqueue_throttle;
-
 		flags = ENQUEUE_WAKEUP;
 	}
 
@@ -6980,10 +6990,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			goto enqueue_throttle;
 	}
 
 	if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -7013,7 +7019,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!task_new)
 		check_update_overutilized_status(rq);
 
-enqueue_throttle:
 	assert_list_leaf_cfs_rq(rq);
 
 	hrtick_update(rq);
@@ -7068,10 +7073,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
 
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			return 0;
-
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			slice = cfs_rq_min_slice(cfs_rq);
@@ -7108,10 +7109,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			return 0;
 	}
 
 	sub_nr_running(rq, h_nr_queued);
@@ -7145,6 +7142,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
  */
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
+	if (unlikely(task_is_throttled(p))) {
+		dequeue_throttled_task(p, flags);
+		return true;
+	}
+
 	if (!p->se.sched_delayed)
 		util_est_dequeue(&rq->cfs, p);
 
@@ -8813,19 +8815,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
+	struct task_struct *p;
+	bool throttled;
 
 again:
 	cfs_rq = &rq->cfs;
 	if (!cfs_rq->nr_queued)
 		return NULL;
 
+	throttled = false;
+
 	do {
 		/* Might not have done put_prev_entity() */
 		if (cfs_rq->curr && cfs_rq->curr->on_rq)
 			update_curr(cfs_rq);
 
-		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
-			goto again;
+		throttled |= check_cfs_rq_runtime(cfs_rq);
 
 		se = pick_next_entity(rq, cfs_rq);
 		if (!se)
@@ -8833,7 +8838,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
-	return task_of(se);
+	p = task_of(se);
+	if (unlikely(throttled))
+		task_throttle_setup_work(p);
+	return p;
 }
 
 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
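
task_throttle_setup_work() is introduced earlier in this series; the mechanism
behind it is the generic task_work facility: a callback_head queued with
task_work_add(p, work, TWA_RESUME) runs when the picked task returns to user
space, which is where throttle_cfs_rq_work() above does the actual dequeue. A
rough sketch of what such a setup helper can look like is below;
task_has_throttle_work() and the sched_throttle_work field are assumptions for
illustration, not code taken from this series:

/*
 * Rough sketch only: the real helper lives in an earlier patch of this
 * series. task_has_throttle_work() and the sched_throttle_work field
 * below are assumptions for illustration, not verified definitions.
 */
static void task_throttle_setup_work(struct task_struct *p)
{
        /* assumed helper: nothing to do if the work is already queued */
        if (task_has_throttle_work(p))
                return;

        /* kthreads and exiting tasks never return to user space */
        if (p->flags & (PF_EXITING | PF_KTHREAD))
                return;

        /* throttle_cfs_rq_work() then runs at return-to-user time */
        task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
}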
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa543c0f2..f921302dc40fb 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
 	u64 throttled;
 
-	if (unlikely(cfs_rq->throttle_count))
+	if (unlikely(cfs_rq->pelt_clock_throttled))
 		throttled = U64_MAX;
 	else
 		throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
-	if (unlikely(cfs_rq->throttle_count))
+	if (unlikely(cfs_rq->pelt_clock_throttled))
 		return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
 
 	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
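
With the new pelt_clock_throttled flag, cfs_rq_clock_pelt() behaves like a
clock that can be frozen and resumed without jumping: while frozen it reports a
constant value, and the frozen interval is accumulated into
throttled_clock_pelt_time so the reported clock continues smoothly once
unfrozen. A small standalone arithmetic check of that behaviour, using toy
numbers rather than kernel code:

/* Toy arithmetic model of cfs_rq_clock_pelt() freezing; not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_cfs_rq {
        bool pelt_clock_throttled;
        uint64_t throttled_clock_pelt;          /* rq PELT clock when frozen */
        uint64_t throttled_clock_pelt_time;     /* total time spent frozen */
};

static uint64_t toy_cfs_rq_clock_pelt(const struct toy_cfs_rq *cfs_rq,
                                      uint64_t rq_clock_pelt)
{
        if (cfs_rq->pelt_clock_throttled)
                return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;

        return rq_clock_pelt - cfs_rq->throttled_clock_pelt_time;
}

int main(void)
{
        struct toy_cfs_rq cfs_rq = { 0 };

        /* running: the cfs_rq clock simply follows the rq PELT clock */
        printf("%llu\n", (unsigned long long)toy_cfs_rq_clock_pelt(&cfs_rq, 1000)); /* 1000 */

        /* frozen at 1000: the reported clock stops advancing */
        cfs_rq.pelt_clock_throttled = true;
        cfs_rq.throttled_clock_pelt = 1000;
        printf("%llu\n", (unsigned long long)toy_cfs_rq_clock_pelt(&cfs_rq, 1800)); /* 1000 */

        /* unfrozen at 1800: the 800 frozen units are deducted from now on */
        cfs_rq.pelt_clock_throttled = false;
        cfs_rq.throttled_clock_pelt_time += 1800 - cfs_rq.throttled_clock_pelt;
        printf("%llu\n", (unsigned long long)toy_cfs_rq_clock_pelt(&cfs_rq, 2000)); /* 1200 */
        return 0;
}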
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b0c9559992d8a..51d000ab4a70d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -737,7 +737,8 @@ struct cfs_rq {
 	u64			throttled_clock_pelt_time;
 	u64			throttled_clock_self;
 	u64			throttled_clock_self_time;
-	int			throttled;
+	bool			throttled:1;
+	bool			pelt_clock_throttled:1;
 	int			throttle_count;
 	struct list_head	throttled_list;
 	struct list_head	throttled_csd_list;
-- 
2.39.5