sched/fair: Unify cfs_rq throttling via account_cfs_rq_runtime()

[tip: sched/core] sched/fair: Unify cfs_rq throttling via account_cfs_rq_runtime()
Posted by tip-bot2 for Peter Zijlstra 3 days, 11 hours ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     f666241e6bd5d9a494beca982e1953208dce531c
Gitweb:        https://git.kernel.org/tip/f666241e6bd5d9a494beca982e1953208dce531c
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Tue, 02 Jun 2026 07:10:05 
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 02 Jun 2026 12:26:13 +02:00

sched/fair: Unify cfs_rq throttling via account_cfs_rq_runtime()

assign_cfs_rq_runtime() during update_curr() sets the resched indicator
and relies on check_cfs_rq_runtime() during pick_next_task() /
put_prev_entity() to throttle the hierarchy once current task is
preempted / blocks.

Per-task throttle, on the other hand, uses throttle_cfs_rq() to simply
propagate the throttle signals, and then relies on task work to
individually throttle the runnable tasks on their way out to the
userspace.

Remove check_cfs_rq_runtime() and unify throttling into
account_cfs_rq_runtime() which only sets the cfs_rq->throttled,
cfs_rq->throttle_count indicators via throttle_cfs_rq() and optionally
adds the task work to the current task (donor) it is on the throttled
hierarchy.

throttle_cfs_rq() requests for sched_cfs_bandwidth_slice() worth of
bandwidth for the current hierarchy that enable it to continue running
uninterrupted when selected. For the rest, it requests a bare minimum of
"1" to ensure some bandwidth is available and pass the
"runtime_remaining > 0" checks once selected.

For SCHED_PROXY_EXEC, a mutex holder cannot exit to userspace without
dropping it first and the mutex_unlock() ensures proxy is stopped before
the mutex handoff which preserves the current semantics for running a
throttled task until it exits to the userspace even if it acts as a
donor.

  [ prateek: rebased on tip, comments, commit message. ]

Reviewed-By: Benjamin Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Aaron Lu <ziqianlu@bytedance.com>
Link: https://patch.msgid.link/20260602071005.11942-1-kprateek.nayak@amd.com
---
 kernel/sched/fair.c | 101 +++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 55 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3f3f09a..f4ed841 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -525,7 +525,7 @@ static int se_is_idle(struct sched_entity *se)
 #endif /* !CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -6388,8 +6388,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
 	return se;
 }
 
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
 	/*
@@ -6399,9 +6397,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
-	/* throttle cfs_rqs exceeding runtime */
-	check_cfs_rq_runtime(cfs_rq);
-
 	if (prev->on_rq) {
 		update_stats_wait_start_fair(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
@@ -6536,41 +6531,32 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
 	return cfs_rq->runtime_remaining > 0;
 }
 
-/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
-	guard(raw_spinlock)(&cfs_b->lock);
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq);
 
-	return __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
-}
-
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+static bool __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
 
 	if (likely(cfs_rq->runtime_remaining > 0))
-		return;
+		return false;
 
 	if (cfs_rq->throttled)
-		return;
+		return true;
 	/*
-	 * if we're unable to extend our runtime we resched so that the active
-	 * hierarchy can be throttled
+	 * throttle_cfs_rq() will try to extend the runtime first
+	 * before throttling the hierarchy.
 	 */
-	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-		resched_curr(rq_of(cfs_rq));
+	return throttle_cfs_rq(cfs_rq);
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
-		return;
+		return false;
 
-	__account_cfs_rq_runtime(cfs_rq, delta_exec);
+	return __account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -6858,10 +6844,24 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 
 static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
-	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct sched_entity *curr = cfs_rq->curr;
+	struct rq *rq = rq_of(cfs_rq);
 
 	scoped_guard(raw_spinlock, &cfs_b->lock) {
+		u64 target_runtime = 1;
+
+		/*
+		 * If cfs_rq->curr is still runnable, we are here from an
+		 * update_curr(). Request sysctl_sched_cfs_bandwidth_slice
+		 * worth of bandwidth to continue running.
+		 *
+		 * If the curr is not runnable, just request enough bandwidth
+		 * to be runnable next time the pick selects this cfs_rq.
+		 */
+		if (curr && curr->on_rq)
+			target_runtime = sched_cfs_bandwidth_slice();
+
 		/*
 		 * Check if We have raced with bandwidth becoming available. If
 		 * we actually throttled the timer might not unthrottle us for
@@ -6872,7 +6872,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		 *
 		 * This will start the period timer if necessary.
 		 */
-		if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1))
+		if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, target_runtime))
 			return false;
 
 		/*
@@ -6893,6 +6893,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	 */
 	cfs_rq->throttled = 1;
 	WARN_ON_ONCE(cfs_rq->throttled_clock);
+
+	/*
+	 * If current hierarchy was throttled, add throttle work to the
+	 * current donor. In case of proxy-execution, the execution
+	 * context cannot exit to the userspace while holding a mutex
+	 * and the rule of throttle deferral to only throttle the
+	 * throttled context at exit to userspace is still preserved.
+	 */
+	if (curr && curr->on_rq)
+		task_throttle_setup_work(rq->donor);
+
 	return true;
 }
 
@@ -7283,7 +7294,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 	if (!cfs_bandwidth_used())
 		return;
 
-	/* an active group must be handled by the update_curr()->put() path */
+	/* an active group must be handled by the update_curr() path */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 		return;
 
@@ -7293,8 +7304,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 
 	/* update runtime allocation */
 	account_cfs_rq_runtime(cfs_rq, 0);
-	if (cfs_rq->runtime_remaining <= 0)
-		throttle_cfs_rq(cfs_rq);
 }
 
 static void sync_throttle(struct task_group *tg, int cpu)
@@ -7324,25 +7333,6 @@ static void sync_throttle(struct task_group *tg, int cpu)
 		cfs_rq->pelt_clock_throttled = 1;
 }
 
-/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-	if (!cfs_bandwidth_used())
-		return false;
-
-	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
-		return false;
-
-	/*
-	 * it's possible for a throttled entity to be forced into a running
-	 * state (e.g. set_curr_task), in this case we're finished.
-	 */
-	if (cfs_rq_throttled(cfs_rq))
-		return true;
-
-	return throttle_cfs_rq(cfs_rq);
-}
-
 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
@@ -7596,8 +7586,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
 
 #else /* !CONFIG_CFS_BANDWIDTH: */
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
+static bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { return false; }
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static inline void sync_throttle(struct task_group *tg, int cpu) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -9934,8 +9923,6 @@ again:
 		if (cfs_rq->curr && cfs_rq->curr->on_rq)
 			update_curr(cfs_rq);
 
-		throttled |= check_cfs_rq_runtime(cfs_rq);
-
 		se = pick_next_entity(rq, cfs_rq, true);
 		if (!se)
 			goto again;
@@ -14853,8 +14840,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
  */
 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
-	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	struct cfs_rq *cfs_rq;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -15036,6 +15023,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
 static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 {
 	struct sched_entity *se = &p->se;
+	bool throttled = false;
 
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -15046,9 +15034,12 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 
 		set_next_entity(cfs_rq, se, first);
 		/* ensure bandwidth has been allocated on our new cfs_rq */
-		account_cfs_rq_runtime(cfs_rq, 0);
+		throttled |= account_cfs_rq_runtime(cfs_rq, 0);
 	}
 
+	if (throttled)
+		task_throttle_setup_work(p);
+
 	se = &p->se;
 
 	if (task_on_rq_queued(p)) {