[tip: sched/core] sched/fair: Convert cfs bandwidth throttling to use guards

Posted by tip-bot2 for K Prateek Nayak 3 days, 11 hours ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     1abbecd1d2d2fdd96e52f541f07ee2b163631bee
Gitweb:        https://git.kernel.org/tip/1abbecd1d2d2fdd96e52f541f07ee2b163631bee
Author:        K Prateek Nayak <kprateek.nayak@amd.com>
AuthorDate:    Tue, 02 Jun 2026 05:00:01 
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 02 Jun 2026 12:26:11 +02:00

sched/fair: Convert cfs bandwidth throttling to use guards

Routine conversion of rcu_read_lock(), spin_lock*, and rq_lock usage
within the cfs bandwidth controller to use class guards.

Only notable changes are:

 - Checking for "cfs_rq->runtime_remaining <= 0" instead of the inverse
   to spot a throttle and break early. This also saves the need
   for extra indentation in the unthrottle case.

 - Reordering of list_del_rcu() against throttled_clock indicator update
   in unthrottle_cfs_rq(). Both are done with "cfs_b->lock" held after
   the "cfs_rq->throttled" is cleared which make the reordering safe
   against concurrent list modifications.

No functional changes intended.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ben Segall <bsegall@google.com>
Tested-by: Aaron Lu <ziqianlu@bytedance.com>
Link: https://patch.msgid.link/20260602050005.11160-2-kprateek.nayak@amd.com
---
 kernel/sched/fair.c | 193 ++++++++++++++++++++-----------------------
 1 file changed, 90 insertions(+), 103 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1d4ed88..261e5ce 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5035,13 +5035,13 @@ static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
 	 */
 	rq_clock_start_loop_update(rq);
 
-	rcu_read_lock();
+	guard(rcu)();
+
 	list_for_each_entry_rcu(tg, &task_groups, list) {
 		struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
 
 		clear_tg_load_avg(cfs_rq);
 	}
-	rcu_read_unlock();
 
 	rq_clock_stop_loop_update(rq);
 }
@@ -6540,13 +6540,10 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	int ret;
 
-	raw_spin_lock(&cfs_b->lock);
-	ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
-	raw_spin_unlock(&cfs_b->lock);
+	guard(raw_spinlock)(&cfs_b->lock);
 
-	return ret;
+	return __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
 }
 
 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
@@ -6835,33 +6832,32 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	int dequeue = 1;
 
-	raw_spin_lock(&cfs_b->lock);
-	/* This will start the period timer if necessary */
-	if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+	scoped_guard(raw_spinlock, &cfs_b->lock) {
 		/*
-		 * We have raced with bandwidth becoming available, and if we
-		 * actually throttled the timer might not unthrottle us for an
-		 * entire period. We additionally needed to make sure that any
-		 * subsequent check_cfs_rq_runtime calls agree not to throttle
-		 * us, as we may commit to do cfs put_prev+pick_next, so we ask
-		 * for 1ns of runtime rather than just check cfs_b.
+		 * Check if We have raced with bandwidth becoming available. If
+		 * we actually throttled the timer might not unthrottle us for
+		 * an entire period. We additionally needed to make sure that
+		 * any subsequent check_cfs_rq_runtime calls agree not to
+		 * throttle us, as we may commit to do cfs put_prev+pick_next,
+		 * so we ask for 1ns of runtime rather than just check cfs_b.
+		 *
+		 * This will start the period timer if necessary.
+		 */
+		if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1))
+			return false;
+
+		/*
+		 * No bandwidth available; Add ourselves on the list to be
+		 * unthrottled later.
 		 */
-		dequeue = 0;
-	} else {
 		list_add_tail_rcu(&cfs_rq->throttled_list,
 				  &cfs_b->throttled_cfs_rq);
 	}
-	raw_spin_unlock(&cfs_b->lock);
-
-	if (!dequeue)
-		return false;  /* Throttle no longer required. */
 
 	/* freeze hierarchy runnable averages while throttled */
-	rcu_read_lock();
-	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
-	rcu_read_unlock();
+	scoped_guard(rcu)
+		walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 
 	/*
 	 * Note: distribution will already see us throttled via the
@@ -6894,13 +6890,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	update_rq_clock(rq);
 
-	raw_spin_lock(&cfs_b->lock);
-	if (cfs_rq->throttled_clock) {
+	scoped_guard(raw_spinlock, &cfs_b->lock) {
+		list_del_rcu(&cfs_rq->throttled_list);
+
+		if (!cfs_rq->throttled_clock)
+			break;
+
 		cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
 		cfs_rq->throttled_clock = 0;
 	}
-	list_del_rcu(&cfs_rq->throttled_list);
-	raw_spin_unlock(&cfs_b->lock);
 
 	/* update hierarchical throttle state */
 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
@@ -6929,9 +6927,8 @@ static void __cfsb_csd_unthrottle(void *arg)
 {
 	struct cfs_rq *cursor, *tmp;
 	struct rq *rq = arg;
-	struct rq_flags rf;
 
-	rq_lock(rq, &rf);
+	guard(rq_lock)(rq);
 
 	/*
 	 * Iterating over the list can trigger several call to
@@ -6948,7 +6945,7 @@ static void __cfsb_csd_unthrottle(void *arg)
 	 * race with group being freed in the window between removing it
 	 * from the list and advancing to the next entry in the list.
 	 */
-	rcu_read_lock();
+	guard(rcu)();
 
 	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
 				 throttled_csd_list) {
@@ -6958,10 +6955,7 @@ static void __cfsb_csd_unthrottle(void *arg)
 			unthrottle_cfs_rq(cursor);
 	}
 
-	rcu_read_unlock();
-
 	rq_clock_stop_loop_update(rq);
-	rq_unlock(rq, &rf);
 }
 
 static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
@@ -7001,11 +6995,11 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 	u64 runtime, remaining = 1;
 	bool throttled = false;
 	struct cfs_rq *cfs_rq, *tmp;
-	struct rq_flags rf;
 	struct rq *rq;
 	LIST_HEAD(local_unthrottle);
 
-	rcu_read_lock();
+	guard(rcu)();
+
 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
 				throttled_list) {
 		rq = rq_of(cfs_rq);
@@ -7015,65 +7009,63 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 			break;
 		}
 
-		rq_lock_irqsave(rq, &rf);
+		guard(rq_lock_irqsave)(rq);
+
 		if (!cfs_rq_throttled(cfs_rq))
-			goto next;
+			continue;
 
 		/* Already queued for async unthrottle */
 		if (!list_empty(&cfs_rq->throttled_csd_list))
-			goto next;
+			continue;
 
 		/* By the above checks, this should never be true */
 		WARN_ON_ONCE(cfs_rq->runtime_remaining > 0);
 
-		raw_spin_lock(&cfs_b->lock);
-		runtime = -cfs_rq->runtime_remaining + 1;
-		if (runtime > cfs_b->runtime)
-			runtime = cfs_b->runtime;
-		cfs_b->runtime -= runtime;
-		remaining = cfs_b->runtime;
-		raw_spin_unlock(&cfs_b->lock);
+		scoped_guard(raw_spinlock, &cfs_b->lock) {
+			runtime = -cfs_rq->runtime_remaining + 1;
+			if (runtime > cfs_b->runtime)
+				runtime = cfs_b->runtime;
+			cfs_b->runtime -= runtime;
+			remaining = cfs_b->runtime;
+		}
 
 		cfs_rq->runtime_remaining += runtime;
 
-		/* we check whether we're throttled above */
-		if (cfs_rq->runtime_remaining > 0) {
-			if (cpu_of(rq) != this_cpu) {
-				unthrottle_cfs_rq_async(cfs_rq);
-			} else {
-				/*
-				 * We currently only expect to be unthrottling
-				 * a single cfs_rq locally.
-				 */
-				WARN_ON_ONCE(!list_empty(&local_unthrottle));
-				list_add_tail(&cfs_rq->throttled_csd_list,
-					      &local_unthrottle);
-			}
-		} else {
+		/*
+		 * Ran out of bandwidth during distribution!
+		 * Indicate throttled entities and break early.
+		 */
+		if (cfs_rq->runtime_remaining <= 0) {
 			throttled = true;
+			break;
 		}
 
-next:
-		rq_unlock_irqrestore(rq, &rf);
+		/* we check whether we're throttled above */
+		if (cpu_of(rq) != this_cpu) {
+			unthrottle_cfs_rq_async(cfs_rq);
+			continue;
+		}
+
+		/*
+		 * We currently only expect to be unthrottling
+		 * a single cfs_rq locally.
+		 */
+		WARN_ON_ONCE(!list_empty(&local_unthrottle));
+		list_add_tail(&cfs_rq->throttled_csd_list, &local_unthrottle);
 	}
 
 	list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
 				 throttled_csd_list) {
 		struct rq *rq = rq_of(cfs_rq);
 
-		rq_lock_irqsave(rq, &rf);
+		guard(rq_lock_irqsave)(rq);
 
 		list_del_init(&cfs_rq->throttled_csd_list);
-
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
-
-		rq_unlock_irqrestore(rq, &rf);
 	}
 	WARN_ON_ONCE(!list_empty(&local_unthrottle));
 
-	rcu_read_unlock();
-
 	return throttled;
 }
 
@@ -7196,7 +7188,8 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (slack_runtime <= 0)
 		return;
 
-	raw_spin_lock(&cfs_b->lock);
+	guard(raw_spinlock)(&cfs_b->lock);
+
 	if (cfs_b->quota != RUNTIME_INF) {
 		cfs_b->runtime += slack_runtime;
 
@@ -7205,7 +7198,6 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 		    !list_empty(&cfs_b->throttled_cfs_rq))
 			start_cfs_slack_bandwidth(cfs_b);
 	}
-	raw_spin_unlock(&cfs_b->lock);
 
 	/* even if it's not valid for return we don't want to try again */
 	cfs_rq->runtime_remaining -= slack_runtime;
@@ -7228,25 +7220,21 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  */
 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 {
-	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
-	unsigned long flags;
-
 	/* confirm we're still not at a refresh boundary */
-	raw_spin_lock_irqsave(&cfs_b->lock, flags);
-	cfs_b->slack_started = false;
+	scoped_guard(raw_spinlock_irqsave, &cfs_b->lock) {
+		u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
 
-	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
-		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
-		return;
-	}
+		cfs_b->slack_started = false;
 
-	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
-		runtime = cfs_b->runtime;
+		if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+			return;
 
-	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
+		if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
+			runtime = cfs_b->runtime;
 
-	if (!runtime)
-		return;
+		if (!runtime)
+			return;
+	}
 
 	distribute_cfs_runtime(cfs_b);
 }
@@ -7335,18 +7323,18 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, period_timer);
-	unsigned long flags;
 	int overrun;
 	int idle = 0;
 	int count = 0;
 
-	raw_spin_lock_irqsave(&cfs_b->lock, flags);
+	CLASS(raw_spinlock_irqsave, cfsb_guard)(&cfs_b->lock);
+
 	for (;;) {
 		overrun = hrtimer_forward_now(timer, cfs_b->period);
 		if (!overrun)
 			break;
 
-		idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+		idle = do_sched_cfs_period_timer(cfs_b, overrun, cfsb_guard.flags);
 
 		if (++count > 3) {
 			u64 new, old = ktime_to_ns(cfs_b->period);
@@ -7379,11 +7367,13 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 			count = 0;
 		}
 	}
-	if (idle)
+
+	if (idle) {
 		cfs_b->period_active = 0;
-	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
+		return HRTIMER_NORESTART;
+	}
 
-	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+	return HRTIMER_RESTART;
 }
 
 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
@@ -7450,14 +7440,12 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	 */
 	for_each_possible_cpu(i) {
 		struct rq *rq = cpu_rq(i);
-		unsigned long flags;
 
 		if (list_empty(&rq->cfsb_csd_list))
 			continue;
 
-		local_irq_save(flags);
-		__cfsb_csd_unthrottle(rq);
-		local_irq_restore(flags);
+		scoped_guard(irqsave)
+			__cfsb_csd_unthrottle(rq);
 	}
 }
 
@@ -7475,16 +7463,15 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
 
 	lockdep_assert_rq_held(rq);
 
-	rcu_read_lock();
+	guard(rcu)();
+
 	list_for_each_entry_rcu(tg, &task_groups, list) {
 		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 		struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
 
-		raw_spin_lock(&cfs_b->lock);
-		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
-		raw_spin_unlock(&cfs_b->lock);
+		scoped_guard(raw_spinlock, &cfs_b->lock)
+			cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 	}
-	rcu_read_unlock();
 }
 
 /* cpu offline callback */
@@ -7505,7 +7492,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 	 */
 	rq_clock_start_loop_update(rq);
 
-	rcu_read_lock();
+	guard(rcu)();
+
 	list_for_each_entry_rcu(tg, &task_groups, list) {
 		struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq));
 
@@ -7528,7 +7516,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		cfs_rq->runtime_remaining = 1;
 		unthrottle_cfs_rq(cfs_rq);
 	}
-	rcu_read_unlock();
 
 	rq_clock_stop_loop_update(rq);
 }