From nobody Tue Apr 7 01:33:44 2026 Received: from desiato.infradead.org (desiato.infradead.org [90.155.92.199]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 94962365A1C; Tue, 17 Mar 2026 10:47:46 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=90.155.92.199 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1773744469; cv=none; b=Z4mF3BpqCgEjTagdjVU60GNsjnGHouEH2iY21cDQC1LetNVr8hYSGObOvByM0EtT90P4t5ANGKKy94kuv5t2BlVXA8oOuRc/8MpALRbLyt58w5zJYWHinsAqQv5gvizTrPdZpObZfCJCNPCkxHtQV1xy5U86usfNGeKzO22cjWw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1773744469; c=relaxed/simple; bh=dqCcAXFRNfuoGgrf+WlIR8eJDUCDHGl7msCw3V+8qhY=; h=Message-ID:Date:From:To:Cc:Subject:References:MIME-Version: Content-Type; b=HLTNO2dEDuKGEU+Y1VR+uadqDAxLm5FP87S10kdJnzmVCfK6Yg3a3LlDgYB81pSNfXT5PsjLS5CCga2OvCXH+MyOIgZXiLMGO4IhYnc2Y2tvysQ/8CwdbhDfYWIqSbOlshsl9ujkcP+MUcQZ3XLGuR824s5VkPIJb8Fj4V4W5bk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=infradead.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=cu1np/3E; arc=none smtp.client-ip=90.155.92.199 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=infradead.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="cu1np/3E" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=desiato.20200630; h=Content-Type:MIME-Version:References: Subject:Cc:To:From:Date:Message-ID:Sender:Reply-To:Content-Transfer-Encoding: Content-ID:Content-Description:In-Reply-To; 
bh=MUmdy1hmWZji1oYBTrFsPfb5N9nygfW8f4vys4PyEfk=; b=cu1np/3EZEVETHsWafF27pp1ER dil3HSl72/YGi6ycGQBiEUNoTpvkQrrwj2doz7m9JGnvcqMbHWhaUBbLyIBqQwyRyylb+d9SyVtr9 u6/Dk5r31YHTGuNKbVruXiz08GvSnwSLZLWt9doLHHvtmg9cIwMZ81MM47mJ94Z9fnP6quDuIB8lU B7+xylUdEZQ3vD13tQLyxmcwBOHX0GALWPu1ZFZgChxUt6Dldw6iivsQHyDw527ztIM5XOk5V64qs r7ETeAuWm/WM54i51AI5pC7O7EeW7hScz2+g50OtVLYNEexalgQQncuJyG9kZOoMT99qllOAamKvV hfeJiVog==; Received: from 2001-1c00-8d85-5700-266e-96ff-fe07-7dcc.cable.dynamic.v6.ziggo.nl ([2001:1c00:8d85:5700:266e:96ff:fe07:7dcc] helo=noisy.programming.kicks-ass.net) by desiato.infradead.org with esmtpsa (Exim 4.98.2 #2 (Red Hat Linux)) id 1w2RxZ-00000008kbw-0ZFl; Tue, 17 Mar 2026 10:47:37 +0000 Received: by noisy.programming.kicks-ass.net (Postfix, from userid 0) id 3423D3032FC; Tue, 17 Mar 2026 11:47:35 +0100 (CET) Message-ID: <20260317104343.225156112@infradead.org> User-Agent: quilt/0.68 Date: Tue, 17 Mar 2026 10:51:20 +0100 From: Peter Zijlstra To: mingo@kernel.org Cc: longman@redhat.com, chenridong@huaweicloud.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com, cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, jstultz@google.com, kprateek.nayak@amd.com Subject: [RFC][PATCH 7/8] sched: Remove sched_class::pick_next_task() References: <20260317095113.387450089@infradead.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The reason for pick_next_task_fair() is the put/set optimization that avoids touching the common ancestors. However, it is possible to implement this in the put_prev_task() and set_next_task() calls as used in put_prev_set_next_task(). 
Notably, put_prev_set_next_task() is the only site that: - calls put_prev_task() with a .next argument; - calls set_next_task() with .first =3D true. This means that put_prev_task() can determine the common hierarchy and stop there, and then set_next_task() can terminate where put_prev_task() stopped. Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/core.c | 27 +++------ kernel/sched/fair.c | 153 ++++++++++++++--------------------------------= ----- kernel/sched/sched.h | 14 ---- 3 files changed, 52 insertions(+), 142 deletions(-) --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5924,16 +5924,15 @@ __pick_next_task(struct rq *rq, struct t if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && rq->nr_running =3D=3D rq->cfs.h_nr_queued)) { =20 - p =3D pick_next_task_fair(rq, prev, rf); + p =3D pick_task_fair(rq, rf); if (unlikely(p =3D=3D RETRY_TASK)) goto restart; =20 /* Assume the next prioritized class is idle_sched_class */ - if (!p) { + if (!p) p =3D pick_task_idle(rq, rf); - put_prev_set_next_task(rq, prev, p); - } =20 + put_prev_set_next_task(rq, prev, p); return p; } =20 @@ -5941,20 +5940,12 @@ __pick_next_task(struct rq *rq, struct t prev_balance(rq, prev, rf); =20 for_each_active_class(class) { - if (class->pick_next_task) { - p =3D class->pick_next_task(rq, prev, rf); - if (unlikely(p =3D=3D RETRY_TASK)) - goto restart; - if (p) - return p; - } else { - p =3D class->pick_task(rq, rf); - if (unlikely(p =3D=3D RETRY_TASK)) - goto restart; - if (p) { - put_prev_set_next_task(rq, prev, p); - return p; - } + p =3D class->pick_task(rq, rf); + if (unlikely(p =3D=3D RETRY_TASK)) + goto restart; + if (p) { + put_prev_set_next_task(rq, prev, p); + return p; } } =20 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8891,7 +8891,7 @@ static void wakeup_preempt_fair(struct r resched_curr_lazy(rq); } =20 -static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *= rf) +struct task_struct *pick_task_fair(struct rq *rq,
struct rq_flags *rf) __must_hold(__rq_lockp(rq)) { struct sched_entity *se; @@ -8934,91 +8934,6 @@ static struct task_struct *pick_task_fai return NULL; } =20 -static void __set_next_task_fair(struct rq *rq, struct task_struct *p, boo= l first); -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool = first); - -struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_fla= gs *rf) - __must_hold(__rq_lockp(rq)) -{ - struct sched_entity *se; - struct task_struct *p; - int new_tasks; - -again: - p =3D pick_task_fair(rq, rf); - if (!p) - goto idle; - se =3D &p->se; - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class !=3D &fair_sched_class) - goto simple; - - __put_prev_set_next_dl_server(rq, prev, p); - - /* - * Because of the set_next_buddy() in dequeue_task_fair() it is rather - * likely that a next task is from the same cgroup as the current. - * - * Therefore attempt to avoid putting and setting the entire cgroup - * hierarchy, only change the part that actually changes. - * - * Since we haven't yet done put_prev_entity and if the selected task - * is a different task than we started out with, try and touch the - * least amount of cfs_rqs. 
- */ - if (prev !=3D p) { - struct sched_entity *pse =3D &prev->se; - struct cfs_rq *cfs_rq; - - while (!(cfs_rq =3D is_same_group(se, pse))) { - int se_depth =3D se->depth; - int pse_depth =3D pse->depth; - - if (se_depth <=3D pse_depth) { - put_prev_entity(cfs_rq_of(pse), pse); - pse =3D parent_entity(pse); - } - if (se_depth >=3D pse_depth) { - set_next_entity(cfs_rq_of(se), se, true); - se =3D parent_entity(se); - } - } - - put_prev_entity(cfs_rq, pse); - set_next_entity(cfs_rq, se, true); - - __set_next_task_fair(rq, p, true); - } - - return p; - -simple: -#endif /* CONFIG_FAIR_GROUP_SCHED */ - put_prev_set_next_task(rq, prev, p); - return p; - -idle: - if (rf) { - new_tasks =3D sched_balance_newidle(rq, rf); - - /* - * Because sched_balance_newidle() releases (and re-acquires) - * rq->lock, it is possible for any higher priority task to - * appear. In that case we must re-start the pick_next_entity() - * loop. - */ - if (new_tasks < 0) - return RETRY_TASK; - - if (new_tasks > 0) - goto again; - } - - return NULL; -} - static struct task_struct * fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) __must_hold(__rq_lockp(dl_se->rq)) @@ -9042,10 +8957,28 @@ static void put_prev_task_fair(struct rq { struct sched_entity *se =3D &prev->se; struct cfs_rq *cfs_rq; + struct sched_entity *nse =3D NULL; =20 - for_each_sched_entity(se) { +#ifdef CONFIG_FAIR_GROUP_SCHED + if (next && next->sched_class =3D=3D &fair_sched_class) + nse =3D &next->se; +#endif + + while (se) { cfs_rq =3D cfs_rq_of(se); - put_prev_entity(cfs_rq, se); + if (!nse || cfs_rq->curr) + put_prev_entity(cfs_rq, se); +#ifdef CONFIG_FAIR_GROUP_SCHED + if (nse) { + if (is_same_group(se, nse)) + break; + if (nse->depth >=3D se->depth) + nse =3D parent_entity(nse); + if (nse->depth > se->depth) + continue; + } +#endif + se =3D parent_entity(se); } } =20 @@ -13566,10 +13499,30 @@ static void switched_to_fair(struct rq * } } =20 -static void __set_next_task_fair(struct rq *rq, struct 
task_struct *p, boo= l first) +/* + * Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool = first) { struct sched_entity *se =3D &p->se; =20 + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq =3D cfs_rq_of(se); + + if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) && + first && cfs_rq->curr) + break; + + set_next_entity(cfs_rq, se, true); + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } + + se =3D &p->se; + if (task_on_rq_queued(p)) { /* * Move the next running task to the front of the list, so our @@ -13589,27 +13542,6 @@ static void __set_next_task_fair(struct sched_fair_update_stop_tick(rq, p); } =20 -/* - * Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool = first) -{ - struct sched_entity *se =3D &p->se; - - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq =3D cfs_rq_of(se); - - set_next_entity(cfs_rq, se, first); - /* ensure bandwidth has been allocated on our new cfs_rq */ - account_cfs_rq_runtime(cfs_rq, 0); - } - - __set_next_task_fair(rq, p, first); -} - void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline =3D RB_ROOT_CACHED; @@ -13921,7 +13853,6 @@ DEFINE_SCHED_CLASS(fair) =3D { .wakeup_preempt =3D wakeup_preempt_fair, =20 .pick_task =3D pick_task_fair, - .pick_next_task =3D pick_next_task_fair, .put_prev_task =3D put_prev_task_fair, .set_next_task =3D set_next_task_fair, =20 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2538,17 +2538,6 @@ struct sched_class { * schedule/pick_next_task: rq->lock */ struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); - /* - * Optional! 
When implemented pick_next_task() should be equivalent to: - * - * next =3D pick_task(); - * if (next) { - * put_prev_task(prev); - * set_next_task_first(next); - * } - */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *= prev, - struct rq_flags *rf); =20 /* * sched_change: @@ -2761,8 +2750,7 @@ static inline bool sched_fair_runnable(s return rq->cfs.nr_queued > 0; } =20 -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_= struct *prev, - struct rq_flags *rf); +extern struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *= rf); extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *= rf); =20 #define SCA_CHECK 0x01