From nobody Wed Feb 11 06:27:37 2026
From: Daniel Bristot de Oliveira
To: Ingo Molnar , Peter Zijlstra , Juri Lelli , Vincent Guittot
Cc: Dietmar Eggemann , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider , linux-kernel@vger.kernel.org, Luca Abeni , Tommaso Cucinotta , Thomas Gleixner , Joel Fernandes , Vineeth Pillai , Shuah Khan , bristot@kernel.org, Phil Auld
Subject: [PATCH v4 1/7] sched: Unify runtime accounting across classes
Date: Thu, 31 Aug 2023 22:28:52 +0200
Message-Id: <093be922c23781bc90c2fde27eaad9ef6fc3051c.1693510979.git.bristot@kernel.org>

From: Peter Zijlstra

All classes use sched_entity::exec_start to track runtime and have copies
of the exact same code around to compute runtime. Collapse all that.
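To make the shape of the change concrete, here is a minimal user-space C sketch of the "now - exec_start" accounting pattern that each class previously open-coded and that the patch centralizes in update_curr_common(). The struct layout, the clock field, and the *_sketch names are simplified stand-ins for illustration, not the kernel's actual types or API:

	/* Simplified stand-ins for the kernel types; illustration only. */
	#include <stdio.h>
	#include <stdint.h>

	struct sched_entity_sketch {
		int64_t exec_start;       /* when the entity last started running */
		int64_t sum_exec_runtime; /* accumulated runtime */
	};

	struct rq_sketch {
		int64_t clock_task;       /* stand-in for rq_clock_task(rq) */
		struct sched_entity_sketch *curr;
	};

	/*
	 * Common runtime accounting shared by all classes: compute the delta
	 * since exec_start, advance exec_start, accumulate the runtime.
	 * A delta <= 0 means "nothing to account" and is returned as-is.
	 */
	static int64_t update_curr_common_sketch(struct rq_sketch *rq)
	{
		struct sched_entity_sketch *curr = rq->curr;
		int64_t delta_exec = rq->clock_task - curr->exec_start;

		if (delta_exec <= 0)
			return delta_exec;

		curr->exec_start = rq->clock_task;
		curr->sum_exec_runtime += delta_exec;
		return delta_exec;
	}

	int main(void)
	{
		struct sched_entity_sketch se = { .exec_start = 100 };
		struct rq_sketch rq = { .clock_task = 250, .curr = &se };

		int64_t delta = update_curr_common_sketch(&rq);
		printf("delta=%lld sum=%lld\n", (long long)delta,
		       (long long)se.sum_exec_runtime); /* delta=150 sum=150 */
		return 0;
	}

In the real patch the per-class functions (update_curr_dl(), update_curr_rt(), put_prev_task_stop()) keep only their class-specific work and call the shared helper for the delta, as the diff below shows.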
Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Reviewed-by: Steven Rostedt (Google) --- include/linux/sched.h | 2 +- kernel/sched/deadline.c | 15 +++-------- kernel/sched/fair.c | 57 ++++++++++++++++++++++++++++++---------- kernel/sched/rt.c | 15 +++-------- kernel/sched/sched.h | 12 ++------- kernel/sched/stop_task.c | 13 +-------- 6 files changed, 53 insertions(+), 61 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 177b3f3676ef..639f6eb9bd4f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -520,7 +520,7 @@ struct sched_statistics { u64 block_max; s64 sum_block_runtime; =20 - u64 exec_max; + s64 exec_max; u64 slice_max; =20 u64 nr_migrations_cold; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 58b542bf2893..9a09d9dafd88 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1299,9 +1299,8 @@ static void update_curr_dl(struct rq *rq) { struct task_struct *curr =3D rq->curr; struct sched_dl_entity *dl_se =3D &curr->dl; - u64 delta_exec, scaled_delta_exec; + s64 delta_exec, scaled_delta_exec; int cpu =3D cpu_of(rq); - u64 now; =20 if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1314,21 +1313,13 @@ static void update_curr_dl(struct rq *rq) * natural solution, but the full ramifications of this * approach need further study. */ - now =3D rq_clock_task(rq); - delta_exec =3D now - curr->se.exec_start; - if (unlikely((s64)delta_exec <=3D 0)) { + delta_exec =3D update_curr_common(rq); + if (unlikely(delta_exec <=3D 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; return; } =20 - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - trace_sched_stat_runtime(curr, delta_exec, 0); - - update_current_exec_runtime(curr, now, delta_exec); - if (dl_entity_is_special(dl_se)) return; =20 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 911d0063763c..52c8219623b1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1092,23 +1092,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_r= q) } #endif /* CONFIG_SMP */ =20 -/* - * Update the current task's runtime statistics. - */ -static void update_curr(struct cfs_rq *cfs_rq) +static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { - struct sched_entity *curr =3D cfs_rq->curr; - u64 now =3D rq_clock_task(rq_of(cfs_rq)); - u64 delta_exec; - - if (unlikely(!curr)) - return; + u64 now =3D rq_clock_task(rq); + s64 delta_exec; =20 delta_exec =3D now - curr->exec_start; - if (unlikely((s64)delta_exec <=3D 0)) - return; + if (unlikely(delta_exec <=3D 0)) + return delta_exec; =20 curr->exec_start =3D now; + curr->sum_exec_runtime +=3D delta_exec; =20 if (schedstat_enabled()) { struct sched_statistics *stats; @@ -1118,8 +1112,43 @@ static void update_curr(struct cfs_rq *cfs_rq) max(delta_exec, stats->exec_max)); } =20 - curr->sum_exec_runtime +=3D delta_exec; - schedstat_add(cfs_rq->exec_clock, delta_exec); + return delta_exec; +} + +/* + * Used by other classes to account runtime. + */ +s64 update_curr_common(struct rq *rq) +{ + struct task_struct *curr =3D rq->curr; + s64 delta_exec; + + delta_exec =3D update_curr_se(rq, &curr->se); + if (unlikely(delta_exec <=3D 0)) + return delta_exec; + + trace_sched_stat_runtime(curr, delta_exec, 0); + + account_group_exec_runtime(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); + + return delta_exec; +} + +/* + * Update the current task's runtime statistics. 
+ */ +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr =3D cfs_rq->curr; + s64 delta_exec; + + if (unlikely(!curr)) + return; + + delta_exec =3D update_curr_se(rq_of(cfs_rq), curr); + if (unlikely(delta_exec <=3D 0)) + return; =20 curr->vruntime +=3D calc_delta_fair(delta_exec, curr); update_deadline(cfs_rq, curr); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0597ba0f85ff..e23cc67c9467 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1046,24 +1046,15 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr =3D rq->curr; struct sched_rt_entity *rt_se =3D &curr->rt; - u64 delta_exec; - u64 now; + s64 delta_exec; =20 if (curr->sched_class !=3D &rt_sched_class) return; =20 - now =3D rq_clock_task(rq); - delta_exec =3D now - curr->se.exec_start; - if (unlikely((s64)delta_exec <=3D 0)) + delta_exec =3D update_curr_common(rq); + if (unlikely(delta_exec <=3D 0)) return; =20 - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - trace_sched_stat_runtime(curr, delta_exec, 0); - - update_current_exec_runtime(curr, now, delta_exec); - if (!rt_bandwidth_enabled()) return; =20 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 04846272409c..1def5b7fa1df 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2228,6 +2228,8 @@ struct affinity_context { unsigned int flags; }; =20 +extern s64 update_curr_common(struct rq *rq); + struct sched_class { =20 #ifdef CONFIG_UCLAMP_TASK @@ -3280,16 +3282,6 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif =20 -static inline void update_current_exec_runtime(struct task_struct *curr, - u64 now, u64 delta_exec) -{ - curr->se.sum_exec_runtime +=3D delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start =3D now; - cgroup_account_cputime(curr, delta_exec); -} - #ifdef CONFIG_SCHED_MM_CID =20 #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 85590599b4d6..7595494ceb6d 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq) =20 static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { - struct task_struct *curr =3D rq->curr; - u64 now, delta_exec; - - now =3D rq_clock_task(rq); - delta_exec =3D now - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec =3D 0; - - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - update_current_exec_runtime(curr, now, delta_exec); + update_curr_common(rq); } =20 /* --=20 2.40.1 From nobody Wed Feb 11 06:27:37 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 57C8DC83F37 for ; Thu, 31 Aug 2023 20:29:55 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1347382AbjHaU34 (ORCPT ); Thu, 31 Aug 2023 16:29:56 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:36590 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1347392AbjHaU3s (ORCPT ); Thu, 31 Aug 2023 16:29:48 -0400 Received: from ams.source.kernel.org (ams.source.kernel.org [IPv6:2604:1380:4601:e00::1]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 1ACCF1B0 for ; Thu, 31 Aug 2023 13:29:45 -0700 (PDT) 
Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by ams.source.kernel.org (Postfix) with ESMTPS id C8906B823BA for ; Thu, 31 Aug 2023 20:29:43 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id AFABCC433C9; Thu, 31 Aug 2023 20:29:29 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1693513782; bh=yKq+K0AY5s6vg6cR2QhLxhtDuuzWAbIZ4RtywMq3NQM=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=JKDYbvTQbOAmwq9KDUR9xC4Wzlpx/BIOCiqQR+ZL/cpXKzRDIDyOE51dDJ8hHbD7I K2T2BYptPpmzcG+K4vAqO5BGr0pEjz6uUHar29ResIQFgMSSwJtNIKD2+Ng5khmeeW //aWE4JB+JQmT0obRin3nJpMFEirloV4Rv9zh6bGCpHiuytsTEOibMTJS2O63udHp1 JVAAFvmrk7P6Mb8nuulFuNIBqSHlL2j7diONqs7FjaxAD5RvVnn4TFULETn7bXkMxH dYaqdq+Nc6eppzqfaPJrKL9xeZw13Rh5XLhY/Apbpq0BhP+Hz31zcYNwy/UFFYJzMh +zwZJ+FCoKSmA== From: Daniel Bristot de Oliveira To: Ingo Molnar , Peter Zijlstra , Juri Lelli , Vincent Guittot Cc: Dietmar Eggemann , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider , linux-kernel@vger.kernel.org, Luca Abeni , Tommaso Cucinotta , Thomas Gleixner , Joel Fernandes , Vineeth Pillai , Shuah Khan , bristot@kernel.org, Phil Auld Subject: [PATCH v4 2/7] sched/deadline: Collect sched_dl_entity initialization Date: Thu, 31 Aug 2023 22:28:53 +0200 Message-Id: <881b724c1657dae38a0af7a768c896a8c1f0b321.1693510979.git.bristot@kernel.org> X-Mailer: git-send-email 2.40.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Peter Zijlstra Create a single function that initializes a sched_dl_entity. Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira --- kernel/sched/core.c | 5 +---- kernel/sched/deadline.c | 22 +++++++++++++++------- kernel/sched/sched.h | 5 +---- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2299a5cfbfb9..b57746237a43 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4513,10 +4513,7 @@ static void __sched_fork(unsigned long clone_flags, = struct task_struct *p) memset(&p->stats, 0, sizeof(p->stats)); #endif =20 - RB_CLEAR_NODE(&p->dl.rb_node); - init_dl_task_timer(&p->dl); - init_dl_inactive_task_timer(&p->dl); - __dl_clear_params(p); + init_dl_entity(&p->dl); =20 INIT_LIST_HEAD(&p->rt.run_list); p->rt.timeout =3D 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9a09d9dafd88..f8c402079404 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -335,6 +335,8 @@ static void dl_change_utilization(struct task_struct *p= , u64 new_bw) __add_rq_bw(new_bw, &rq->dl); } =20 +static void __dl_clear_params(struct sched_dl_entity *dl_se); + /* * The utilization of a task cannot be immediately removed from * the rq active utilization (running_bw) when the task blocks. 
@@ -434,7 +436,7 @@ static void task_non_contending(struct task_struct *p) raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + __dl_clear_params(dl_se); } =20 return; @@ -1207,7 +1209,7 @@ static enum hrtimer_restart dl_task_timer(struct hrti= mer *timer) return HRTIMER_NORESTART; } =20 -void init_dl_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer =3D &dl_se->dl_timer; =20 @@ -1413,7 +1415,7 @@ static enum hrtimer_restart inactive_task_timer(struc= t hrtimer *timer) raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + __dl_clear_params(dl_se); =20 goto unlock; } @@ -1429,7 +1431,7 @@ static enum hrtimer_restart inactive_task_timer(struc= t hrtimer *timer) return HRTIMER_NORESTART; } =20 -void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer =3D &dl_se->inactive_timer; =20 @@ -2984,10 +2986,8 @@ bool __checkparam_dl(const struct sched_attr *attr) /* * This function clears the sched_dl_entity static params. */ -void __dl_clear_params(struct task_struct *p) +static void __dl_clear_params(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se =3D &p->dl; - dl_se->dl_runtime =3D 0; dl_se->dl_deadline =3D 0; dl_se->dl_period =3D 0; @@ -3005,6 +3005,14 @@ void __dl_clear_params(struct task_struct *p) #endif } =20 +void init_dl_entity(struct sched_dl_entity *dl_se) +{ + RB_CLEAR_NODE(&dl_se->rb_node); + init_dl_task_timer(dl_se); + init_dl_inactive_task_timer(dl_se); + __dl_clear_params(dl_se); +} + bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se =3D &p->dl; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1def5b7fa1df..5e0df4bba476 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -284,8 +284,6 @@ struct rt_bandwidth { unsigned int rt_period_active; }; =20 -void __dl_clear_params(struct task_struct *p); - static inline int dl_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >=3D 0; @@ -2443,8 +2441,7 @@ extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 r= untime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); =20 -extern void init_dl_task_timer(struct sched_dl_entity *dl_se); -extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_entity(struct sched_dl_entity *dl_se); =20 #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) --=20 2.40.1 From nobody Wed Feb 11 06:27:37 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id CFDE7C83F2F for ; Thu, 31 Aug 2023 20:30:20 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1347344AbjHaUaV (ORCPT ); Thu, 31 Aug 2023 16:30:21 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54972 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1347386AbjHaU35 (ORCPT ); Thu, 31 Aug 2023 16:29:57 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [IPv6:2604:1380:4641:c500::1]) by lindbergh.monkeyblade.net (Postfix) 
with ESMTPS id F0AEAE5F for ; Thu, 31 Aug 2023 13:29:52 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id 79E5862BA6 for ; Thu, 31 Aug 2023 20:29:52 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 6BDBFC433C8; Thu, 31 Aug 2023 20:29:43 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1693513791; bh=QXStic4MSsKYlc1Pov7Gq0PWROsNHrJdpMwrKetBrjY=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=A9HHDcjrWGDeKiC0BHj3oWVPP45TULmHuPIoTwlhl1QUbrOyTgWn/aZlJRXN7UuKv uTjzlcPNOY8cz5uM3bkdCaVtwMTRl7iGJkPXTGdr3/kr40uxO2Tms/wZaxWVmlVuYG k6guUGRJmHIuh6Y5KpCdKo/zDthW4q7h0S85oXZz32bu6nDZt2P5vJFxcSVPaqlnIg dTeATwyF5l4ABooFq/9SR5nyLo7H1LEbfGxpc3dHwWmDpFBk7b+xlJhZlpRWbj3urN o9O9UT/aQUCRfP5yGh7qgrnng+RyKm9H3cH2eXz6s6MFv0xBfRl6g4S3NUZGYn31o7 CLS0mhrn9WJGw== From: Daniel Bristot de Oliveira To: Ingo Molnar , Peter Zijlstra , Juri Lelli , Vincent Guittot Cc: Dietmar Eggemann , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider , linux-kernel@vger.kernel.org, Luca Abeni , Tommaso Cucinotta , Thomas Gleixner , Joel Fernandes , Vineeth Pillai , Shuah Khan , bristot@kernel.org, Phil Auld Subject: [PATCH v4 3/7] sched/deadline: Move bandwidth accounting into {en,de}queue_dl_entity Date: Thu, 31 Aug 2023 22:28:54 +0200 Message-Id: <25f974670a672d1c4ee98eb13ad6a4ab289ed33a.1693510979.git.bristot@kernel.org> X-Mailer: git-send-email 2.40.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Peter Zijlstra In preparation of introducing !task sched_dl_entity; move the bandwidth accounting into {en.de}queue_dl_entity(). Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira --- kernel/sched/deadline.c | 130 ++++++++++++++++++++++------------------ kernel/sched/sched.h | 6 ++ 2 files changed, 78 insertions(+), 58 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f8c402079404..957baaf6dc92 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -391,12 +391,12 @@ static void __dl_clear_params(struct sched_dl_entity = *dl_se); * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). 
*/ -static void task_non_contending(struct task_struct *p) +static void task_non_contending(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se =3D &p->dl; struct hrtimer *timer =3D &dl_se->inactive_timer; struct dl_rq *dl_rq =3D dl_rq_of_se(dl_se); struct rq *rq =3D rq_of_dl_rq(dl_rq); + struct task_struct *p =3D dl_task_of(dl_se); s64 zerolag_time; =20 /* @@ -428,13 +428,14 @@ static void task_non_contending(struct task_struct *p) if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); + if (!dl_task(p) || READ_ONCE(p->__state) =3D=3D TASK_DEAD) { struct dl_bw *dl_b =3D dl_bw_of(task_cpu(p)); =20 if (READ_ONCE(p->__state) =3D=3D TASK_DEAD) - sub_rq_bw(&p->dl, &rq->dl); + sub_rq_bw(dl_se, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(dl_se); } @@ -1627,6 +1628,41 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int= flags) =20 update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); =20 + /* + * Check if a constrained deadline task was activated + * after the deadline but before the next period. + * If that is the case, the task will be throttled and + * the replenishment timer will be set to the next period. + */ + if (!dl_se->dl_throttled && !dl_is_implicit(dl_se)) + dl_check_constrained_dl(dl_se); + + if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) { + struct dl_rq *dl_rq =3D dl_rq_of_se(dl_se); + + add_rq_bw(dl_se, dl_rq); + add_running_bw(dl_se, dl_rq); + } + + /* + * If p is throttled, we do not enqueue it. In fact, if it exhausted + * its budget it needs a replenishment and, since it now is on + * its rq, the bandwidth timer callback (which clearly has not + * run yet) will take care of this. + * However, the active utilization does not depend on the fact + * that the task is on the runqueue or not (but depends on the + * task's state - in GRUB parlance, "inactive" vs "active contending"). + * In other words, even if a task is throttled its utilization must + * be counted in the active utilization; hence, we need to call + * add_running_bw(). + */ + if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(dl_se, flags); + + return; + } + /* * If this is a wakeup or a new instance, the scheduling * parameters of the task might need updating. Otherwise, @@ -1646,9 +1682,28 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int= flags) __enqueue_dl_entity(dl_se); } =20 -static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) { __dequeue_dl_entity(dl_se); + + if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) { + struct dl_rq *dl_rq =3D dl_rq_of_se(dl_se); + + sub_running_bw(dl_se, dl_rq); + sub_rq_bw(dl_se, dl_rq); + } + + /* + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: + * when the task blocks and when it is terminating + * (p->state =3D=3D TASK_DEAD). 
We can handle the two cases in the same + * way, because from GRUB's point of view the same thing is happening + * (the task moves from "active contending" to "active non contending" + * or "inactive") + */ + if (flags & DEQUEUE_SLEEP) + task_non_contending(dl_se); } =20 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flag= s) @@ -1693,76 +1748,35 @@ static void enqueue_task_dl(struct rq *rq, struct t= ask_struct *p, int flags) return; } =20 - /* - * Check if a constrained deadline task was activated - * after the deadline but before the next period. - * If that is the case, the task will be throttled and - * the replenishment timer will be set to the next period. - */ - if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) - dl_check_constrained_dl(&p->dl); - - if (p->on_rq =3D=3D TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(&p->dl, &rq->dl); - add_running_bw(&p->dl, &rq->dl); - } - - /* - * If p is throttled, we do not enqueue it. In fact, if it exhausted - * its budget it needs a replenishment and, since it now is on - * its rq, the bandwidth timer callback (which clearly has not - * run yet) will take care of this. - * However, the active utilization does not depend on the fact - * that the task is on the runqueue or not (but depends on the - * task's state - in GRUB parlance, "inactive" vs "active contending"). - * In other words, even if a task is throttled its utilization must - * be counted in the active utilization; hence, we need to call - * add_running_bw(). - */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { - if (flags & ENQUEUE_WAKEUP) - task_contending(&p->dl, flags); - - return; - } - check_schedstat_required(); update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); =20 + if (p->on_rq =3D=3D TASK_ON_RQ_MIGRATING) + flags |=3D ENQUEUE_MIGRATING; + enqueue_dl_entity(&p->dl, flags); =20 - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } =20 static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int fl= ags) { update_stats_dequeue_dl(&rq->dl, &p->dl, flags); - dequeue_dl_entity(&p->dl); - dequeue_pushable_dl_task(rq, p); + dequeue_dl_entity(&p->dl, flags); + + if (!p->dl.dl_throttled) + dequeue_pushable_dl_task(rq, p); } =20 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flag= s) { update_curr_dl(rq); - __dequeue_task_dl(rq, p, flags); =20 - if (p->on_rq =3D=3D TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(&p->dl, &rq->dl); - sub_rq_bw(&p->dl, &rq->dl); - } + if (p->on_rq =3D=3D TASK_ON_RQ_MIGRATING) + flags |=3D DEQUEUE_MIGRATING; =20 - /* - * This check allows to start the inactive timer (or to immediately - * decrease the active utilization, if needed) in two cases: - * when the task blocks and when it is terminating - * (p->state =3D=3D TASK_DEAD). We can handle the two cases in the same - * way, because from GRUB's point of view the same thing is happening - * (the task moves from "active contending" to "active non contending" - * or "inactive") - */ - if (flags & DEQUEUE_SLEEP) - task_non_contending(p); + __dequeue_task_dl(rq, p, flags); } =20 /* @@ -2578,7 +2592,7 @@ static void switched_from_dl(struct rq *rq, struct ta= sk_struct *p) * will reset the task parameters. 
*/ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(p); + task_non_contending(&p->dl); =20 /* * In case a task is setscheduled out from SCHED_DEADLINE we need to diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5e0df4bba476..9f48ed3e9028 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2193,6 +2193,10 @@ extern const u32 sched_prio_to_wmult[40]; * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the locat= ion * in the runqueue. * + * NOCLOCK - skip the update_rq_clock() (avoids double updates) + * + * MIGRATION - p->on_rq =3D=3D TASK_ON_RQ_MIGRATING (used for DEADLINE) + * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup @@ -2203,6 +2207,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ =20 #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2217,6 +2222,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_MIGRATED 0x00 #endif #define ENQUEUE_INITIAL 0x80 +#define ENQUEUE_MIGRATING 0x100 =20 #define RETRY_TASK ((void *)-1UL) =20 --=20 2.40.1 From nobody Wed Feb 11 06:27:37 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id EF4DEC83F37 for ; Thu, 31 Aug 2023 20:30:27 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1347406AbjHaUa2 (ORCPT ); Thu, 31 Aug 2023 16:30:28 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:48214 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1347437AbjHaUaI (ORCPT ); Thu, 31 Aug 2023 16:30:08 -0400 Received: from dfw.source.kernel.org (dfw.source.kernel.org [IPv6:2604:1380:4641:c500::1]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 53AE8E5D for ; Thu, 31 Aug 2023 13:30:04 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by dfw.source.kernel.org (Postfix) with ESMTPS id CD6FB629BA for ; Thu, 31 Aug 2023 20:30:03 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 5B6B2C433CA; Thu, 31 Aug 2023 20:29:52 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1693513803; bh=mMMVXoFXZPyOUMWcl/REn8dPSNv+4KfdqApES9aNk6A=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=NJN+HGjQaL93L/XHza7ThwBcIIQtUGvmJkp0mTEy43w1UFCXzVzNFvGl4z90nWxHw TXfbs1vLbkJ14oa8AY0hpDUcuMv7/+yaBpJ4YFvdTDFnlMuilLExRRpinGoU2rEGoK HGvMKeY326TVkfZTiqx2lMjPsbAvcvdqRNSX4cGaKPDvuQOCkXchuvpaKOcHf65rl/ nJMPjVaV0VM7RwVlPfsTkGB4uQalMYp5DmLvP3UA8oeundc0Mv/KnfnDEy6AlIiuYY RgkdeoN3bQBkL5dJFZLBGSUrJU+yanj6cQdwFnH9Uh2Wwp2AKOXQ3WCKoRauirNDVt tua3FFRIIWUPA== From: Daniel Bristot de Oliveira To: Ingo Molnar , Peter Zijlstra , Juri Lelli , Vincent Guittot Cc: Dietmar Eggemann , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider , linux-kernel@vger.kernel.org, Luca Abeni , Tommaso Cucinotta , Thomas Gleixner 
, Joel Fernandes , Vineeth Pillai , Shuah Khan , bristot@kernel.org, Phil Auld Subject: [PATCH v4 4/7] sched/deadline: Introduce deadline servers Date: Thu, 31 Aug 2023 22:28:55 +0200 Message-Id: <6ee55548f2a39584a6f10bcf5c4d6c82bd133a45.1693510979.git.bristot@kernel.org> X-Mailer: git-send-email 2.40.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Peter Zijlstra Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks with higher priority (e.g., SCHED_FIFO) monopolize CPU(s). RT Throttling has been introduced a while ago as a (mostly debug) countermeasure one can utilize to reserve some CPU time for low priority tasks (usually background type of work, e.g. workqueues, timers, etc.). It however has its own problems (see documentation) and the undesired effect of unconditionally throttling FIFO tasks even when no lower priority activity needs to run (there are mechanisms to fix this issue as well, but, again, with their own problems). Introduce deadline servers to service low priority tasks needs under starvation conditions. Deadline servers are built extending SCHED_DEADLINE implementation to allow 2-level scheduling (a sched_deadline entity becomes a container for lower priority scheduling entities). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira --- include/linux/sched.h | 22 ++- kernel/sched/core.c | 17 ++ kernel/sched/deadline.c | 344 +++++++++++++++++++++++++++------------- kernel/sched/fair.c | 4 + kernel/sched/sched.h | 27 ++++ 5 files changed, 301 insertions(+), 113 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 639f6eb9bd4f..40fbf3f034e0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -63,12 +63,14 @@ struct robust_list_head; struct root_domain; struct rq; struct sched_attr; +struct sched_dl_entity; struct sched_param; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct task_struct; struct user_event_mm; =20 /* @@ -604,6 +606,9 @@ struct sched_rt_entity { #endif } __randomize_layout; =20 +typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); + struct sched_dl_entity { struct rb_node rb_node; =20 @@ -651,6 +656,7 @@ struct sched_dl_entity { unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; + unsigned int dl_server : 1; =20 /* * Bandwidth enforcement timer. Each -deadline task has its @@ -665,7 +671,20 @@ struct sched_dl_entity { * timer is needed to decrease the active utilization at the correct * time. */ - struct hrtimer inactive_timer; + struct hrtimer inactive_timer; + + /* + * Bits for DL-server functionality. Also see the comment near + * dl_server_update(). + * + * @rq the runqueue this server is for + * + * @server_has_tasks() returns true if @server_pick return a + * runnable task. 
+ */ + struct rq *rq; + dl_server_has_tasks_f server_has_tasks; + dl_server_pick_f server_pick; =20 #ifdef CONFIG_RT_MUTEXES /* @@ -794,6 +813,7 @@ struct task_struct { struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; + struct sched_dl_entity *server; const struct sched_class *sched_class; =20 #ifdef CONFIG_SCHED_CORE diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b57746237a43..c780707e1761 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3815,6 +3815,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p= , int wake_flags, rq->idle_stamp =3D 0; } #endif + + p->server =3D NULL; } =20 /* @@ -6008,12 +6010,27 @@ __pick_next_task(struct rq *rq, struct task_struct = *prev, struct rq_flags *rf) p =3D pick_next_task_idle(rq); } =20 + /* + * This is the fast path; it cannot be a DL server pick; + * therefore even if @p =3D=3D @prev, ->server must be NULL. + */ + if (p->server) + p->server =3D NULL; + return p; } =20 restart: put_prev_task_balance(rq, prev, rf); =20 + /* + * We've updated @prev and no longer need the server link, clear it. + * Must be done before ->pick_next_task() because that can (re)set + * ->server. + */ + if (prev->server) + prev->server =3D NULL; + for_each_class(class) { p =3D class->pick_next_task(rq); if (p) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 957baaf6dc92..4dac16ed1317 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -54,8 +54,14 @@ static int __init sched_dl_sysctl_init(void) late_initcall(sched_dl_sysctl_init); #endif =20 +static bool dl_server(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server; +} + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { + BUG_ON(dl_server(dl_se)); return container_of(dl_se, struct task_struct, dl); } =20 @@ -64,14 +70,22 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_r= q) return container_of(dl_rq, struct rq, dl); } =20 -static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se) { - struct task_struct *p =3D dl_task_of(dl_se); - struct rq *rq =3D task_rq(p); + struct rq *rq =3D dl_se->rq; + + if (!dl_server(dl_se)) + rq =3D task_rq(dl_task_of(dl_se)); =20 - return &rq->dl; + return rq; } =20 +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + return &rq_of_dl_se(dl_se)->dl; +} + + static inline int on_dl_rq(struct sched_dl_entity *dl_se) { return !RB_EMPTY_NODE(&dl_se->rb_node); @@ -394,9 +408,8 @@ static void __dl_clear_params(struct sched_dl_entity *d= l_se); static void task_non_contending(struct sched_dl_entity *dl_se) { struct hrtimer *timer =3D &dl_se->inactive_timer; - struct dl_rq *dl_rq =3D dl_rq_of_se(dl_se); - struct rq *rq =3D rq_of_dl_rq(dl_rq); - struct task_struct *p =3D dl_task_of(dl_se); + struct rq *rq =3D rq_of_dl_se(dl_se); + struct dl_rq *dl_rq =3D &rq->dl; s64 zerolag_time; =20 /* @@ -426,25 +439,33 @@ static void task_non_contending(struct sched_dl_entit= y *dl_se) * utilization now, instead of starting a timer */ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { - if (dl_task(p)) + if (dl_server(dl_se)) { sub_running_bw(dl_se, dl_rq); + } else { + struct task_struct *p =3D dl_task_of(dl_se); + + if (dl_task(p)) + sub_running_bw(dl_se, dl_rq); =20 - if (!dl_task(p) || READ_ONCE(p->__state) =3D=3D TASK_DEAD) { - struct dl_bw *dl_b =3D dl_bw_of(task_cpu(p)); + if (!dl_task(p) || READ_ONCE(p->__state) =3D=3D TASK_DEAD) { + struct dl_bw 
*dl_b =3D dl_bw_of(task_cpu(p)); =20 - if (READ_ONCE(p->__state) =3D=3D TASK_DEAD) - sub_rq_bw(dl_se, &rq->dl); - raw_spin_lock(&dl_b->lock); - __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); - raw_spin_unlock(&dl_b->lock); - __dl_clear_params(dl_se); + if (READ_ONCE(p->__state) =3D=3D TASK_DEAD) + sub_rq_bw(dl_se, &rq->dl); + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(&dl_b->lock); + __dl_clear_params(dl_se); + } } =20 return; } =20 dl_se->dl_non_contending =3D 1; - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); } =20 @@ -471,8 +492,10 @@ static void task_contending(struct sched_dl_entity *dl= _se, int flags) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) =3D=3D 1) - put_task_struct(dl_task_of(dl_se)); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) =3D=3D 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } else { /* * Since "dl_non_contending" is not set, the @@ -485,10 +508,8 @@ static void task_contending(struct sched_dl_entity *dl= _se, int flags) } } =20 -static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq = *dl_rq) { - struct sched_dl_entity *dl_se =3D &p->dl; - return rb_first_cached(&dl_rq->root) =3D=3D &dl_se->rb_node; } =20 @@ -575,8 +596,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl= _se, struct dl_rq *dl_rq) =20 if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; - - update_dl_migration(dl_rq); } =20 static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *= dl_rq) @@ -585,8 +604,6 @@ static void dec_dl_migration(struct sched_dl_entity *dl= _se, struct dl_rq *dl_rq) =20 if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; - - update_dl_migration(dl_rq); } =20 #define __node_2_pdl(node) \ @@ -764,8 +781,10 @@ static inline void deadline_queue_pull_task(struct rq = *rq) } #endif /* CONFIG_SMP */ =20 +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flag= s); -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int fl= ags); +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, in= t flags); =20 static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, @@ -1013,8 +1032,7 @@ static inline bool dl_is_implicit(struct sched_dl_ent= ity *dl_se) */ static void update_dl_entity(struct sched_dl_entity *dl_se) { - struct dl_rq *dl_rq =3D dl_rq_of_se(dl_se); - struct rq *rq =3D rq_of_dl_rq(dl_rq); + struct rq *rq =3D rq_of_dl_se(dl_se); =20 if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, rq_clock(rq))) { @@ -1045,11 +1063,11 @@ static inline u64 dl_next_period(struct sched_dl_en= tity *dl_se) * actually started or not (i.e., the replenishment instant is in * the future or in the past). 
*/ -static int start_dl_timer(struct task_struct *p) +static int start_dl_timer(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se =3D &p->dl; struct hrtimer *timer =3D &dl_se->dl_timer; - struct rq *rq =3D task_rq(p); + struct dl_rq *dl_rq =3D dl_rq_of_se(dl_se); + struct rq *rq =3D rq_of_dl_rq(dl_rq); ktime_t now, act; s64 delta; =20 @@ -1083,13 +1101,33 @@ static int start_dl_timer(struct task_struct *p) * and observe our state. */ if (!hrtimer_is_queued(timer)) { - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); } =20 return 1; } =20 +static void __push_dl_task(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, check if we need + * to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + rq_unpin_lock(rq, rf); + push_dl_task(rq); + rq_repin_lock(rq, rf); + } +#endif +} + /* * This is the bandwidth enforcement timer callback. If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -1108,10 +1146,34 @@ static enum hrtimer_restart dl_task_timer(struct hr= timer *timer) struct sched_dl_entity *dl_se =3D container_of(timer, struct sched_dl_entity, dl_timer); - struct task_struct *p =3D dl_task_of(dl_se); + struct task_struct *p; struct rq_flags rf; struct rq *rq; =20 + if (dl_server(dl_se)) { + struct rq *rq =3D rq_of_dl_se(dl_se); + struct rq_flags rf; + + rq_lock(rq, &rf); + if (dl_se->dl_throttled) { + sched_clock_tick(); + update_rq_clock(rq); + + if (dl_se->server_has_tasks(dl_se)) { + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + resched_curr(rq); + __push_dl_task(rq, &rf); + } else { + replenish_dl_entity(dl_se); + } + + } + rq_unlock(rq, &rf); + + return HRTIMER_NORESTART; + } + + p =3D dl_task_of(dl_se); rq =3D task_rq_lock(p, &rf); =20 /* @@ -1182,21 +1244,7 @@ static enum hrtimer_restart dl_task_timer(struct hrt= imer *timer) else resched_curr(rq); =20 -#ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, check if we need - * to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) { - /* - * Nothing relies on rq->lock after this, so its safe to drop - * rq->lock. - */ - rq_unpin_lock(rq, &rf); - push_dl_task(rq); - rq_repin_lock(rq, &rf); - } -#endif + __push_dl_task(rq, &rf); =20 unlock: task_rq_unlock(rq, p, &rf); @@ -1238,12 +1286,11 @@ static void init_dl_task_timer(struct sched_dl_enti= ty *dl_se) */ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) { - struct task_struct *p =3D dl_task_of(dl_se); - struct rq *rq =3D rq_of_dl_rq(dl_rq_of_se(dl_se)); + struct rq *rq =3D rq_of_dl_se(dl_se); =20 if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) return; dl_se->dl_throttled =3D 1; if (dl_se->runtime > 0) @@ -1294,29 +1341,13 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, s= truct sched_dl_entity *dl_se) return (delta * u_act) >> BW_SHIFT; } =20 -/* - * Update the current task's runtime statistics (provided it is still - * a -deadline task and has not been removed from the dl_rq). 
- */ -static void update_curr_dl(struct rq *rq) +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se= , s64 delta_exec) { - struct task_struct *curr =3D rq->curr; - struct sched_dl_entity *dl_se =3D &curr->dl; - s64 delta_exec, scaled_delta_exec; - int cpu =3D cpu_of(rq); + s64 scaled_delta_exec; =20 - if (!dl_task(curr) || !on_dl_rq(dl_se)) - return; - - /* - * Consumed budget is computed considering the time as - * observed by schedulable tasks (excluding time spent - * in hardirq context, etc.). Deadlines are instead - * computed using hard walltime. This seems to be the more - * natural solution, but the full ramifications of this - * approach need further study. - */ - delta_exec =3D update_curr_common(rq); if (unlikely(delta_exec <=3D 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; @@ -1334,10 +1365,9 @@ static void update_curr_dl(struct rq *rq) * according to current frequency and CPU maximum capacity. */ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { - scaled_delta_exec =3D grub_reclaim(delta_exec, - rq, - &curr->dl); + scaled_delta_exec =3D grub_reclaim(delta_exec, rq, dl_se); } else { + int cpu =3D cpu_of(rq); unsigned long scale_freq =3D arch_scale_freq_capacity(cpu); unsigned long scale_cpu =3D arch_scale_cpu_capacity(cpu); =20 @@ -1356,11 +1386,20 @@ static void update_curr_dl(struct rq *rq) (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) dl_se->dl_overrun =3D 1; =20 - __dequeue_task_dl(rq, curr, 0); - if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) - enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + dequeue_dl_entity(dl_se, 0); + if (!dl_server(dl_se)) { + update_stats_dequeue_dl(&rq->dl, dl_se, 0); + dequeue_pushable_dl_task(rq, dl_task_of(dl_se)); + } + + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { + if (dl_server(dl_se)) + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + else + enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); + } =20 - if (!is_leftmost(curr, &rq->dl)) + if (!is_leftmost(dl_se, &rq->dl)) resched_curr(rq); } =20 @@ -1390,20 +1429,82 @@ static void update_curr_dl(struct rq *rq) } } =20 +void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) +{ + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); +} + +void dl_server_start(struct sched_dl_entity *dl_se) +{ + if (!dl_server(dl_se)) { + dl_se->dl_server =3D 1; + setup_new_dl_entity(dl_se); + } + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); +} + +void dl_server_stop(struct sched_dl_entity *dl_se) +{ + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); +} + +void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick) +{ + dl_se->rq =3D rq; + dl_se->server_has_tasks =3D has_tasks; + dl_se->server_pick =3D pick; +} + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *curr =3D rq->curr; + struct sched_dl_entity *dl_se =3D &curr->dl; + s64 delta_exec; + + if (!dl_task(curr) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. 
+ */ + delta_exec =3D update_curr_common(rq); + update_curr_dl_se(rq, dl_se, delta_exec); +} + static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) { struct sched_dl_entity *dl_se =3D container_of(timer, struct sched_dl_entity, inactive_timer); - struct task_struct *p =3D dl_task_of(dl_se); + struct task_struct *p =3D NULL; struct rq_flags rf; struct rq *rq; =20 - rq =3D task_rq_lock(p, &rf); + if (!dl_server(dl_se)) { + p =3D dl_task_of(dl_se); + rq =3D task_rq_lock(p, &rf); + } else { + rq =3D dl_se->rq; + rq_lock(rq, &rf); + } =20 sched_clock_tick(); update_rq_clock(rq); =20 + if (dl_server(dl_se)) + goto no_task; + if (!dl_task(p) || READ_ONCE(p->__state) =3D=3D TASK_DEAD) { struct dl_bw *dl_b =3D dl_bw_of(task_cpu(p)); =20 @@ -1420,14 +1521,21 @@ static enum hrtimer_restart inactive_task_timer(str= uct hrtimer *timer) =20 goto unlock; } + +no_task: if (dl_se->dl_non_contending =3D=3D 0) goto unlock; =20 sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending =3D 0; unlock: - task_rq_unlock(rq, p, &rf); - put_task_struct(p); + + if (!dl_server(dl_se)) { + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + } else { + rq_unlock(rq, &rf); + } =20 return HRTIMER_NORESTART; } @@ -1485,34 +1593,35 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u6= 4 deadline) static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} =20 +static inline void update_dl_migration(struct dl_rq *dl_rq) {} + #endif /* CONFIG_SMP */ =20 static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio =3D dl_task_of(dl_se)->prio; u64 deadline =3D dl_se->deadline; =20 - WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); =20 inc_dl_deadline(dl_rq, deadline); - inc_dl_migration(dl_se, dl_rq); + if (!dl_server(dl_se)) + inc_dl_migration(dl_se, dl_rq); + update_dl_migration(dl_rq); } =20 static inline void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio =3D dl_task_of(dl_se)->prio; - - WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); =20 dec_dl_deadline(dl_rq, dl_se->deadline); - dec_dl_migration(dl_se, dl_rq); + if (!dl_server(dl_se)) + dec_dl_migration(dl_se, dl_rq); + update_dl_migration(dl_rq); } =20 static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) @@ -1674,8 +1783,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int = flags) } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && - dl_time_before(dl_se->deadline, - rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { + dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { setup_new_dl_entity(dl_se); } =20 @@ -1760,14 +1868,6 @@ static void enqueue_task_dl(struct rq *rq, struct ta= sk_struct *p, int flags) enqueue_pushable_dl_task(rq, p); } =20 -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int fl= ags) -{ - update_stats_dequeue_dl(&rq->dl, &p->dl, flags); - dequeue_dl_entity(&p->dl, flags); - - if (!p->dl.dl_throttled) - dequeue_pushable_dl_task(rq, p); -} =20 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flag= s) { @@ -1776,7 +1876,9 @@ static void dequeue_task_dl(struct rq *rq, struct tas= k_struct *p, int flags) if (p->on_rq =3D=3D TASK_ON_RQ_MIGRATING) flags |=3D DEQUEUE_MIGRATING; =20 - __dequeue_task_dl(rq, p, flags); + dequeue_dl_entity(&p->dl, flags); + 
if (!p->dl.dl_throttled) + dequeue_pushable_dl_task(rq, p); } =20 /* @@ -1966,12 +2068,12 @@ static void check_preempt_curr_dl(struct rq *rq, st= ruct task_struct *p, } =20 #ifdef CONFIG_SCHED_HRTICK -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, dl_se->runtime); } #else /* !CONFIG_SCHED_HRTICK */ -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } #endif @@ -1991,9 +2093,6 @@ static void set_next_task_dl(struct rq *rq, struct ta= sk_struct *p, bool first) if (!first) return; =20 - if (hrtick_enabled_dl(rq)) - start_hrtick_dl(rq, p); - if (rq->curr->sched_class !=3D &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); =20 @@ -2016,12 +2115,26 @@ static struct task_struct *pick_task_dl(struct rq *= rq) struct dl_rq *dl_rq =3D &rq->dl; struct task_struct *p; =20 +again: if (!sched_dl_runnable(rq)) return NULL; =20 dl_se =3D pick_next_dl_entity(dl_rq); WARN_ON_ONCE(!dl_se); - p =3D dl_task_of(dl_se); + + + if (dl_server(dl_se)) { + p =3D dl_se->server_pick(dl_se); + if (!p) { + WARN_ON_ONCE(1); + dl_se->dl_yielded =3D 1; + update_curr_dl_se(rq, dl_se, 0); + goto again; + } + p->server =3D dl_se; + } else { + p =3D dl_task_of(dl_se); + } =20 return p; } @@ -2031,9 +2144,15 @@ static struct task_struct *pick_next_task_dl(struct = rq *rq) struct task_struct *p; =20 p =3D pick_task_dl(rq); - if (p) + if (!p) + return p; + + if (!p->server) set_next_task_dl(rq, p, true); =20 + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, &p->dl); + return p; } =20 @@ -2071,8 +2190,8 @@ static void task_tick_dl(struct rq *rq, struct task_s= truct *p, int queued) * be set and schedule() will start a new hrtick for the next task. 
*/ if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && - is_leftmost(p, &rq->dl)) - start_hrtick_dl(rq, p); + is_leftmost(&p->dl, &rq->dl)) + start_hrtick_dl(rq, &p->dl); } =20 static void task_fork_dl(struct task_struct *p) @@ -3013,6 +3132,7 @@ static void __dl_clear_params(struct sched_dl_entity = *dl_se) dl_se->dl_yielded =3D 0; dl_se->dl_non_contending =3D 0; dl_se->dl_overrun =3D 0; + dl_se->dl_server =3D 0; =20 #ifdef CONFIG_RT_MUTEXES dl_se->pi_se =3D dl_se; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 52c8219623b1..5ded18e28609 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1131,6 +1131,8 @@ s64 update_curr_common(struct rq *rq) =20 account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); + if (curr->server) + dl_server_update(curr->server, delta_exec); =20 return delta_exec; } @@ -1160,6 +1162,8 @@ static void update_curr(struct cfs_rq *cfs_rq) trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); + if (curtask->server) + dl_server_update(curtask->server, delta_exec); } =20 account_cfs_rq_runtime(cfs_rq, delta_exec); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f48ed3e9028..f30be4ae4c22 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -324,6 +324,33 @@ extern bool dl_param_changed(struct task_struct *p, co= nst struct sched_attr *att extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const = struct cpumask *trial); extern int dl_bw_check_overflow(int cpu); =20 +/* + * SCHED_DEADLINE supports servers (nested scheduling) with the following + * interface: + * + * dl_se::rq -- runqueue we belong to. + * + * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop'= the + * server when it runs out of tasks to run. + * + * dl_se::server_pick() -- nested pick_next_task(); we yield the period = if this + * returns NULL. + * + * dl_server_update() -- called from update_curr_common(), propagates ru= ntime + * to the server. + * + * dl_server_start() + * dl_server_stop() -- start/stop the server when it has (no) tasks. + * + * dl_server_init() -- initializes the server. 
+ */ +extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); +extern void dl_server_start(struct sched_dl_entity *dl_se); +extern void dl_server_stop(struct sched_dl_entity *dl_se); +extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick); + #ifdef CONFIG_CGROUP_SCHED  struct cfs_rq; --
2.40.1
From nobody Wed Feb 11 06:27:37 2026
From: Daniel Bristot de Oliveira
Subject: [PATCH v4 5/7] sched/fair: Add trivial fair server
Date: Thu, 31 Aug 2023 22:28:56 +0200

From: Peter Zijlstra

Use deadline servers to service fair tasks.

This patch adds a fair_server deadline entity which acts as a container
for fair entities and can be used to fix starvation when higher priority
(wrt fair) tasks are monopolizing CPU(s).
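Before the full diff, it may help to see the wiring in isolation (the patch text continues after this sketch). The following is a stripped-down, user-space C sketch, not kernel code: the has_tasks/pick callback style and the 50 ms runtime per 1000 ms period defaults mirror fair_server_init() in the diff below, while the struct, the *_sketch names, and the bandwidth printout are simplified assumptions for illustration:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	#define NSEC_PER_MSEC 1000000ULL

	struct dl_server_sketch;
	typedef bool (*has_tasks_f)(struct dl_server_sketch *);
	typedef const char *(*pick_f)(struct dl_server_sketch *); /* "task" stand-in */

	struct dl_server_sketch {
		uint64_t dl_runtime;   /* budget available per period */
		uint64_t dl_deadline;
		uint64_t dl_period;
		has_tasks_f server_has_tasks;
		pick_f server_pick;
	};

	/* Mirrors the defaults in fair_server_init(): 50 ms every 1000 ms. */
	static void fair_server_init_sketch(struct dl_server_sketch *s,
					    has_tasks_f has_tasks, pick_f pick)
	{
		s->dl_runtime  = 50 * NSEC_PER_MSEC;
		s->dl_deadline = 1000 * NSEC_PER_MSEC;
		s->dl_period   = 1000 * NSEC_PER_MSEC;
		s->server_has_tasks = has_tasks;
		s->server_pick = pick;
	}

	static bool demo_has_tasks(struct dl_server_sketch *s) { (void)s; return true; }
	static const char *demo_pick(struct dl_server_sketch *s) { (void)s; return "fair task"; }

	int main(void)
	{
		struct dl_server_sketch s;

		fair_server_init_sketch(&s, demo_has_tasks, demo_pick);

		/* Reserved bandwidth is runtime/period: 5% of a CPU by default. */
		printf("reserved bandwidth: %.1f%%\n",
		       100.0 * (double)s.dl_runtime / (double)s.dl_period);
		if (s.server_has_tasks(&s))
			printf("server would run: %s\n", s.server_pick(&s));
		return 0;
	}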
[ dl_server do not account for rt ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira --- kernel/sched/core.c | 1 + kernel/sched/deadline.c | 7 +++++++ kernel/sched/fair.c | 29 +++++++++++++++++++++++++++++ kernel/sched/sched.h | 4 ++++ 4 files changed, 41 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c780707e1761..4ba4f1e09a80 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10055,6 +10055,7 @@ void __init sched_init(void) #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + fair_server_init(rq); =20 #ifdef CONFIG_SCHED_CORE rq->core =3D rq; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 4dac16ed1317..7844cfb73029 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1403,6 +1403,13 @@ static void update_curr_dl_se(struct rq *rq, struct = sched_dl_entity *dl_se, s64 resched_curr(rq); } =20 + /* + * The fair server (sole dl_server) does not account for real-time + * workload because it is running fair work. + */ + if (dl_server(dl_se)) + return; + /* * Because -- for now -- we share the rt bandwidth, we need to * account our runtime there too, otherwise actual rt tasks diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5ded18e28609..580e6764a68b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6499,6 +6499,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *= p, int flags) */ util_est_enqueue(&rq->cfs, p); =20 + if (!rq->cfs.h_nr_running) + dl_server_start(&rq->fair_server); + /* * If in_iowait is set, the code below may not trigger any cpufreq * utilization updates, so do it here explicitly with the IOWAIT flag @@ -6643,6 +6646,9 @@ static void dequeue_task_fair(struct rq *rq, struct t= ask_struct *p, int flags) rq->next_balance =3D jiffies; =20 dequeue_throttle: + if (!rq->cfs.h_nr_running) + dl_server_stop(&rq->fair_server); + util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -8291,6 +8297,29 @@ static struct task_struct *__pick_next_task_fair(str= uct rq *rq) return pick_next_task_fair(rq, NULL, NULL); } =20 +static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) +{ + return !!dl_se->rq->cfs.nr_running; +} + +static struct task_struct *fair_server_pick(struct sched_dl_entity *dl_se) +{ + return pick_next_task_fair(dl_se->rq, NULL, NULL); +} + +void fair_server_init(struct rq *rq) +{ + struct sched_dl_entity *dl_se =3D &rq->fair_server; + + init_dl_entity(dl_se); + + dl_se->dl_runtime =3D 50 * NSEC_PER_MSEC; + dl_se->dl_deadline =3D 1000 * NSEC_PER_MSEC; + dl_se->dl_period =3D 1000 * NSEC_PER_MSEC; + + dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick); +} + /* * Account for a descheduled task: */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f30be4ae4c22..ac94c386741c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -351,6 +351,8 @@ extern void dl_server_init(struct sched_dl_entity *dl_s= e, struct rq *rq, dl_server_has_tasks_f has_tasks, dl_server_pick_f pick); =20 +extern void fair_server_init(struct rq *); + #ifdef CONFIG_CGROUP_SCHED =20 struct cfs_rq; @@ -1024,6 +1026,8 @@ struct rq { struct rt_rq rt; struct dl_rq dl; =20 + struct sched_dl_entity fair_server; + #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ struct list_head leaf_cfs_rq_list; --=20 2.40.1 From nobody Wed Feb 11 06:27:37 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from 
vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 2F95FC83F37 for ; Thu, 31 Aug 2023 20:30:48 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1345793AbjHaUat (ORCPT ); Thu, 31 Aug 2023 16:30:49 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:51314 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1347415AbjHaUap (ORCPT ); Thu, 31 Aug 2023 16:30:45 -0400 Received: from sin.source.kernel.org (sin.source.kernel.org [145.40.73.55]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 339CFE5F for ; Thu, 31 Aug 2023 13:30:28 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by sin.source.kernel.org (Postfix) with ESMTPS id 3DC99CE2227 for ; Thu, 31 Aug 2023 20:30:26 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 72A84C433C9; Thu, 31 Aug 2023 20:30:14 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1693513824; bh=9ttkNcaoXlsBRfjZS07t2h1iweJEZgbKCykYEPAFygY=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=G04MT2FTbabnx+BlpXYijKXzvYAgiubfk6ob7qtBOSuOTBascEBhz+HXeFfbkC05w H4BLZJiYKYWzWVnp7BoxAvF0XsVjoSbiWNhjhwL5Hh/xiubN9iN1ZCA9b84VF3ujwK 3WNrpuRCvE4cJAGPN4KW61j+YD7VevvbqtJw6KNINuIZeTz8IJUN/IVeAox0/7svrF LmuYVrMJr+zzyfbejfnV9cnjcF02a+msE5KKbjfHhycwY0/SwYfLRMRa1TSKZBTXdc gv62oXpBrr9apQwEFnzGeF1GP+R9rbKtiDKRbee2Bito7O2xlzgErbYVIJAs2052dJ Q2K+F8OfupJew== From: Daniel Bristot de Oliveira To: Ingo Molnar , Peter Zijlstra , Juri Lelli , Vincent Guittot Cc: Dietmar Eggemann , Steven Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider , linux-kernel@vger.kernel.org, Luca Abeni , Tommaso Cucinotta , Thomas Gleixner , Joel Fernandes , Vineeth Pillai , Shuah Khan , bristot@kernel.org, Phil Auld Subject: [PATCH v4 6/7] sched/deadline: Deferrable dl server Date: Thu, 31 Aug 2023 22:28:57 +0200 Message-Id: <754dab7f30695ca10a41613068bb63db3bfea003.1693510979.git.bristot@kernel.org> X-Mailer: git-send-email 2.40.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Among the motivations for the DL servers is the real-time throttling mechanism. This mechanism works by throttling the rt_rq after running for a long period without leaving space for fair tasks. The base dl server avoids this problem by boosting fair tasks instead of throttling the rt_rq. The point is that it boosts without waiting for potential starvation, causing some non-intuitive cases. For example, an IRQ dispatches two tasks on an idle system, a fair and an RT. The DL server will be activated, running the fair task before the RT one. This problem can be avoided by deferring the dl server activation. By passing the deferring option, the dl_server will dispatch a SCHED_DEADLINE reservation throttled, with the replenishment timer set for (period - runtime) ns from the start time. Thus, the fair rq is boosted at its 0-laxity time with respect to the rt_rq. The fair server will be scheduled under EDF, with a new period at the replenishment time, thus not breaking dl tasks.
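[ Illustration only, not part of the patch: the deferred-activation arithmetic, using the fair server defaults set up in patch 5/7 (runtime 50 ms, period 1000 ms). This is only a sketch of the timing; the actual arming is done in dl_server_start() below. ]

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
	unsigned long long runtime = 50ULL * NSEC_PER_MSEC;	/* dl_runtime */
	unsigned long long period = 1000ULL * NSEC_PER_MSEC;	/* dl_period  */
	unsigned long long now = 0;				/* server start */

	/*
	 * Deferred start: the server begins throttled and the replenishment
	 * timer is armed at the 0-laxity point, (period - runtime) after
	 * the start time.
	 */
	unsigned long long replenish = now + period - runtime;

	printf("replenishment fires %llu ms after start\n",
	       replenish / NSEC_PER_MSEC);	/* 950 ms */
	printf("fair tasks then get %llu ms out of every %llu ms\n",
	       runtime / NSEC_PER_MSEC, period / NSEC_PER_MSEC);
	return 0;
}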
Signed-off-by: Daniel Bristot de Oliveira --- include/linux/sched.h | 7 +++++ kernel/sched/deadline.c | 61 ++++++++++++++++++++++++++++++++++++++--- kernel/sched/fair.c | 10 ++++--- kernel/sched/rt.c | 6 ++++ kernel/sched/sched.h | 12 +++++++- 5 files changed, 87 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 40fbf3f034e0..38d0b3de03b2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -609,6 +609,12 @@ struct sched_rt_entity { typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); =20 +enum dl_server_state { + DL_SERVER_STOPPED =3D 0, + DL_SERVER_DEFER, + DL_SERVER_RUNNING +}; + struct sched_dl_entity { struct rb_node rb_node; =20 @@ -685,6 +691,7 @@ struct sched_dl_entity { struct rq *rq; dl_server_has_tasks_f server_has_tasks; dl_server_pick_f server_pick; + enum dl_server_state server_state; =20 #ifdef CONFIG_RT_MUTEXES /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7844cfb73029..7f1c52bfe78f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -422,7 +422,7 @@ static void task_non_contending(struct sched_dl_entity = *dl_se) if (dl_entity_is_special(dl_se)) return; =20 - WARN_ON(dl_se->dl_non_contending); + WARN_ON_ONCE(dl_se->dl_non_contending); =20 zerolag_time =3D dl_se->deadline - div64_long((dl_se->runtime * dl_se->dl_period), @@ -1155,6 +1155,7 @@ static enum hrtimer_restart dl_task_timer(struct hrti= mer *timer) struct rq_flags rf; =20 rq_lock(rq, &rf); + if (dl_se->dl_throttled) { sched_clock_tick(); update_rq_clock(rq); @@ -1165,9 +1166,12 @@ static enum hrtimer_restart dl_task_timer(struct hrt= imer *timer) __push_dl_task(rq, &rf); } else { replenish_dl_entity(dl_se); + task_non_contending(dl_se); } =20 } + + dl_se->server_state =3D DL_SERVER_RUNNING; rq_unlock(rq, &rf); =20 return HRTIMER_NORESTART; @@ -1441,18 +1445,65 @@ void dl_server_update(struct sched_dl_entity *dl_se= , s64 delta_exec) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } =20 -void dl_server_start(struct sched_dl_entity *dl_se) +void dl_server_start(struct sched_dl_entity *dl_se, int defer) { + if (dl_se->server_state !=3D DL_SERVER_STOPPED) { + WARN_ON_ONCE(!(on_dl_rq(dl_se) || dl_se->dl_throttled)); + return; + } + + if (defer) { + /* + * Postpone the replenishment to (now + period - runtime). + * + * With this in place, we have two cases: + * + * In the absence of DL tasks: + * The server will start at the replenishment time, getting + * its runtime before now + period. This is the expected + * throttling behavior. + * + * In the presence of DL tasks: + * The server will be replenished, and then it will be + * scheduled according to EDF, not breaking SCHED_DEADLINE. + * + * In the first cycle the server will be postponed by at most + * period + (period - runtime). But then the + * server will receive its runtime/period. + * + * The server will, however, run on top of any RT task, which + * is the expected throttling behavior.
+ */ + dl_se->deadline =3D rq_clock(dl_se->rq) + dl_se->dl_period - dl_se->dl_r= untime; + /* Zero the runtime */ + dl_se->runtime =3D 0; + /* throttle the server */ + dl_se->dl_throttled =3D 1; + + dl_se->server_state =3D DL_SERVER_DEFER; + start_dl_timer(dl_se); + return; + } + if (!dl_server(dl_se)) { dl_se->dl_server =3D 1; setup_new_dl_entity(dl_se); } + + dl_se->server_state =3D DL_SERVER_RUNNING; enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); } =20 void dl_server_stop(struct sched_dl_entity *dl_se) { + if (dl_se->server_state =3D=3D DL_SERVER_STOPPED) + return; + + hrtimer_try_to_cancel(&dl_se->dl_timer); dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); + + dl_se->dl_throttled =3D 0; + dl_se->server_state =3D DL_SERVER_STOPPED; } =20 void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, @@ -1462,6 +1513,8 @@ void dl_server_init(struct sched_dl_entity *dl_se, st= ruct rq *rq, dl_se->rq =3D rq; dl_se->server_has_tasks =3D has_tasks; dl_se->server_pick =3D pick; + dl_se->server_state =3D DL_SERVER_STOPPED; + dl_se->dl_server =3D 1; } =20 /* @@ -1817,8 +1870,9 @@ static void dequeue_dl_entity(struct sched_dl_entity = *dl_se, int flags) * (the task moves from "active contending" to "active non contending" * or "inactive") */ - if (flags & DEQUEUE_SLEEP) + if (flags & DEQUEUE_SLEEP && !dl_server(dl_se)) task_non_contending(dl_se); + } =20 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flag= s) @@ -1875,7 +1929,6 @@ static void enqueue_task_dl(struct rq *rq, struct tas= k_struct *p, int flags) enqueue_pushable_dl_task(rq, p); } =20 - static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flag= s) { update_curr_dl(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 580e6764a68b..b9d0f08dc8ca 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6499,9 +6499,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *= p, int flags) */ util_est_enqueue(&rq->cfs, p); =20 - if (!rq->cfs.h_nr_running) - dl_server_start(&rq->fair_server); - /* * If in_iowait is set, the code below may not trigger any cpufreq * utilization updates, so do it here explicitly with the IOWAIT flag @@ -6568,6 +6565,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *= p, int flags) update_overutilized_status(rq); =20 enqueue_throttle: + if (sched_fair_server_needed(rq)) + dl_server_start(&rq->fair_server, rq->fair_server_defer); + assert_list_leaf_cfs_rq(rq); =20 hrtick_update(rq); @@ -6646,7 +6646,7 @@ static void dequeue_task_fair(struct rq *rq, struct t= ask_struct *p, int flags) rq->next_balance =3D jiffies; =20 dequeue_throttle: - if (!rq->cfs.h_nr_running) + if (!sched_fair_server_needed(rq)) dl_server_stop(&rq->fair_server); =20 util_est_update(&rq->cfs, p, task_sleep); @@ -8317,6 +8317,8 @@ void fair_server_init(struct rq *rq) dl_se->dl_deadline =3D 1000 * NSEC_PER_MSEC; dl_se->dl_period =3D 1000 * NSEC_PER_MSEC; =20 + rq->fair_server_defer =3D 1; + dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick); } =20 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e23cc67c9467..7595110a5a3e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1537,6 +1537,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p,= int flags) =20 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + if (sched_fair_server_needed(rq)) + dl_server_start(&rq->fair_server, rq->fair_server_defer); } =20 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flag= s) @@ -1547,6 +1550,9 @@ static void 
dequeue_task_rt(struct rq *rq, struct tas= k_struct *p, int flags) dequeue_rt_entity(rt_se, flags); =20 dequeue_pushable_task(rq, p); + + if (!sched_fair_server_needed(rq)) + dl_server_stop(&rq->fair_server); } =20 /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ac94c386741c..510c4db379be 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -345,7 +345,7 @@ extern int dl_bw_check_overflow(int cpu); * dl_server_init() -- initializes the server. */ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec= ); -extern void dl_server_start(struct sched_dl_entity *dl_se); +extern void dl_server_start(struct sched_dl_entity *dl_se, int defer); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, @@ -1027,6 +1027,7 @@ struct rq { struct dl_rq dl; =20 struct sched_dl_entity fair_server; + int fair_server_defer; =20 #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -2394,6 +2395,15 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_running > 0; } =20 +static inline bool sched_fair_server_needed(struct rq *rq) +{ + /* + * The fair server will activate anytime a fair task can starve + * because of real-time tasks. + */ + return (sched_rt_runnable(rq) && sched_fair_runnable(rq)); +} + extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_= struct *prev, struct rq_flags *rf); extern struct task_struct *pick_next_task_idle(struct rq *rq); =20 --=20 2.40.1 From nobody Wed Feb 11 06:27:37 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 71C11C83F2F for ; Thu, 31 Aug 2023 20:31:10 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1347414AbjHaUbL (ORCPT ); Thu, 31 Aug 2023 16:31:11 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:32884 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232420AbjHaUbK (ORCPT ); Thu, 31 Aug 2023 16:31:10 -0400 Received: from ams.source.kernel.org (ams.source.kernel.org [145.40.68.75]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id B6CCE1721 for ; Thu, 31 Aug 2023 13:30:39 -0700 (PDT) Received: from smtp.kernel.org (relay.kernel.org [52.25.139.140]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits)) (No client certificate requested) by ams.source.kernel.org (Postfix) with ESMTPS id B43D4B823C5 for ; Thu, 31 Aug 2023 20:30:37 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 48E03C433C7; Thu, 31 Aug 2023 20:30:24 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1693513836; bh=aQ4nhpCB5+LdRupp8kpZ3PR8UVwoGTWspDaPEo1f0cQ=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=Qoj9BWv/Sabgffn1ain9/MQNbJdU3u7u3ak+NMiR3/HXmJ6pBRPdoI+Vei+nLCmYI 6M6v70LigBsh44jVsb6W3jYxb6pm6g/424+nmE8Wb/1VvkKVtpGDRBxWHE3pxTjpXD C2ITYtrjHpakSsMiKVDsSH0KCbUf0tZ48tVE87diB8q7Mi2sPA8i5CmGaqa8PY1bSq lhRWjUXWEFJ7b1kC/IMgfdQhnVHlSROWlL5CWN6JkeZS29awh8JZUV0AnVix6L2Xph R86IPcMWO86QSiE5CbyfwK7XHtAvKDHaqu6OJsfA9bq1hUkijaopJQeMTUyVYVI4+9 9ShIwcDzQyPww== From: Daniel Bristot de Oliveira To: Ingo Molnar , Peter Zijlstra , Juri Lelli , Vincent Guittot Cc: Dietmar Eggemann , Steven
Rostedt , Ben Segall , Mel Gorman , Daniel Bristot de Oliveira , Valentin Schneider , linux-kernel@vger.kernel.org, Luca Abeni , Tommaso Cucinotta , Thomas Gleixner , Joel Fernandes , Vineeth Pillai , Shuah Khan , bristot@kernel.org, Phil Auld Subject: [PATCH v4 7/7] sched/fair: Fair server interface Date: Thu, 31 Aug 2023 22:28:58 +0200 Message-Id: X-Mailer: git-send-email 2.40.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Add an interface for fair server setup on debugfs. Each rq has three files under /sys/kernel/debug/sched/rq/CPU{ID}: - fair_server_runtime: set runtime in ns - fair_server_period: set period in ns - fair_server_defer: on/off for the defer mechanism Signed-off-by: Daniel Bristot de Oliveira --- kernel/sched/debug.c | 177 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4c3d0d9f3db6..dad7d5d073ef 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -333,8 +333,183 @@ static const struct file_operations sched_debug_fops = =3D { .release =3D seq_release, }; =20 +static ssize_t +sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long cpu =3D (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq =3D cpu_rq(cpu); + unsigned long flags; + u64 runtime; + int err; + + err =3D kstrtoull_from_user(ubuf, cnt, 10, &runtime); + if (err) + return err; + + raw_spin_rq_lock_irqsave(rq, flags); + if (runtime > rq->fair_server.dl_period) + err =3D -EINVAL; + else + rq->fair_server.dl_runtime =3D runtime; + raw_spin_rq_unlock_irqrestore(rq, flags); + + if (err) + return err; + + *ppos +=3D cnt; + return cnt; +} + +static int sched_fair_server_runtime_show(struct seq_file *m, void *v) +{ + unsigned long cpu =3D (unsigned long) m->private; + struct rq *rq =3D cpu_rq(cpu); + + seq_printf(m, "%llu\n", rq->fair_server.dl_runtime); + return 0; +} + +static int sched_fair_server_runtime_open(struct inode *inode, struct file= *filp) +{ + return single_open(filp, sched_fair_server_runtime_show, inode->i_private= ); +} + +static const struct file_operations fair_server_runtime_fops =3D { + .open =3D sched_fair_server_runtime_open, + .write =3D sched_fair_server_runtime_write, + .read =3D seq_read, + .llseek =3D seq_lseek, + .release =3D single_release, +}; + +static unsigned int fair_server_period_max =3D (1 << 22) * NSEC_PER_USEC; = /* ~4 seconds */ +static unsigned int fair_server_period_min =3D (100) * NSEC_PER_USEC; = /* 100 us */ + +static ssize_t +sched_fair_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long cpu =3D (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq =3D cpu_rq(cpu); + unsigned long flags; + u64 period; + int err; + + err =3D kstrtoull_from_user(ubuf, cnt, 10, &period); + if (err) + return err; + + if (period < fair_server_period_min || period > fair_server_period_max) + return -EINVAL; + + raw_spin_rq_lock_irqsave(rq, flags); + if (period < rq->fair_server.dl_runtime) + err =3D -EINVAL; + else + rq->fair_server.dl_period =3D period; + raw_spin_rq_unlock_irqrestore(rq, flags); + + if (err) + return err; + + *ppos +=3D cnt; + return cnt; +} + +static int sched_fair_server_period_show(struct seq_file *m, void *v) +{ + unsigned long cpu =3D (unsigned long) m->private; + struct
rq *rq =3D cpu_rq(cpu); + + seq_printf(m, "%llu\n", rq->fair_server.dl_period); + return 0; +} + +static int sched_fair_server_period_open(struct inode *inode, struct file = *filp) +{ + return single_open(filp, sched_fair_server_period_show, inode->i_private); +} + +static const struct file_operations fair_server_period_fops =3D { + .open =3D sched_fair_server_period_open, + .write =3D sched_fair_server_period_write, + .read =3D seq_read, + .llseek =3D seq_lseek, + .release =3D single_release, +}; + +static ssize_t +sched_fair_server_defer_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long cpu =3D (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq =3D cpu_rq(cpu); + unsigned long flags; + u64 defer; + int err; + + err =3D kstrtoull_from_user(ubuf, cnt, 10, &defer); + if (err) + return err; + + if (defer < 0 || defer > 1) + return -EINVAL; + + raw_spin_rq_lock_irqsave(rq, flags); + rq->fair_server_defer =3D defer; + raw_spin_rq_unlock_irqrestore(rq, flags); + + *ppos +=3D cnt; + return cnt; +} + +static int sched_fair_server_defer_show(struct seq_file *m, void *v) +{ + unsigned long cpu =3D (unsigned long) m->private; + struct rq *rq =3D cpu_rq(cpu); + + seq_printf(m, "%d\n", rq->fair_server_defer); + return 0; +} + +static int sched_fair_server_defer_open(struct inode *inode, struct file *= filp) +{ + return single_open(filp, sched_fair_server_defer_show, inode->i_private); +} + +static const struct file_operations fair_server_defer_fops =3D { + .open =3D sched_fair_server_defer_open, + .write =3D sched_fair_server_defer_write, + .read =3D seq_read, + .llseek =3D seq_lseek, + .release =3D single_release, +}; + static struct dentry *debugfs_sched; =20 +void debugfs_fair_server_init(void) +{ + long cpu; + struct dentry *rq_dentry; + + rq_dentry =3D debugfs_create_dir("rq", debugfs_sched); + if (!rq_dentry) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%ld", cpu); + d_cpu =3D debugfs_create_dir(buf, rq_dentry); + + debugfs_create_file("fair_server_runtime", 0644, d_cpu, (void *) cpu, &f= air_server_runtime_fops); + debugfs_create_file("fair_server_period", 0644, d_cpu, (void *) cpu, &fa= ir_server_period_fops); + debugfs_create_file("fair_server_defer", 0644, d_cpu, (void *) cpu, &fai= r_server_defer_fops); + } +} + static __init int sched_init_debug(void) { struct dentry __maybe_unused *numa; @@ -374,6 +549,8 @@ static __init int sched_init_debug(void) =20 debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops= ); =20 + debugfs_fair_server_init(); + return 0; } late_initcall(sched_init_debug); --=20 2.40.1
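[ Illustration only, not part of the patch: a user-space sketch that configures the CPU 0 fair server to 100 ms every 1 s and enables the defer mechanism through the debugfs files added above. It assumes debugfs is mounted at /sys/kernel/debug, that the writes are done as root, and that runtime and period are given in nanoseconds. ]

#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	const char *dir = "/sys/kernel/debug/sched/rq/cpu0";
	char path[128];

	/* Write the period first so that runtime <= period holds at each step. */
	snprintf(path, sizeof(path), "%s/fair_server_period", dir);
	write_knob(path, "1000000000");		/* 1 s */

	snprintf(path, sizeof(path), "%s/fair_server_runtime", dir);
	write_knob(path, "100000000");		/* 100 ms */

	snprintf(path, sizeof(path), "%s/fair_server_defer", dir);
	write_knob(path, "1");			/* deferred (0-laxity) activation */

	return 0;
}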