From nobody Tue Dec 30 13:06:14 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 3E4BBC47074 for ; Wed, 15 Nov 2023 11:34:00 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234942AbjKOLeA (ORCPT ); Wed, 15 Nov 2023 06:34:00 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41202 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229796AbjKOLd5 (ORCPT ); Wed, 15 Nov 2023 06:33:57 -0500 Received: from out30-100.freemail.mail.aliyun.com (out30-100.freemail.mail.aliyun.com [115.124.30.100]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 5EC8ACC for ; Wed, 15 Nov 2023 03:33:54 -0800 (PST) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R161e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018046059;MF=cruzzhao@linux.alibaba.com;NM=1;PH=DS;RN=12;SR=0;TI=SMTPD_---0VwSxxN2_1700048031; Received: from localhost.localdomain(mailfrom:CruzZhao@linux.alibaba.com fp:SMTPD_---0VwSxxN2_1700048031) by smtp.aliyun-inc.com; Wed, 15 Nov 2023 19:33:51 +0800 From: Cruz Zhao To: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, joel@joelfernandes.org Cc: linux-kernel@vger.kernel.org Subject: [PATCH 1/4] sched/core: introduce core_id to struct rq Date: Wed, 15 Nov 2023 19:33:38 +0800 Message-Id: <20231115113341.13261-2-CruzZhao@linux.alibaba.com> X-Mailer: git-send-email 2.39.3 In-Reply-To: <20231115113341.13261-1-CruzZhao@linux.alibaba.com> References: <20231115113341.13261-1-CruzZhao@linux.alibaba.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: 
linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Introduce core_id to struct rq, which indicates the cpu id of the core and is used for getting the cpu id of rq->core quickly. Signed-off-by: Cruz Zhao --- kernel/sched/core.c | 16 ++++++++++++---- kernel/sched/sched.h | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a708d225c28e..7a685fae73c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6400,7 +6400,7 @@ static void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask =3D cpu_smt_mask(cpu); struct rq *rq =3D cpu_rq(cpu), *core_rq =3D NULL; - int t; + int t, core_id; =20 guard(core_lock)(&cpu); =20 @@ -6417,6 +6417,7 @@ static void sched_core_cpu_starting(unsigned int cpu) rq =3D cpu_rq(t); if (rq->core =3D=3D rq) { core_rq =3D rq; + core_id =3D t; break; } } @@ -6428,8 +6429,10 @@ static void sched_core_cpu_starting(unsigned int cpu) for_each_cpu(t, smt_mask) { rq =3D cpu_rq(t); =20 - if (t =3D=3D cpu) + if (t =3D=3D cpu) { rq->core =3D core_rq; + rq->core_id =3D core_id; + } =20 WARN_ON_ONCE(rq->core !=3D core_rq); } @@ -6439,7 +6442,7 @@ static void sched_core_cpu_deactivate(unsigned int cp= u) { const struct cpumask *smt_mask =3D cpu_smt_mask(cpu); struct rq *rq =3D cpu_rq(cpu), *core_rq =3D NULL; - int t; + int t, core_id; =20 guard(core_lock)(&cpu); =20 @@ -6458,6 +6461,7 @@ static void sched_core_cpu_deactivate(unsigned int cp= u) if (t =3D=3D cpu) continue; core_rq =3D cpu_rq(t); + core_id =3D t; break; } =20 @@ -6483,6 +6487,7 @@ static void sched_core_cpu_deactivate(unsigned int cp= u) for_each_cpu(t, smt_mask) { rq =3D cpu_rq(t); rq->core =3D core_rq; + rq->core_id =3D core_id; } } =20 @@ -6490,8 +6495,10 @@ static inline void sched_core_cpu_dying(unsigned int= cpu) { struct rq *rq =3D cpu_rq(cpu); =20 - if (rq->core !=3D rq) + if (rq->core !=3D rq) { rq->core =3D rq; + rq->core_id =3D cpu; + } } =20 #else /* !CONFIG_SCHED_CORE */ @@ 
-10008,6 +10015,7 @@ void __init sched_init(void) =20 #ifdef CONFIG_SCHED_CORE rq->core =3D rq; + rq->core_id =3D i; rq->core_pick =3D NULL; rq->core_enabled =3D 0; rq->core_tree =3D RB_ROOT; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e5a95486a42..1b62165fc840 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1121,6 +1121,7 @@ struct rq { #ifdef CONFIG_SCHED_CORE /* per rq */ struct rq *core; + unsigned int core_id; struct task_struct *core_pick; unsigned int core_enabled; unsigned int core_sched_seq; --=20 2.39.3 From nobody Tue Dec 30 13:06:14 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 1899EC07548 for ; Wed, 15 Nov 2023 11:34:03 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234949AbjKOLeD (ORCPT ); Wed, 15 Nov 2023 06:34:03 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41216 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234680AbjKOLd6 (ORCPT ); Wed, 15 Nov 2023 06:33:58 -0500 Received: from out30-124.freemail.mail.aliyun.com (out30-124.freemail.mail.aliyun.com [115.124.30.124]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id BCB9AE9 for ; Wed, 15 Nov 2023 03:33:54 -0800 (PST) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R611e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018046056;MF=cruzzhao@linux.alibaba.com;NM=1;PH=DS;RN=12;SR=0;TI=SMTPD_---0VwSxxNQ_1700048031; Received: from localhost.localdomain(mailfrom:CruzZhao@linux.alibaba.com fp:SMTPD_---0VwSxxNQ_1700048031) by smtp.aliyun-inc.com; Wed, 15 Nov 2023 19:33:52 +0800 From: Cruz Zhao To: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, 
mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, joel@joelfernandes.org Cc: linux-kernel@vger.kernel.org Subject: [PATCH 2/4] sched/core: introduce core to struct cfs_rq Date: Wed, 15 Nov 2023 19:33:39 +0800 Message-Id: <20231115113341.13261-3-CruzZhao@linux.alibaba.com> X-Mailer: git-send-email 2.39.3 In-Reply-To: <20231115113341.13261-1-CruzZhao@linux.alibaba.com> References: <20231115113341.13261-1-CruzZhao@linux.alibaba.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Introduce core to struct cfs_rq, indicates the corresponding cfs_rq of rq->core. Signed-off-by: Cruz Zhao --- kernel/sched/core.c | 4 ++++ kernel/sched/fair.c | 11 +++++++++++ kernel/sched/sched.h | 1 + 3 files changed, 16 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7a685fae73c4..647a12af9172 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6432,6 +6432,7 @@ static void sched_core_cpu_starting(unsigned int cpu) if (t =3D=3D cpu) { rq->core =3D core_rq; rq->core_id =3D core_id; + rq->cfs.core =3D &core_rq->cfs; } =20 WARN_ON_ONCE(rq->core !=3D core_rq); @@ -6488,6 +6489,7 @@ static void sched_core_cpu_deactivate(unsigned int cp= u) rq =3D cpu_rq(t); rq->core =3D core_rq; rq->core_id =3D core_id; + rq->cfs.core =3D &core_rq->cfs; } } =20 @@ -6498,6 +6500,7 @@ static inline void sched_core_cpu_dying(unsigned int = cpu) if (rq->core !=3D rq) { rq->core =3D rq; rq->core_id =3D cpu; + rq->cfs.core =3D &rq->cfs; } } =20 @@ -10016,6 +10019,7 @@ void __init sched_init(void) #ifdef CONFIG_SCHED_CORE rq->core =3D rq; rq->core_id =3D i; + rq->cfs.core =3D &rq->cfs; rq->core_pick =3D NULL; rq->core_enabled =3D 0; rq->core_tree =3D RB_ROOT; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2048138ce54b..61cbaa3cc385 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12420,6 +12420,16 @@ bool 
cfs_prio_less(const struct task_struct *a, co= nst struct task_struct *b, return delta > 0; } =20 +void sched_core_init_cfs_rq(struct task_group *tg, struct cfs_rq *cfs_rq) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq =3D rq_of(cfs_rq); + int core_id =3D rq->core_id; + + cfs_rq->core =3D tg->cfs_rq[core_id]; +#endif +} + static int task_is_throttled_fair(struct task_struct *p, int cpu) { struct cfs_rq *cfs_rq; @@ -12715,6 +12725,7 @@ int alloc_fair_sched_group(struct task_group *tg, s= truct task_group *parent) =20 init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + sched_core_init_cfs_rq(tg, cfs_rq); init_entity_runnable_average(se); } =20 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1b62165fc840..62fca54223a1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -547,6 +547,7 @@ struct cfs_rq { #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; u64 min_vruntime_fi; + struct cfs_rq *core; #endif =20 #ifndef CONFIG_64BIT --=20 2.39.3 From nobody Tue Dec 30 13:06:14 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 1A888C07548 for ; Wed, 15 Nov 2023 11:34:06 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234965AbjKOLeG (ORCPT ); Wed, 15 Nov 2023 06:34:06 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41224 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234922AbjKOLd6 (ORCPT ); Wed, 15 Nov 2023 06:33:58 -0500 Received: from out30-133.freemail.mail.aliyun.com (out30-133.freemail.mail.aliyun.com [115.124.30.133]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 0F7CD11C for ; Wed, 15 Nov 2023 03:33:54 -0800 (PST) X-Alimail-AntiSpam: 
AC=PASS;BC=-1|-1;BR=01201311R191e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018045176;MF=cruzzhao@linux.alibaba.com;NM=1;PH=DS;RN=12;SR=0;TI=SMTPD_---0VwSxxNu_1700048032; Received: from localhost.localdomain(mailfrom:CruzZhao@linux.alibaba.com fp:SMTPD_---0VwSxxNu_1700048032) by smtp.aliyun-inc.com; Wed, 15 Nov 2023 19:33:52 +0800 From: Cruz Zhao To: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, joel@joelfernandes.org Cc: linux-kernel@vger.kernel.org Subject: [PATCH 3/4] sched/fair: introduce core_vruntime and core_min_vruntime Date: Wed, 15 Nov 2023 19:33:40 +0800 Message-Id: <20231115113341.13261-4-CruzZhao@linux.alibaba.com> X-Mailer: git-send-email 2.39.3 In-Reply-To: <20231115113341.13261-1-CruzZhao@linux.alibaba.com> References: <20231115113341.13261-1-CruzZhao@linux.alibaba.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" To compare the priority of sched_entity from different cpus of a core, we introduce core_vruntime to struct sched_entity and core_min_vruntime to struct cfs_rq. cfs_rq->core->core_min_vruntime records the min vruntime of the cfs_rqs of the same task_group among the core, and se->core_vruntime is the vruntime relative to se->cfs_rq->core->core_min_vruntime. 
Signed-off-by: Cruz Zhao --- include/linux/sched.h | 3 +++ kernel/sched/fair.c | 52 ++++++++++++++++++++++++++++++++++++++----- kernel/sched/sched.h | 1 + 3 files changed, 51 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c31697248..df481a8ebc07 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -562,6 +562,9 @@ struct sched_entity { u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; +#ifdef CONFIG_SCHED_CORE + u64 core_vruntime; +#endif s64 vlag; u64 slice; =20 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61cbaa3cc385..60b2fd437474 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -750,30 +750,58 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_r= q, u64 vruntime) return min_vruntime; } =20 +#ifdef CONFIG_SCHED_CORE +static u64 __update_core_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +{ + u64 min_vruntime =3D cfs_rq->core_min_vruntime; + s64 delta =3D (s64)(vruntime - min_vruntime); + + return delta > 0 ? vruntime : min_vruntime; +} +#endif + static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *se =3D __pick_first_entity(cfs_rq); struct sched_entity *curr =3D cfs_rq->curr; =20 u64 vruntime =3D cfs_rq->min_vruntime; +#ifdef CONFIG_SCHED_CORE + u64 core_vruntime =3D cfs_rq->core->min_vruntime; +#endif =20 if (curr) { - if (curr->on_rq) + if (curr->on_rq) { vruntime =3D curr->vruntime; - else +#ifdef CONFIG_SCHED_CORE + core_vruntime =3D curr->core_vruntime; +#endif + } else { curr =3D NULL; + } } =20 if (se) { - if (!curr) + if (!curr) { vruntime =3D se->vruntime; - else +#ifdef CONFIG_SCHED_CORE + core_vruntime =3D se->core_vruntime; +#endif + } else { vruntime =3D min_vruntime(vruntime, se->vruntime); +#ifdef CONFIG_SCHED_CORE + core_vruntime =3D min_vruntime(core_vruntime, se->core_vruntime); +#endif + } } =20 /* ensure we never gain time by being placed backwards. 
*/ u64_u32_store(cfs_rq->min_vruntime, __update_min_vruntime(cfs_rq, vruntime)); +#ifdef CONFIG_SCHED_CORE + u64_u32_store(cfs_rq->core->core_min_vruntime, + __update_core_min_vruntime(cfs_rq->core, vruntime)); +#endif } =20 static inline bool __entity_less(struct rb_node *a, const struct rb_node *= b) @@ -1137,6 +1165,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct sched_entity *curr =3D cfs_rq->curr; u64 now =3D rq_clock_task(rq_of(cfs_rq)); u64 delta_exec; + u64 delta_exec_fair; =20 if (unlikely(!curr)) return; @@ -1158,7 +1187,11 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->sum_exec_runtime +=3D delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); =20 - curr->vruntime +=3D calc_delta_fair(delta_exec, curr); + delta_exec_fair =3D calc_delta_fair(delta_exec, curr); + curr->vruntime +=3D delta_exec_fair; +#ifdef CONFIG_SCHED_CORE + curr->core_vruntime +=3D delta_exec_fair; +#endif update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); =20 @@ -5009,6 +5042,9 @@ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { u64 vslice, vruntime =3D avg_vruntime(cfs_rq); +#ifdef CONFIG_SCHED_CORE + u64 core_vruntime =3D cfs_rq->core->core_min_vruntime + vruntime - cfs_rq= ->min_vruntime; +#endif s64 lag =3D 0; =20 se->slice =3D sysctl_sched_base_slice; @@ -5091,6 +5127,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_enti= ty *se, int flags) } =20 se->vruntime =3D vruntime - lag; +#ifdef CONFIG_SCHED_CORE + se->core_vruntime =3D core_vruntime - lag; +#endif =20 /* * When joining the competition; the exisiting tasks will be, @@ -12655,6 +12694,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline =3D RB_ROOT_CACHED; u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); +#ifdef CONFIG_SCHED_CORE + u64_u32_store(cfs_rq->core_min_vruntime, (u64)(-(1LL << 20))); +#endif #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h 
index 62fca54223a1..f9d3701481f1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -545,6 +545,7 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE + u64 core_min_vruntime; unsigned int forceidle_seq; u64 min_vruntime_fi; struct cfs_rq *core; --=20 2.39.3 From nobody Tue Dec 30 13:06:14 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id D1F4CC07548 for ; Wed, 15 Nov 2023 11:34:08 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234962AbjKOLeJ (ORCPT ); Wed, 15 Nov 2023 06:34:09 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41240 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234934AbjKOLd7 (ORCPT ); Wed, 15 Nov 2023 06:33:59 -0500 Received: from out30-113.freemail.mail.aliyun.com (out30-113.freemail.mail.aliyun.com [115.124.30.113]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id ED369CC for ; Wed, 15 Nov 2023 03:33:55 -0800 (PST) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R841e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=ay29a033018045170;MF=cruzzhao@linux.alibaba.com;NM=1;PH=DS;RN=12;SR=0;TI=SMTPD_---0VwSxxO8_1700048033; Received: from localhost.localdomain(mailfrom:CruzZhao@linux.alibaba.com fp:SMTPD_---0VwSxxO8_1700048033) by smtp.aliyun-inc.com; Wed, 15 Nov 2023 19:33:53 +0800 From: Cruz Zhao To: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, joel@joelfernandes.org Cc: linux-kernel@vger.kernel.org Subject: [PATCH 4/4] sched/core: fix cfs_prio_less Date: Wed, 15 Nov 2023 19:33:41 +0800 Message-Id: <20231115113341.13261-5-CruzZhao@linux.alibaba.com> 
X-Mailer: git-send-email 2.39.3 In-Reply-To: <20231115113341.13261-1-CruzZhao@linux.alibaba.com> References: <20231115113341.13261-1-CruzZhao@linux.alibaba.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" The update of vruntime snapshot will cause unfair sched, especially when tasks enqueue/dequeue frequently. Consider the following case: - Task A1 and A2 share a cookie, and task B has another cookie. - A1 is a short task, waking up frequently but running short every time. - A2 and B are long tasks. - A1 and B run on ht0 and A2 runs on ht1. ht0 ht1 fi_before fi update switch to A1 switch to A2 0 0 1 A1 sleeps switch to B A2 force idle 0 1 1 A1 wakes up switch to A1 switch to A1 1 0 1 A1 sleeps switch to B A2 force idle 0 1 1 In this case, cfs_rq->min_vruntime_fi will be updated on every schedule, and prio of B and A2 will be pulled to the same level, no matter how long A2 and B have run before, which is not fair enough. In an extreme case, we observed that the latency of a task became several minutes due to this reason, when it should be 100ms. To fix this problem, we compare the priority of ses using core_vruntime directly, instead of vruntime snapshot. Fixes: c6047c2e3af6 ("sched/fair: Snapshot the min_vruntime of CPUs on forc= e idle") Signed-off-by: Cruz Zhao --- kernel/sched/core.c | 17 ----------------- kernel/sched/fair.c | 35 +---------------------------------- kernel/sched/sched.h | 2 -- 3 files changed, 1 insertion(+), 53 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 647a12af9172..22edf4bcc7e8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6052,8 +6052,6 @@ static inline struct task_struct *pick_task(struct rq= *rq) BUG(); /* The idle class should always have a runnable task. 
*/ } =20 -extern void task_vruntime_update(struct rq *rq, struct task_struct *p, boo= l in_fi); - static void queue_core_balance(struct rq *rq); =20 static struct task_struct * @@ -6154,7 +6152,6 @@ pick_next_task(struct rq *rq, struct task_struct *pre= v, struct rq_flags *rf) * unconstrained picks as well. */ WARN_ON_ONCE(fi_before); - task_vruntime_update(rq, next, false); goto out_set_next; } } @@ -6204,8 +6201,6 @@ pick_next_task(struct rq *rq, struct task_struct *pre= v, struct rq_flags *rf) if (p =3D=3D rq_i->idle) { if (rq_i->nr_running) { rq->core->core_forceidle_count++; - if (!fi_before) - rq->core->core_forceidle_seq++; } } else { occ++; @@ -6245,17 +6240,6 @@ pick_next_task(struct rq *rq, struct task_struct *pr= ev, struct rq_flags *rf) if (!rq_i->core_pick) continue; =20 - /* - * Update for new !FI->FI transitions, or if continuing to be in !FI: - * fi_before fi update? - * 0 0 1 - * 0 1 1 - * 1 0 1 - * 1 1 0 - */ - if (!(fi_before && rq->core->core_forceidle_count)) - task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_= count); - rq_i->core_pick->core_occupation =3D occ; =20 if (i =3D=3D cpu) { @@ -6474,7 +6458,6 @@ static void sched_core_cpu_deactivate(unsigned int cp= u) core_rq->core_pick_seq =3D rq->core_pick_seq; core_rq->core_cookie =3D rq->core_cookie; core_rq->core_forceidle_count =3D rq->core_forceidle_count; - core_rq->core_forceidle_seq =3D rq->core_forceidle_seq; core_rq->core_forceidle_occupation =3D rq->core_forceidle_occupation; =20 /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 60b2fd437474..15c350b7c34a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12382,35 +12382,6 @@ static inline void task_tick_core(struct rq *rq, s= truct task_struct *curr) resched_curr(rq); } =20 -/* - * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if= needed. 
- */ -static void se_fi_update(const struct sched_entity *se, unsigned int fi_se= q, - bool forceidle) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq =3D cfs_rq_of(se); - - if (forceidle) { - if (cfs_rq->forceidle_seq =3D=3D fi_seq) - break; - cfs_rq->forceidle_seq =3D fi_seq; - } - - cfs_rq->min_vruntime_fi =3D cfs_rq->min_vruntime; - } -} - -void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) -{ - struct sched_entity *se =3D &p->se; - - if (p->sched_class !=3D &fair_sched_class) - return; - - se_fi_update(se, rq->core->core_forceidle_seq, in_fi); -} - bool cfs_prio_less(const struct task_struct *a, const struct task_struct *= b, bool in_fi) { @@ -12438,9 +12409,6 @@ bool cfs_prio_less(const struct task_struct *a, con= st struct task_struct *b, seb =3D parent_entity(seb); } =20 - se_fi_update(sea, rq->core->core_forceidle_seq, in_fi); - se_fi_update(seb, rq->core->core_forceidle_seq, in_fi); - cfs_rqa =3D sea->cfs_rq; cfs_rqb =3D seb->cfs_rq; #else @@ -12453,8 +12421,7 @@ bool cfs_prio_less(const struct task_struct *a, con= st struct task_struct *b, * min_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). */ - delta =3D (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + delta =3D (s64)(sea->core_vruntime - seb->core_vruntime); =20 return delta > 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f9d3701481f1..2ac89eb20973 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -546,7 +546,6 @@ struct cfs_rq { u64 min_vruntime; #ifdef CONFIG_SCHED_CORE u64 core_min_vruntime; - unsigned int forceidle_seq; u64 min_vruntime_fi; struct cfs_rq *core; #endif @@ -1134,7 +1133,6 @@ struct rq { unsigned int core_pick_seq; unsigned long core_cookie; unsigned int core_forceidle_count; - unsigned int core_forceidle_seq; unsigned int core_forceidle_occupation; u64 core_forceidle_start; #endif --=20 2.39.3