From nobody Mon Apr 6 23:58:41 2026
Message-ID: <20260317104342.467728645@infradead.org>
Date: Tue, 17 Mar 2026 10:51:14 +0100
From: Peter Zijlstra <peterz@infradead.org>
Subject: [RFC][PATCH 1/8] sched/debug: Collapse subsequent CONFIG_SCHED_CLASS_EXT sections
References: <20260317095113.387450089@infradead.org>

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/debug.c |   92 ++++++++++++++++++++++++----------------------
 1 file changed, 44 insertions(+), 48 deletions(-)

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -445,6 +445,8 @@ static const struct file_operations fair
 	.release	= single_release,
 };
 
+static struct dentry *debugfs_sched;
+
 #ifdef CONFIG_SCHED_CLASS_EXT
 static ssize_t
 sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf,
@@ -477,75 +479,92 @@ static const struct file_operations ext_
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
-#endif /* CONFIG_SCHED_CLASS_EXT */
 
 static ssize_t
-sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
-			       size_t cnt, loff_t *ppos)
+sched_ext_server_period_write(struct file *filp, const char __user *ubuf,
+			      size_t cnt, loff_t *ppos)
 {
 	long cpu = (long) ((struct seq_file *) filp->private_data)->private;
 	struct rq *rq = cpu_rq(cpu);
 
 	return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD,
-					 &rq->fair_server);
+					 &rq->ext_server);
 }
 
-static int sched_fair_server_period_show(struct seq_file *m, void *v)
+static int sched_ext_server_period_show(struct seq_file *m, void *v)
 {
 	unsigned long cpu = (unsigned long) m->private;
 	struct rq *rq = cpu_rq(cpu);
 
-	return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server);
+	return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server);
 }
 
-static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
+static int sched_ext_server_period_open(struct inode *inode, struct file *filp)
 {
-	return single_open(filp, sched_fair_server_period_show, inode->i_private);
+	return single_open(filp, sched_ext_server_period_show, inode->i_private);
 }
 
-static const struct file_operations fair_server_period_fops = {
-	.open		= sched_fair_server_period_open,
-	.write		= sched_fair_server_period_write,
+static const struct file_operations ext_server_period_fops = {
+	.open		= sched_ext_server_period_open,
+	.write		= sched_ext_server_period_write,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
 
-#ifdef CONFIG_SCHED_CLASS_EXT
+static void debugfs_ext_server_init(void)
+{
+	struct dentry *d_ext;
+	unsigned long cpu;
+
+	d_ext = debugfs_create_dir("ext_server", debugfs_sched);
+	if (!d_ext)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct dentry *d_cpu;
+		char buf[32];
+
+		snprintf(buf, sizeof(buf), "cpu%lu", cpu);
+		d_cpu = debugfs_create_dir(buf, d_ext);
+
+		debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops);
+		debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops);
+	}
+}
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
 static ssize_t
-sched_ext_server_period_write(struct file *filp, const char __user *ubuf,
-			      size_t cnt, loff_t *ppos)
+sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
+			       size_t cnt, loff_t *ppos)
 {
 	long cpu = (long) ((struct seq_file *) filp->private_data)->private;
 	struct rq *rq = cpu_rq(cpu);
 
 	return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD,
-					 &rq->ext_server);
+					 &rq->fair_server);
 }
 
-static int sched_ext_server_period_show(struct seq_file *m, void *v)
+static int sched_fair_server_period_show(struct seq_file *m, void *v)
 {
 	unsigned long cpu = (unsigned long) m->private;
 	struct rq *rq = cpu_rq(cpu);
 
-	return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server);
+	return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server);
 }
 
-static int sched_ext_server_period_open(struct inode *inode, struct file *filp)
+static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
 {
-	return single_open(filp, sched_ext_server_period_show, inode->i_private);
+	return single_open(filp, sched_fair_server_period_show, inode->i_private);
 }
 
-static const struct file_operations ext_server_period_fops = {
-	.open		= sched_ext_server_period_open,
-	.write		= sched_ext_server_period_write,
+static const struct file_operations fair_server_period_fops = {
+	.open		= sched_fair_server_period_open,
+	.write		= sched_fair_server_period_write,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
-#endif /* CONFIG_SCHED_CLASS_EXT */
-
-static struct dentry *debugfs_sched;
 
 static void debugfs_fair_server_init(void)
 {
@@ -568,29 +587,6 @@ static void debugfs_fair_server_init(voi
 	}
 }
 
-#ifdef CONFIG_SCHED_CLASS_EXT
-static void debugfs_ext_server_init(void)
-{
-	struct dentry *d_ext;
-	unsigned long cpu;
-
-	d_ext = debugfs_create_dir("ext_server", debugfs_sched);
-	if (!d_ext)
-		return;
-
-	for_each_possible_cpu(cpu) {
-		struct dentry *d_cpu;
-		char buf[32];
-
-		snprintf(buf, sizeof(buf), "cpu%lu", cpu);
-		d_cpu = debugfs_create_dir(buf, d_ext);
-
-		debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops);
-		debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops);
-	}
-}
-#endif /* CONFIG_SCHED_CLASS_EXT */
-
 static __init int sched_init_debug(void)
 {
 	struct dentry __maybe_unused *numa;
From nobody Mon Apr 6 23:58:41 2026
Message-ID: <20260317104342.586950604@infradead.org>
Date: Tue, 17 Mar 2026 10:51:15 +0100
From: Peter Zijlstra <peterz@infradead.org>
Subject: [RFC][PATCH 2/8] sched/fair: Add cgroup_mode switch
References: <20260317095113.387450089@infradead.org>

Since calc_group_shares() has issues with 'many' CPUs (the computed
per-CPU shares value ends up being roughly 1/nr_cpus of tg->shares),
prepare to add a few alternative methods.
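A back-of-the-envelope sketch of the problem (plain integers;
approx_shares() is an illustrative stand-in, assuming the group's load
is spread evenly over all CPUs, not code from this series):

/*
 * With tg->shares = 1024 (the default) and the group's load spread
 * evenly over nr_cpus runqueues, each per-CPU group entity gets
 *
 *	shares ~= tg->shares * (load/nr_cpus) / load = tg->shares / nr_cpus
 *
 * so on a 256-CPU machine the per-CPU weight degrades to ~4.
 */
static long approx_shares(long tg_shares, int nr_cpus)
{
	return tg_shares / nr_cpus;
}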
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/debug.c |   72 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |    1 
 2 files changed, 73 insertions(+)

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -587,6 +587,74 @@ static void debugfs_fair_server_init(voi
 	}
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+int cgroup_mode = 0;
+
+static const char *cgroup_mode_str[] = {
+	"smp",
+};
+
+static int sched_cgroup_mode(const char *str)
+{
+	for (int i = 0; i < ARRAY_SIZE(cgroup_mode_str); i++) {
+		if (!strcmp(str, cgroup_mode_str[i]))
+			return i;
+	}
+	return -EINVAL;
+}
+
+static ssize_t sched_cgroup_write(struct file *filp, const char __user *ubuf,
+				  size_t cnt, loff_t *ppos)
+{
+	char buf[16];
+	int mode;
+
+	if (cnt > 15)
+		cnt = 15;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+	mode = sched_cgroup_mode(strstrip(buf));
+	if (mode < 0)
+		return mode;
+
+	cgroup_mode = mode;
+
+	*ppos += cnt;
+	return cnt;
+}
+
+static int sched_cgroup_show(struct seq_file *m, void *v)
+{
+	for (int i = 0; i < ARRAY_SIZE(cgroup_mode_str); i++) {
+		if (cgroup_mode == i)
+			seq_puts(m, "(");
+		seq_puts(m, cgroup_mode_str[i]);
+		if (cgroup_mode == i)
+			seq_puts(m, ")");
+
+		seq_puts(m, " ");
+	}
+	seq_puts(m, "\n");
+	return 0;
+}
+
+static int sched_cgroup_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_cgroup_show, NULL);
+}
+
+static const struct file_operations sched_cgroup_fops = {
+	.open		= sched_cgroup_open,
+	.write		= sched_cgroup_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
 static __init int sched_init_debug(void)
 {
 	struct dentry __maybe_unused *numa;
@@ -624,6 +692,10 @@ static __init int sched_init_debug(void)
 
 	debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	debugfs_create_file("cgroup_mode", 0444, debugfs_sched, NULL, &sched_cgroup_fops);
+#endif
+
 	debugfs_fair_server_init();
 #ifdef CONFIG_SCHED_CLASS_EXT
 	debugfs_ext_server_init();
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -565,6 +565,7 @@ static inline struct task_group *css_tg(
 extern int tg_nop(struct task_group *tg, void *data);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+extern int cgroup_mode;
 extern void free_fair_sched_group(struct task_group *tg);
 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
 extern void online_fair_sched_group(struct task_group *tg);
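For illustration, a minimal userspace sketch for driving the new knob
(select_cgroup_mode() is hypothetical; it assumes debugfs mounted at
/sys/kernel/debug, and note the file is created 0444 above, so writing
needs root, whose CAP_DAC_OVERRIDE bypasses the mode bits):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Sketch: select a cgroup_mode by name, e.g. select_cgroup_mode("smp"). */
static int select_cgroup_mode(const char *mode)
{
	int fd = open("/sys/kernel/debug/sched/cgroup_mode", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, mode, strlen(mode)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

Reading the file back shows all known modes with the active one in
parentheses, matching sched_cgroup_show() above.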
From nobody Mon Apr 6 23:58:41 2026
Message-ID: <20260317104342.700967988@infradead.org>
Date: Tue, 17 Mar 2026 10:51:16 +0100
From: Peter Zijlstra <peterz@infradead.org>
Subject: [RFC][PATCH 3/8] sched/fair: Add cgroup_mode: UP
References: <20260317095113.387450089@infradead.org>

Instead of calculating the proportional fraction of tg->shares for
each CPU, just give each CPU the full measure, ignoring these pesky
SMP problems.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/debug.c |    3 ++-
 kernel/sched/fair.c  |   19 ++++++++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -588,9 +588,10 @@ static void debugfs_fair_server_init(voi
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-int cgroup_mode = 0;
+int cgroup_mode = 1;
 
 static const char *cgroup_mode_str[] = {
+	"up",
 	"smp",
 };
 
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4152,7 +4152,7 @@ static inline int throttled_hierarchy(st
  *
  * hence icky!
  */
-static long calc_group_shares(struct cfs_rq *cfs_rq)
+static long calc_smp_shares(struct cfs_rq *cfs_rq)
 {
 	long tg_weight, tg_shares, load, shares;
 	struct task_group *tg = cfs_rq->tg;
@@ -4187,6 +4187,23 @@ static long calc_group_shares(struct cfs
 }
 
 /*
+ * Ignore this pesky SMP stuff, use (4).
+ */
+static long calc_up_shares(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	return READ_ONCE(tg->shares);
+}
+
+static long calc_group_shares(struct cfs_rq *cfs_rq)
+{
+	if (cgroup_mode == 0)
+		return calc_up_shares(cfs_rq);
+
+	return calc_smp_shares(cfs_rq);
+}
+
+/*
  * Recomputes the group entity based on the current state of its group
  * runqueue.
 */
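Side by side, the two policies so far, as a plain-integer sketch
(up_shares()/smp_shares() are illustrative stand-ins, not the kernel
functions):

/*
 * up:  each CPU's group entity weighs the full tg->shares, so a group
 *      active on k CPUs carries roughly k * tg->shares total weight;
 * smp: the k entities together approximate tg->shares, i.e. roughly
 *      tg->shares / k each.
 */
static long up_shares(long tg_shares, int cpus_active)
{
	return tg_shares;		/* total: ~cpus_active * tg_shares */
}

static long smp_shares(long tg_shares, int cpus_active)
{
	return tg_shares / cpus_active;	/* total: ~tg_shares */
}

The trade-off: "up" restores a sane per-CPU weight but over-weights
groups that are runnable on many CPUs relative to cgroup v2 semantics.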
From nobody Mon Apr 6 23:58:41 2026
Message-ID: <20260317104342.815599388@infradead.org>
Date: Tue, 17 Mar 2026 10:51:17 +0100
From: Peter Zijlstra <peterz@infradead.org>
Subject: [RFC][PATCH 4/8] sched/fair: Add cgroup_mode: MAX
References: <20260317095113.387450089@infradead.org>

In order to avoid the CPU shares becoming a tiny '1/nr_cpus' fraction,
assume each cgroup is maximally concurrent and distribute
'nr_cpus * tg->shares', such that each CPU ends up with a 'tg->shares'
sized fraction (on average).

There is a corner case when a cgroup is minimally loaded, e.g. a single
spinner; therefore limit the CPU shares to that of a nice -20 task to
avoid getting too much load.

It was previously suggested to allow raising cpu.weight to
'100 * nr_cpus' to combat this same problem, but the problem there is
the above corner case: allowing multiple cgroups with such immense
weight onto the runqueue has significant problems. Not only would it
drown out the kthreads, it also risks overflowing the load values.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cpuset.h |    6 +++++
 kernel/cgroup/cpuset.c |   15 ++++++++++++++
 kernel/sched/debug.c   |    1 
 kernel/sched/fair.c    |   50 ++++++++++++++++++++++++++++++++++++------
 4 files changed, 67 insertions(+), 5 deletions(-)

--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -80,6 +80,7 @@ extern void lockdep_assert_cpuset_lock_h
 extern void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
+extern int cpuset_num_cpus(struct cgroup *cgroup);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -216,6 +217,11 @@ static inline bool cpuset_cpus_allowed_f
 	return false;
 }
 
+static inline int cpuset_num_cpus(struct cgroup *cgroup)
+{
+	return num_online_cpus();
+}
+
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 {
 	return node_possible_map;
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4097,6 +4097,21 @@ bool cpuset_cpus_allowed_fallback(struct
 	return changed;
 }
 
+int cpuset_num_cpus(struct cgroup *cgrp)
+{
+	int nr = num_online_cpus();
+	struct cpuset *cs;
+
+	if (is_in_v2_mode()) {
+		guard(rcu)();
+		cs = css_cs(cgroup_e_css(cgrp, &cpuset_cgrp_subsys));
+		if (cs)
+			nr = cpumask_weight(cs->effective_cpus);
+	}
+
+	return nr;
+}
+
 void __init cpuset_init_current_mems_allowed(void)
 {
 	nodes_setall(current->mems_allowed);
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -593,6 +593,7 @@ int cgroup_mode = 1;
 static const char *cgroup_mode_str[] = {
 	"up",
 	"smp",
+	"max",
 };
 
 static int sched_cgroup_mode(const char *str)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4150,12 +4150,10 @@ static inline int throttled_hierarchy(st
 *
 * hence icky!
 */
-static long calc_smp_shares(struct cfs_rq *cfs_rq)
+static long __calc_smp_shares(struct cfs_rq *cfs_rq, long tg_shares, long shares_max)
 {
-	long tg_weight, tg_shares, load, shares;
 	struct task_group *tg = cfs_rq->tg;
-
-	tg_shares = READ_ONCE(tg->shares);
+	long tg_weight, load, shares;
 
 	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 
@@ -4181,7 +4179,47 @@ static long calc_smp_shares(struct cfs_r
	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
	 * instead of 0.
	 */
-	return clamp_t(long, shares, MIN_SHARES, tg_shares);
+	return clamp_t(long, shares, MIN_SHARES, shares_max);
+}
+
+static int tg_cpus(struct task_group *tg)
+{
+	int nr = num_online_cpus();
+
+	if (cpusets_enabled()) {
+		struct cgroup *cgrp = tg->css.cgroup;
+		nr = cpuset_num_cpus(cgrp);
+	}
+
+	return nr;
+}
+
+/*
+ * Func: min(fraction(num_cpus * tg->shares), nice -20)
+ *
+ * Scale tg->shares by the maximal number of CPUs; but clip the max shares at
+ * nice -20, otherwise a single spinner on a 512 CPU machine would result in
+ * 512*NICE_0_LOAD, which is also crazy.
+ */
+static long calc_max_shares(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	int nr = tg_cpus(tg);
+	long tg_shares = READ_ONCE(tg->shares);
+	long max_shares = scale_load(sched_prio_to_weight[0]);
+	return __calc_smp_shares(cfs_rq, tg_shares * nr, max_shares);
+}
+
+/*
+ * Func: fraction(tg->shares)
+ *
+ * This infamously results in tiny shares when you have many CPUs.
+ */
+static long calc_smp_shares(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long tg_shares = READ_ONCE(tg->shares);
+	return __calc_smp_shares(cfs_rq, tg_shares, tg_shares);
 }
 
 /*
@@ -4197,6 +4235,8 @@ static long calc_group_shares(struct cfs
 {
 	if (cgroup_mode == 0)
 		return calc_up_shares(cfs_rq);
+	if (cgroup_mode == 2)
+		return calc_max_shares(cfs_rq);
 
 	return calc_smp_shares(cfs_rq);
 }
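A worked example of the clamp (illustrative; max_mode_example() is a
hypothetical name, sched_prio_to_weight[0] == 88761 is the nice -20
weight, and scale_load() is ignored for clarity):

/*
 * "max" mode for a lone spinner in a default cgroup (shares 1024)
 * on a 512-CPU box: the whole group load sits on one CPU, so the
 * fraction is ~1 and the raw result would be nr_cpus * tg->shares.
 */
static long max_mode_example(void)
{
	long tg_shares = 1024 * 512;	/* nr_cpus * tg->shares = 524288 */
	long nice_m20  = 88761;		/* sched_prio_to_weight[0] */
	long shares    = tg_shares;	/* ~512 * NICE_0_LOAD otherwise */

	if (shares > nice_m20)
		shares = nice_m20;	/* clipped to a nice -20 task */

	return shares;
}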
From nobody Mon Apr 6 23:58:41 2026
Message-ID: <20260317104342.931729160@infradead.org>
Date: Tue, 17 Mar 2026 10:51:18 +0100
From: Peter Zijlstra <peterz@infradead.org>
Subject: [RFC][PATCH 5/8] sched/fair: Add cgroup_mode: CONCUR
References: <20260317095113.387450089@infradead.org>

A variation of MAX: instead of assuming maximal concurrency, this
scales with 'min(nr_tasks, nr_cpus)'. This handles the low concurrency
cases more gracefully, with the exception of CPU affinity.

Note: the tracking of tg->tasks is somewhat expensive :-/

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/debug.c |    1 +
 kernel/sched/fair.c  |   38 +++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h |    3 +++
 3 files changed, 39 insertions(+), 3 deletions(-)

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -593,6 +593,7 @@ int cgroup_mode = 1;
 static const char *cgroup_mode_str[] = {
 	"up",
 	"smp",
+	"concur",
 	"max",
 };
 
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4210,6 +4210,30 @@ static long calc_max_shares(struct cfs_r
 	return __calc_smp_shares(cfs_rq, tg_shares * nr, max_shares);
 }
 
+static inline int tg_tasks(struct task_group *tg)
+{
+	return max(1, atomic_long_read(&tg->tasks));
+}
+
+/*
+ * Func: min(fraction(num * tg->shares), nice -20); where
+ *       num = min(nr_tasks, nr_cpus)
+ *
+ * Similar to max, except scale with min(nr_tasks, nr_cpus), which gives
+ * a far more natural distribution. Can still create edge cases using
+ * CPU affinity.
+ */
+static long calc_concur_shares(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	int nr_cpus = tg_cpus(tg);
+	int nr_tasks = tg_tasks(tg);
+	int nr = min(nr_tasks, nr_cpus);
+	long tg_shares = READ_ONCE(tg->shares);
+	long max_shares = scale_load(sched_prio_to_weight[0]);
+	return __calc_smp_shares(cfs_rq, nr * tg_shares, max_shares);
+}
+
 /*
  * Func: fraction(tg->shares)
  *
@@ -4236,6 +4260,8 @@ static long calc_group_shares(struct cfs
 	if (cgroup_mode == 0)
 		return calc_up_shares(cfs_rq);
 	if (cgroup_mode == 2)
+		return calc_concur_shares(cfs_rq);
+	if (cgroup_mode == 3)
 		return calc_max_shares(cfs_rq);
 
 	return calc_smp_shares(cfs_rq);
@@ -4381,7 +4407,7 @@ static inline bool cfs_rq_is_decayed(str
 */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
-	long delta;
+	long delta, dt;
 	u64 now;
 
 	/*
@@ -4403,16 +4429,19 @@ static inline void update_tg_load_avg(st
 		return;
 
 	delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
-	if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+	dt = cfs_rq->h_nr_queued - cfs_rq->tg_tasks_contrib;
+	if (dt || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
+		atomic_long_add(dt, &cfs_rq->tg->tasks);
 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+		cfs_rq->tg_tasks_contrib = cfs_rq->h_nr_queued;
 		cfs_rq->last_update_tg_load_avg = now;
 	}
 }
 
 static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
 {
-	long delta;
+	long delta, dt;
 	u64 now;
 
 	/*
@@ -4423,8 +4452,11 @@ static inline void clear_tg_load_avg(str
 
 	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
 	delta = 0 - cfs_rq->tg_load_avg_contrib;
+	dt = 0 - cfs_rq->tg_tasks_contrib;
 	atomic_long_add(delta, &cfs_rq->tg->load_avg);
+	atomic_long_add(dt, &cfs_rq->tg->tasks);
 	cfs_rq->tg_load_avg_contrib = 0;
+	cfs_rq->tg_tasks_contrib = 0;
 	cfs_rq->last_update_tg_load_avg = now;
 }
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -491,6 +491,8 @@ struct task_group {
	 * will also be accessed at each tick.
	 */
 	atomic_long_t		load_avg ____cacheline_aligned;
+	atomic_long_t		tasks;
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -720,6 +722,7 @@ struct cfs_rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u64			last_update_tg_load_avg;
 	unsigned long		tg_load_avg_contrib;
+	unsigned long		tg_tasks_contrib;
 	long			propagate;
 	long			prop_runnable_sum;
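A plain-integer sketch of the scaling rule (concur_scale() is
illustrative only, not the kernel function):

/*
 * "concur": a group with 3 runnable tasks on a 512-CPU machine is
 * scaled by min(3, 512) = 3 instead of 512, so a mostly idle group
 * is no longer boosted as if it could occupy every CPU.
 */
static long concur_scale(long tg_shares, int nr_tasks, int nr_cpus)
{
	int nr = nr_tasks < nr_cpus ? nr_tasks : nr_cpus;

	return nr * tg_shares;	/* then clamped at the nice -20 weight */
}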
From nobody Mon Apr 6 23:58:41 2026
Message-ID: <20260317104343.103915618@infradead.org>
Date: Tue, 17 Mar 2026 10:51:19 +0100
From: Peter Zijlstra <peterz@infradead.org>
Subject: [RFC][PATCH 6/8] sched/fair: Add newidle balance to pick_task_fair()
References: <20260317095113.387450089@infradead.org>

With commit 50653216e4ff ("sched: Add support to pick functions to
take rf") removing the balance callback, the pick_task() callback is
in charge of newidle balancing. This means pick_task_fair() should do
so too.

This hasn't been a problem in practice because pick_next_task_fair()
is used. However, since we'll be removing that one shortly, make sure
pick_task_fair() is up to scratch.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c |   13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8892,16 +8892,18 @@ static void wakeup_preempt_fair(struct r
 }
 
 static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
+	__must_hold(__rq_lockp(rq))
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
 	struct task_struct *p;
 	bool throttled;
+	int new_tasks;
 
 again:
 	cfs_rq = &rq->cfs;
 	if (!cfs_rq->nr_queued)
-		return NULL;
+		goto idle;
 
 	throttled = false;
 
@@ -8922,6 +8924,14 @@ static struct task_struct *pick_task_fai
 	if (unlikely(throttled))
 		task_throttle_setup_work(p);
 	return p;
+
+idle:
+	new_tasks = sched_balance_newidle(rq, rf);
+	if (new_tasks < 0)
+		return RETRY_TASK;
+	if (new_tasks > 0)
+		goto again;
+	return NULL;
 }
 
 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
@@ -9011,6 +9021,7 @@ pick_next_task_fair(struct rq *rq, struc
 
 static struct task_struct *
 fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
+	__must_hold(__rq_lockp(dl_se->rq))
 {
 	return pick_task_fair(dl_se->rq, rf);
 }
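For context, the caller-side contract the new idle path must honour,
reduced to a sketch (pick_sketch() is illustrative, not kernel code;
the real restart in __pick_next_task() re-walks all scheduling
classes, not just fair):

static struct task_struct *pick_sketch(struct rq *rq, struct rq_flags *rf)
{
	struct task_struct *p;

restart:
	p = pick_task_fair(rq, rf);
	if (p == RETRY_TASK)
		goto restart;	/* rq->lock was dropped; re-pick everything */
	if (!p)
		p = pick_task_idle(rq, rf);	/* idle is always runnable */
	return p;
}

The RETRY_TASK return exists because sched_balance_newidle() can drop
and re-take rq->lock, after which a higher-priority task may have
appeared.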
From nobody Mon Apr 6 23:58:41 2026
Message-ID: <20260317104343.225156112@infradead.org>
Date: Tue, 17 Mar 2026 10:51:20 +0100
From: Peter Zijlstra <peterz@infradead.org>
Subject: [RFC][PATCH 7/8] sched: Remove sched_class::pick_next_task()
References: <20260317095113.387450089@infradead.org>

The reason for pick_next_task_fair() is the put/set optimization that
avoids touching the common ancestors. However, it is possible to
implement this in the put_prev_task() and set_next_task() calls as
used in put_prev_set_next_task().

Notably, put_prev_set_next_task() is the only site that:

 - calls put_prev_task() with a .next argument;
 - calls set_next_task() with .first = true.

This means that put_prev_task() can determine the common hierarchy and
stop there, and then set_next_task() can terminate where
put_prev_task() stopped.
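Sketched differently (put_prev_hierarchy() is a hypothetical name, a
simplified rendering of the scheme using the helpers from the patch
below, not code from this series): the put side walks prev's entities
bottom-up and stops at the first runqueue it shares with next.

static void put_prev_hierarchy(struct sched_entity *pse,
			       struct sched_entity *nse)
{
	while (pse) {
		/* keep next's cursor at a comparable depth */
		while (nse && nse->depth > pse->depth)
			nse = parent_entity(nse);
		if (nse && is_same_group(pse, nse))
			break;		/* common ancestor: stop putting */
		put_prev_entity(cfs_rq_of(pse), pse);
		pse = parent_entity(pse);
	}
}

set_next_task(.first = true) then only installs next's entities up to
that same point, which is what the cfs_rq->curr checks in the patch
implement.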
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c  |   27 ++-----
 kernel/sched/fair.c  |  153 ++++++++++++++-------------------------------
 kernel/sched/sched.h |   14 ----
 3 files changed, 52 insertions(+), 142 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5924,16 +5924,15 @@ __pick_next_task(struct rq *rq, struct t
 	if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
 		   rq->nr_running == rq->cfs.h_nr_queued)) {
 
-		p = pick_next_task_fair(rq, prev, rf);
+		p = pick_task_fair(rq, rf);
 		if (unlikely(p == RETRY_TASK))
 			goto restart;
 
 		/* Assume the next prioritized class is idle_sched_class */
-		if (!p) {
+		if (!p)
 			p = pick_task_idle(rq, rf);
-			put_prev_set_next_task(rq, prev, p);
-		}
 
+		put_prev_set_next_task(rq, prev, p);
 		return p;
 	}
 
@@ -5941,20 +5940,12 @@ __pick_next_task(struct rq *rq, struct t
 	prev_balance(rq, prev, rf);
 
 	for_each_active_class(class) {
-		if (class->pick_next_task) {
-			p = class->pick_next_task(rq, prev, rf);
-			if (unlikely(p == RETRY_TASK))
-				goto restart;
-			if (p)
-				return p;
-		} else {
-			p = class->pick_task(rq, rf);
-			if (unlikely(p == RETRY_TASK))
-				goto restart;
-			if (p) {
-				put_prev_set_next_task(rq, prev, p);
-				return p;
-			}
+		p = class->pick_task(rq, rf);
+		if (unlikely(p == RETRY_TASK))
+			goto restart;
+		if (p) {
+			put_prev_set_next_task(rq, prev, p);
+			return p;
 		}
 	}
 
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8891,7 +8891,7 @@ static void wakeup_preempt_fair(struct r
 		resched_curr_lazy(rq);
 }
 
-static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
+struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
 	__must_hold(__rq_lockp(rq))
 {
 	struct sched_entity *se;
@@ -8934,91 +8934,6 @@ static struct task_struct *pick_task_fai
 	return NULL;
 }
 
-static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
-
-struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
-	__must_hold(__rq_lockp(rq))
-{
-	struct sched_entity *se;
-	struct task_struct *p;
-	int new_tasks;
-
-again:
-	p = pick_task_fair(rq, rf);
-	if (!p)
-		goto idle;
-	se = &p->se;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (prev->sched_class != &fair_sched_class)
-		goto simple;
-
-	__put_prev_set_next_dl_server(rq, prev, p);
-
-	/*
-	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
-	 * likely that a next task is from the same cgroup as the current.
-	 *
-	 * Therefore attempt to avoid putting and setting the entire cgroup
-	 * hierarchy, only change the part that actually changes.
-	 *
-	 * Since we haven't yet done put_prev_entity and if the selected task
-	 * is a different task than we started out with, try and touch the
-	 * least amount of cfs_rqs.
-	 */
-	if (prev != p) {
-		struct sched_entity *pse = &prev->se;
-		struct cfs_rq *cfs_rq;
-
-		while (!(cfs_rq = is_same_group(se, pse))) {
-			int se_depth = se->depth;
-			int pse_depth = pse->depth;
-
-			if (se_depth <= pse_depth) {
-				put_prev_entity(cfs_rq_of(pse), pse);
-				pse = parent_entity(pse);
-			}
-			if (se_depth >= pse_depth) {
-				set_next_entity(cfs_rq_of(se), se, true);
-				se = parent_entity(se);
-			}
-		}
-
-		put_prev_entity(cfs_rq, pse);
-		set_next_entity(cfs_rq, se, true);
-
-		__set_next_task_fair(rq, p, true);
-	}
-
-	return p;
-
-simple:
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-	put_prev_set_next_task(rq, prev, p);
-	return p;
-
-idle:
-	if (rf) {
-		new_tasks = sched_balance_newidle(rq, rf);
-
-		/*
-		 * Because sched_balance_newidle() releases (and re-acquires)
-		 * rq->lock, it is possible for any higher priority task to
-		 * appear. In that case we must re-start the pick_next_entity()
-		 * loop.
-		 */
-		if (new_tasks < 0)
-			return RETRY_TASK;
-
-		if (new_tasks > 0)
-			goto again;
-	}
-
-	return NULL;
-}
-
 static struct task_struct *
 fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
 	__must_hold(__rq_lockp(dl_se->rq))
@@ -9042,10 +8957,28 @@ static void put_prev_task_fair(struct rq
 {
 	struct sched_entity *se = &prev->se;
 	struct cfs_rq *cfs_rq;
+	struct sched_entity *nse = NULL;
 
-	for_each_sched_entity(se) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (next && next->sched_class == &fair_sched_class)
+		nse = &next->se;
+#endif
+
+	while (se) {
 		cfs_rq = cfs_rq_of(se);
-		put_prev_entity(cfs_rq, se);
+		if (!nse || cfs_rq->curr)
+			put_prev_entity(cfs_rq, se);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		if (nse) {
+			if (is_same_group(se, nse))
+				break;
+			if (nse->depth >= se->depth)
+				nse = parent_entity(nse);
+			if (nse->depth > se->depth)
+				continue;
+		}
+#endif
+		se = parent_entity(se);
 	}
 }
 
@@ -13566,10 +13499,30 @@ static void switched_to_fair(struct rq *
 	}
 }
 
-static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+/*
+ * Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 {
 	struct sched_entity *se = &p->se;
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) &&
+		    first && cfs_rq->curr)
+			break;
+
+		set_next_entity(cfs_rq, se, true);
+		/* ensure bandwidth has been allocated on our new cfs_rq */
+		account_cfs_rq_runtime(cfs_rq, 0);
+	}
+
+	se = &p->se;
+
 	if (task_on_rq_queued(p)) {
 		/*
		 * Move the next running task to the front of the list, so our
@@ -13589,27 +13542,6 @@ static void __set_next_task_fair(struct
 	sched_fair_update_stop_tick(rq, p);
 }
 
-/*
- * Account for a task changing its policy or group.
- *
- * This routine is mostly called to set cfs_rq->curr field when a task
- * migrates between groups/classes.
- */
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
-{
-	struct sched_entity *se = &p->se;
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-		set_next_entity(cfs_rq, se, first);
-		/* ensure bandwidth has been allocated on our new cfs_rq */
-		account_cfs_rq_runtime(cfs_rq, 0);
-	}
-
-	__set_next_task_fair(rq, p, first);
-}
-
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
@@ -13921,7 +13853,6 @@ DEFINE_SCHED_CLASS(fair) = {
 	.wakeup_preempt		= wakeup_preempt_fair,
 
 	.pick_task		= pick_task_fair,
-	.pick_next_task		= pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
 	.set_next_task		= set_next_task_fair,
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2538,17 +2538,6 @@ struct sched_class {
	 * schedule/pick_next_task: rq->lock
	 */
 	struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
-	/*
-	 * Optional! When implemented pick_next_task() should be equivalent to:
-	 *
-	 *   next = pick_task();
-	 *   if (next) {
-	 *       put_prev_task(prev);
-	 *       set_next_task_first(next);
-	 *   }
-	 */
-	struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
-					      struct rq_flags *rf);
 
	/*
	 * sched_change:
@@ -2761,8 +2750,7 @@ static inline bool sched_fair_runnable(s
 	return rq->cfs.nr_queued > 0;
 }
 
-extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
-					       struct rq_flags *rf);
+extern struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf);
 extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
 
 #define SCA_CHECK		0x01
Subject:Cc:To:From:Date:Message-ID:Sender:Reply-To:Content-Transfer-Encoding: Content-ID:Content-Description:In-Reply-To; bh=PPU88KCYcFvenIxKTUc2FVGZlR/VTRyGBt2Hgt29noI=; b=m2+t5n/tJZsXK/SLfTFVgADkgN wlKmKhau0LEfjv5hK+sGeY8KO0lunsFAYMAEA/6ihx9AF/dSUrXJv4jhVDuGskN9FFd/DGMz7+qyh R2s6iiux7TqfnLlknHDn5ND+F1WJTMUH1qigCYc9WRzWYGO2AFWOBXy7sf3emqz8YGahWcDPyn8r1 Clo9mwp4pS+1/qe+8sIStDwWn0aDAQP/ZSu9xhfNsx2YobMVR4GZ+qCEX5SOxDSCnSF3DCsWYEpsu iwsHrs1rKZlHXgzvnEA/pJOxhICM83NSbLo3PjqzMxDDXuY6yUiZeG6RlFUD+UFZXIOF1sXPO9xkt CaEZJcig==; Received: from 77-249-17-252.cable.dynamic.v4.ziggo.nl ([77.249.17.252] helo=noisy.programming.kicks-ass.net) by desiato.infradead.org with esmtpsa (Exim 4.98.2 #2 (Red Hat Linux)) id 1w2RxZ-00000008kbx-06Ob; Tue, 17 Mar 2026 10:47:37 +0000 Received: by noisy.programming.kicks-ass.net (Postfix, from userid 0) id 38C363032FE; Tue, 17 Mar 2026 11:47:35 +0100 (CET) Message-ID: <20260317104343.338573840@infradead.org> User-Agent: quilt/0.68 Date: Tue, 17 Mar 2026 10:51:21 +0100 From: Peter Zijlstra To: mingo@kernel.org Cc: longman@redhat.com, chenridong@huaweicloud.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, vschneid@redhat.com, tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com, cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, jstultz@google.com, kprateek.nayak@amd.com Subject: [RFC][PATCH 8/8] sched/eevdf: Move to a single runqueue References: <20260317095113.387450089@infradead.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Change fair/cgroup to a single runqueue. Infamously fair/cgroup isn't working for a number of people; typically the complaint is latencies and/or overhead. The latency issue is due to the intermediate entries that represent a combination of tasks and thereby obfuscate the runnability of tasks. The approach here is to leave the cgroup hierarchy as is; including the intermediate enqueue/dequeue but move the actual EEVDF runqueue outside. This means things like the shares_weight approximation are fully preserved. That is, given a hierarchy like: R | se--G1 / \ G2--se se--G3 / \ | T1--se se--T2 se--T3 This is fully maintained for load tracking, however the EEVDF parts of cfs_rq/se go unused for the intermediates and are instead connected like: _R_ / | \ T1 T2 T3 Since the effective weight of the entities is determined by the hierarchy, this gets recomputed on enqueue,set_next_task and tick. Notably, the effective weight (se->h_load) is computed from the hierarchical fraction: se->load / cfs_rq->load. Since EEVDF is now exclusive operating on rq->cfs, it needs to consider cfs_rq->h_nr_queued rather than cfs_rq->nr_queued. Similarly, only tasks can get delayed, simplifying some of the cgroup cleanup. One place where additional information was required was set_next_task() / put_prev_task(), where we need to track 'current' both in the hierarchical sense (cfs_rq->h_curr) and in the flat sense (cfs_rq->curr). As a result of only having a single level to pick from, much of the complications in pick_next_task() and preemption go away. Since many of the hierarchical operations are still there, this won't immediately fix the performance issues, but hopefully it will fix some of the latency issues. 
TODO: split struct cfs_rq / struct sched_entity
TODO: try and get rid of h_curr

Signed-off-by: Peter Zijlstra (Intel)
---
 include/linux/sched.h |   1 
 kernel/sched/debug.c  |   7 
 kernel/sched/fair.c   | 795 +++++++++++++++++++++-----------------------
 kernel/sched/sched.h  |   2 
 4 files changed, 346 insertions(+), 459 deletions(-)

--- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -575,6 +575,7 @@ struct sched_statistics { struct sched_entity { /* For load-balancing: */ struct load_weight load; + struct load_weight h_load; struct rb_node run_node; u64 deadline; u64 min_vruntime; --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -908,8 +908,9 @@ print_task(struct seq_file *m, struct rq else SEQ_printf(m, " %c", task_state_to_char(p)); =20 - SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.= %06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %10ld %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld= %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), + p->se.h_load.weight, SPLIT_NS(p->se.vruntime), entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', SPLIT_NS(p->se.deadline), @@ -940,7 +941,7 @@ static void print_rq(struct seq_file *m, =20 SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID vruntime eligible " + SEQ_printf(m, " S task PID weight vruntime eligi= ble " "deadline slice sum-exec switches " "prio wait-time sum-sleep sum-block" #ifdef CONFIG_NUMA_BALANCING @@ -1046,6 +1047,8 @@ void print_cfs_rq(struct seq_file *m, in cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %lu\n", "h_load", + cfs_rq->h_load); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CFS_BANDWIDTH SEQ_printf(m, " .%-30s: %d\n", "throttled", --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -296,8 +296,8 @@ static u64 __calc_delta(u64 delta_exec, */ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) { - if (unlikely(se->load.weight !=3D NICE_0_LOAD)) - delta =3D __calc_delta(delta, NICE_0_LOAD, &se->load); + if (se->h_load.weight !=3D NICE_0_LOAD) + delta =3D __calc_delta(delta, NICE_0_LOAD, &se->h_load); =20 return delta; } @@ -427,38 +427,6 @@ static inline struct sched_entity *paren return se->parent; } =20 -static void -find_matching_se(struct sched_entity **se, struct sched_entity **pse) -{ - int se_depth, pse_depth; - - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent.
- */ - - /* First walk up until both entities are at same depth */ - se_depth =3D (*se)->depth; - pse_depth =3D (*pse)->depth; - - while (se_depth > pse_depth) { - se_depth--; - *se =3D parent_entity(*se); - } - - while (pse_depth > se_depth) { - pse_depth--; - *pse =3D parent_entity(*pse); - } - - while (!is_same_group(*se, *pse)) { - *se =3D parent_entity(*se); - *pse =3D parent_entity(*pse); - } -} - static int tg_is_idle(struct task_group *tg) { return tg->idle > 0; @@ -502,11 +470,6 @@ static inline struct sched_entity *paren return NULL; } =20 -static inline void -find_matching_se(struct sched_entity **se, struct sched_entity **pse) -{ -} - static inline int tg_is_idle(struct task_group *tg) { return 0; @@ -685,7 +648,7 @@ static inline unsigned long avg_vruntime static inline void __sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long weight =3D avg_vruntime_weight(cfs_rq, se->load.weight); + unsigned long weight =3D avg_vruntime_weight(cfs_rq, se->h_load.weight); s64 w_vruntime, key =3D entity_key(cfs_rq, se); =20 w_vruntime =3D key * weight; @@ -702,7 +665,7 @@ sum_w_vruntime_add_paranoid(struct cfs_r s64 key, tmp; =20 again: - weight =3D avg_vruntime_weight(cfs_rq, se->load.weight); + weight =3D avg_vruntime_weight(cfs_rq, se->h_load.weight); key =3D entity_key(cfs_rq, se); =20 if (check_mul_overflow(key, weight, &key)) @@ -748,7 +711,7 @@ sum_w_vruntime_add(struct cfs_rq *cfs_rq static void sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long weight =3D avg_vruntime_weight(cfs_rq, se->load.weight); + unsigned long weight =3D avg_vruntime_weight(cfs_rq, se->h_load.weight); s64 key =3D entity_key(cfs_rq, se); =20 cfs_rq->sum_w_vruntime -=3D key * weight; @@ -790,7 +753,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) s64 runtime =3D cfs_rq->sum_w_vruntime; =20 if (curr) { - unsigned long w =3D avg_vruntime_weight(cfs_rq, curr->load.weight); + unsigned long w =3D avg_vruntime_weight(cfs_rq, curr->h_load.weight); =20 runtime +=3D entity_key(cfs_rq, curr) * w; weight +=3D w; @@ -842,8 +805,6 @@ static s64 entity_lag(struct cfs_rq *cfs =20 static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *= se) { - WARN_ON_ONCE(!se->on_rq); - se->vlag =3D entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); } =20 @@ -871,7 +832,7 @@ static int vruntime_eligible(struct cfs_ long load =3D cfs_rq->sum_weight; =20 if (curr && curr->on_rq) { - unsigned long weight =3D avg_vruntime_weight(cfs_rq, curr->load.weight); + unsigned long weight =3D avg_vruntime_weight(cfs_rq, curr->h_load.weight= ); =20 avg +=3D entity_key(cfs_rq, curr) * weight; load +=3D weight; @@ -983,6 +944,9 @@ RB_DECLARE_CALLBACKS(static, min_vruntim */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *s= e) { + WARN_ON_ONCE(&rq_of(cfs_rq)->cfs !=3D cfs_rq); + WARN_ON_ONCE(!entity_is_task(se)); + sum_w_vruntime_add(cfs_rq, se); se->min_vruntime =3D se->vruntime; se->min_slice =3D se->slice; @@ -992,6 +956,9 @@ static void __enqueue_entity(struct cfs_ =20 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *s= e) { + WARN_ON_ONCE(&rq_of(cfs_rq)->cfs !=3D cfs_rq); + WARN_ON_ONCE(!entity_is_task(se)); + rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); sum_w_vruntime_sub(cfs_rq, se); @@ -1077,7 +1044,7 @@ static inline void cancel_protect_slice( * * Which allows tree pruning through eligibility. 
*/ -static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool prote= ct) +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect) { struct rb_node *node =3D cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se =3D __pick_first_entity(cfs_rq); @@ -1088,7 +1055,7 @@ static struct sched_entity *__pick_eevdf * We can safely skip eligibility check if there is only one entity * in this cfs_rq, saving some cycles. */ - if (cfs_rq->nr_queued =3D=3D 1) + if (cfs_rq->h_nr_queued =3D=3D 1) return curr && curr->on_rq ? curr : se; =20 /* @@ -1148,11 +1115,6 @@ static struct sched_entity *__pick_eevdf return best; } =20 -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) -{ - return __pick_eevdf(cfs_rq, true); -} - struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last =3D rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -1339,8 +1301,6 @@ static s64 update_se(struct rq *rq, stru return delta_exec; } =20 -static void set_next_buddy(struct sched_entity *se); - /* * Used by other classes to account runtime. */ @@ -1360,7 +1320,7 @@ static void update_curr(struct cfs_rq *c * not necessarily be the actual task running * (rq->curr.se). This is easy to confuse! */ - struct sched_entity *curr =3D cfs_rq->curr; + struct sched_entity *curr =3D cfs_rq->h_curr; struct rq *rq =3D rq_of(cfs_rq); s64 delta_exec; bool resched; @@ -1372,26 +1332,29 @@ static void update_curr(struct cfs_rq *c if (unlikely(delta_exec <=3D 0)) return; =20 + account_cfs_rq_runtime(cfs_rq, delta_exec); + + if (!entity_is_task(curr)) + return; + + cfs_rq =3D &rq->cfs; + curr->vruntime +=3D calc_delta_fair(delta_exec, curr); resched =3D update_deadline(cfs_rq, curr); =20 - if (entity_is_task(curr)) { - /* - * If the fair_server is active, we need to account for the - * fair_server time whether or not the task is running on - * behalf of fair_server or not: - * - If the task is running on behalf of fair_server, we need - * to limit its time based on the assigned runtime. - * - Fair task that runs outside of fair_server should account - * against fair_server such that it can account for this time - * and possibly avoid running this period. - */ - dl_server_update(&rq->fair_server, delta_exec); - } - - account_cfs_rq_runtime(cfs_rq, delta_exec); + /* + * If the fair_server is active, we need to account for the + * fair_server time whether or not the task is running on + * behalf of fair_server or not: + * - If the task is running on behalf of fair_server, we need + * to limit its time based on the assigned runtime. + * - Fair task that runs outside of fair_server should account + * against fair_server such that it can account for this time + * and possibly avoid running this period. + */ + dl_server_update(&rq->fair_server, delta_exec); =20 - if (cfs_rq->nr_queued =3D=3D 1) + if (cfs_rq->h_nr_queued =3D=3D 1) return; =20 if (resched || !protect_slice(curr)) { @@ -1402,7 +1365,10 @@ static void update_curr(struct cfs_rq *c =20 static void update_curr_fair(struct rq *rq) { - update_curr(cfs_rq_of(&rq->donor->se)); + struct sched_entity *se =3D &rq->donor->se; + + for_each_sched_entity(se) + update_curr(cfs_rq_of(se)); } =20 static inline void @@ -1478,7 +1444,7 @@ update_stats_enqueue_fair(struct cfs_rq * Are we enqueueing a waiting task? 
(for current tasks * a dequeue/enqueue event is a NOP) */ - if (se !=3D cfs_rq->curr) + if (se !=3D cfs_rq->h_curr) update_stats_wait_start_fair(cfs_rq, se); =20 if (flags & ENQUEUE_WAKEUP) @@ -1496,7 +1462,7 @@ update_stats_dequeue_fair(struct cfs_rq * Mark the end of the wait period if dequeueing a * waiting task: */ - if (se !=3D cfs_rq->curr) + if (se !=3D cfs_rq->h_curr) update_stats_wait_end_fair(cfs_rq, se); =20 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { @@ -3823,6 +3789,7 @@ static inline void update_scan_period(st static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { + WARN_ON_ONCE(cfs_rq !=3D cfs_rq_of(se)); update_load_add(&cfs_rq->load, se->load.weight); if (entity_is_task(se)) { struct rq *rq =3D rq_of(cfs_rq); @@ -3836,6 +3803,7 @@ account_entity_enqueue(struct cfs_rq *cf static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { + WARN_ON_ONCE(cfs_rq !=3D cfs_rq_of(se)); update_load_sub(&cfs_rq->load, se->load.weight); if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); @@ -3913,7 +3881,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, static void rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vpr= ot) { - unsigned long old_weight =3D se->load.weight; + long old_weight =3D se->h_load.weight; =20 /* * VRUNTIME @@ -4013,16 +3981,17 @@ rescale_entity(struct sched_entity *se, se->vprot =3D div64_long(se->vprot * old_weight, weight); } =20 -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) +static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight, bool on_rq) { bool curr =3D cfs_rq->curr =3D=3D se; bool rel_vprot =3D false; u64 avruntime =3D 0; =20 - if (se->on_rq) { - /* commit outstanding execution time */ - update_curr(cfs_rq); + if (se->h_load.weight =3D=3D weight) + return; + + if (on_rq) { avruntime =3D avg_vruntime(cfs_rq); se->vlag =3D entity_lag(cfs_rq, se, avruntime); se->deadline -=3D avruntime; @@ -4032,46 +4001,79 @@ static void reweight_entity(struct cfs_r rel_vprot =3D true; } =20 - cfs_rq->nr_queued--; + cfs_rq->h_nr_queued--; if (!curr) __dequeue_entity(cfs_rq, se); - update_load_sub(&cfs_rq->load, se->load.weight); } - dequeue_load_avg(cfs_rq, se); =20 rescale_entity(se, weight, rel_vprot); =20 - update_load_set(&se->load, weight); + update_load_set(&se->h_load, weight); =20 - do { - u32 divider =3D get_pelt_divider(&se->avg); - se->avg.load_avg =3D div_u64(se_weight(se) * se->avg.load_sum, divider); - } while (0); - - enqueue_load_avg(cfs_rq, se); - if (se->on_rq) { + if (on_rq) { if (rel_vprot) se->vprot +=3D avruntime; se->deadline +=3D avruntime; se->rel_deadline =3D 0; se->vruntime =3D avruntime - se->vlag; =20 - update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); - cfs_rq->nr_queued++; + cfs_rq->h_nr_queued++; } } =20 +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->load.weight =3D=3D weight) + return; + + if (se->on_rq) { + WARN_ON_ONCE(cfs_rq !=3D cfs_rq_of(se)); + update_load_sub(&cfs_rq->load, se->load.weight); + } + dequeue_load_avg(cfs_rq, se); + + update_load_set(&se->load, weight); + + do { + u32 divider =3D get_pelt_divider(&se->avg); + se->avg.load_avg =3D div_u64(se_weight(se) * se->avg.load_sum, divider); + } while (0); + + enqueue_load_avg(cfs_rq, se); + + if (se->on_rq) + update_load_add(&cfs_rq->load, se->load.weight); +} + static void 
reweight_task_fair(struct rq *rq, struct task_struct *p, const struct load_weight *lw) { struct sched_entity *se =3D &p->se; - struct cfs_rq *cfs_rq =3D cfs_rq_of(se); - struct load_weight *load =3D &se->load; + unsigned long weight =3D NICE_0_LOAD; =20 - reweight_entity(cfs_rq, se, lw->weight); - load->inv_weight =3D lw->inv_weight; + if (se->on_rq) + update_curr_fair(rq); + + reweight_entity(cfs_rq_of(se), se, lw->weight); + se->load.inv_weight =3D lw->inv_weight; + + if (!se->on_rq) + return; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq =3D cfs_rq_of(se); + + weight *=3D se->load.weight; + if (parent_entity(se)) + weight /=3D cfs_rq->load.weight; + } + + weight /=3D NICE_0_LOAD; + + reweight_eevdf(&rq->cfs, &p->se, weight, p->se.on_rq); } =20 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -4272,7 +4274,6 @@ static long calc_group_shares(struct cfs static void update_cfs_group(struct sched_entity *se) { struct cfs_rq *gcfs_rq =3D group_cfs_rq(se); - long shares; =20 /* * When a group becomes empty, preserve its weight. This matters for @@ -4281,9 +4282,7 @@ static void update_cfs_group(struct sche if (!gcfs_rq || !gcfs_rq->load.weight) return; =20 - shares =3D calc_group_shares(gcfs_rq); - if (unlikely(se->load.weight !=3D shares)) - reweight_entity(cfs_rq_of(se), se, shares); + reweight_entity(cfs_rq_of(se), se, calc_group_shares(gcfs_rq)); } =20 #else /* !CONFIG_FAIR_GROUP_SCHED: */ @@ -4401,7 +4400,7 @@ static inline bool cfs_rq_is_decayed(str * differential update where we store the last value we propagated. This in * turn allows skipping updates if the differential is 'small'. * - * Updating tg's load_avg is necessary before update_cfs_share(). + * Updating tg's load_avg is necessary before update_cfs_group(). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { @@ -4867,7 +4866,7 @@ static void migrate_se_pelt_lag(struct s * The cfs_rq avg is the direct sum of all its entities (blocked and runna= ble) * avg. The immediate corollary is that all (fair) tasks must be attached. * - * cfs_rq->avg is used for task_h_load() and update_cfs_share() for exampl= e. + * cfs_rq->avg is used for task_h_load() and update_cfs_group() for exampl= e. * * Return: true if the load decayed or we removed load. 
* @@ -5416,12 +5415,16 @@ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { u64 vslice, vruntime =3D avg_vruntime(cfs_rq); + unsigned int nr_queued =3D cfs_rq->h_nr_queued; s64 lag =3D 0; =20 if (!se->custom_slice) se->slice =3D sysctl_sched_base_slice; vslice =3D calc_delta_fair(se->slice, se); =20 + if (flags & ENQUEUE_QUEUED) + nr_queued -=3D 1; + /* * Due to how V is constructed as the weighted average of entities, * adding tasks with positive lag, or removing tasks with negative lag @@ -5430,7 +5433,7 @@ place_entity(struct cfs_rq *cfs_rq, stru * * EEVDF: placement strategy #1 / #2 */ - if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { + if (sched_feat(PLACE_LAG) && nr_queued && se->vlag) { struct sched_entity *curr =3D cfs_rq->curr; long load; =20 @@ -5490,9 +5493,9 @@ place_entity(struct cfs_rq *cfs_rq, stru */ load =3D cfs_rq->sum_weight; if (curr && curr->on_rq) - load +=3D avg_vruntime_weight(cfs_rq, curr->load.weight); + load +=3D avg_vruntime_weight(cfs_rq, curr->h_load.weight); =20 - lag *=3D load + avg_vruntime_weight(cfs_rq, se->load.weight); + lag *=3D load + avg_vruntime_weight(cfs_rq, se->h_load.weight); if (WARN_ON_ONCE(!load)) load =3D 1; lag =3D div64_long(lag, load); @@ -5524,22 +5527,8 @@ static void check_enqueue_throttle(struc static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); =20 static void -requeue_delayed_entity(struct sched_entity *se); - -static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool curr =3D cfs_rq->curr =3D=3D se; - - /* - * If we're the current task, we must renormalise before calling - * update_curr(). - */ - if (curr) - place_entity(cfs_rq, se, flags); - - update_curr(cfs_rq); - /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. @@ -5558,13 +5547,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, st */ update_cfs_group(se); =20 - /* - * XXX now that the entity has been re-weighted, and it's lag adjusted, - * we can place the entity. - */ - if (!curr) - place_entity(cfs_rq, se, flags); - account_entity_enqueue(cfs_rq, se); =20 /* Entity has migrated, no longer consider this task hot */ @@ -5573,8 +5555,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, st =20 check_schedstat_required(); update_stats_enqueue_fair(cfs_rq, se, flags); - if (!curr) - __enqueue_entity(cfs_rq, se); se->on_rq =3D 1; =20 if (cfs_rq->nr_queued =3D=3D 1) { @@ -5592,21 +5572,19 @@ enqueue_entity(struct cfs_rq *cfs_rq, st } } =20 -static void __clear_buddies_next(struct sched_entity *se) +static void set_next_buddy(struct cfs_rq *cfs_rq, struct sched_entity *se) { - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq =3D cfs_rq_of(se); - if (cfs_rq->next !=3D se) - break; - - cfs_rq->next =3D NULL; - } + if (WARN_ON_ONCE(!se->on_rq || se->sched_delayed)) + return; + if (se_is_idle(se)) + return; + cfs_rq->next =3D se; } =20 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->next =3D=3D se) - __clear_buddies_next(se); + cfs_rq->next =3D NULL; } =20 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -5617,7 +5595,7 @@ static void set_delayed(struct sched_ent =20 /* * Delayed se of cfs_rq have no tasks queued on them. - * Do not adjust h_nr_runnable since dequeue_entities() + * Do not adjust h_nr_runnable since __dequeue_task() * will account it for blocked tasks. 
*/ if (!entity_is_task(se)) @@ -5657,36 +5635,11 @@ static inline void finish_delayed_dequeu se->vlag =3D 0; } =20 -static bool +static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool sleep =3D flags & DEQUEUE_SLEEP; int action =3D UPDATE_TG; =20 - update_curr(cfs_rq); - clear_buddies(cfs_rq, se); - - if (flags & DEQUEUE_DELAYED) { - WARN_ON_ONCE(!se->sched_delayed); - } else { - bool delay =3D sleep; - /* - * DELAY_DEQUEUE relies on spurious wakeups, special task - * states must not suffer spurious wakeups, excempt them. - */ - if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) - delay =3D false; - - WARN_ON_ONCE(delay && se->sched_delayed); - - if (sched_feat(DELAY_DEQUEUE) && delay && - !entity_eligible(cfs_rq, se)) { - update_load_avg(cfs_rq, se, 0); - set_delayed(se); - return false; - } - } - if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) action |=3D DO_DETACH; =20 @@ -5704,14 +5657,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, st =20 update_stats_dequeue_fair(cfs_rq, se, flags); =20 - update_entity_lag(cfs_rq, se); - if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { - se->deadline -=3D se->vruntime; - se->rel_deadline =3D 1; - } - - if (se !=3D cfs_rq->curr) - __dequeue_entity(cfs_rq, se); se->on_rq =3D 0; account_entity_dequeue(cfs_rq, se); =20 @@ -5720,9 +5665,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, st =20 update_cfs_group(se); =20 - if (flags & DEQUEUE_DELAYED) - finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_queued =3D=3D 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); #ifdef CONFIG_CFS_BANDWIDTH @@ -5735,15 +5677,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st } #endif } - - return true; } =20 static void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first) +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - clear_buddies(cfs_rq, se); - /* 'current' is not kept within the tree. */ if (se->on_rq) { /* @@ -5752,16 +5690,12 @@ set_next_entity(struct cfs_rq *cfs_rq, s * runqueue. */ update_stats_wait_end_fair(cfs_rq, se); - __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - - if (first) - set_protect_slice(cfs_rq, se); } =20 update_stats_curr_start(cfs_rq, se); - WARN_ON_ONCE(cfs_rq->curr); - cfs_rq->curr =3D se; + WARN_ON_ONCE(cfs_rq->h_curr); + cfs_rq->h_curr =3D se; =20 /* * Track our maximum slice length, if the CPU's load is at @@ -5781,31 +5715,6 @@ set_next_entity(struct cfs_rq *cfs_rq, s se->prev_sum_exec_runtime =3D se->sum_exec_runtime; } =20 -static int dequeue_entities(struct rq *rq, struct sched_entity *se, int fl= ags); - -/* - * Pick the next process, keeping these things in mind, in this order: - * 1) keep things fair between processes/task groups - * 2) pick the "next" process, since someone really wants that to run - * 3) pick the "last" process, for cache locality - * 4) do not run the "skip" process, if something else is available - */ -static struct sched_entity * -pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) -{ - struct sched_entity *se; - - se =3D pick_eevdf(cfs_rq); - if (se->sched_delayed) { - dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - /* - * Must not reference @se again, see __block_task(). 
- */ - return NULL; - } - return se; -} - static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); =20 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *pr= ev) @@ -5822,13 +5731,11 @@ static void put_prev_entity(struct cfs_r =20 if (prev->on_rq) { update_stats_wait_start_fair(cfs_rq, prev); - /* Put 'current' back into the tree. */ - __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } - WARN_ON_ONCE(cfs_rq->curr !=3D prev); - cfs_rq->curr =3D NULL; + WARN_ON_ONCE(cfs_rq->h_curr !=3D prev); + cfs_rq->h_curr =3D NULL; } =20 static void @@ -5986,7 +5893,7 @@ static void __account_cfs_rq_runtime(str * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) + if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->h_curr)) resched_curr(rq_of(cfs_rq)); } =20 @@ -6344,7 +6251,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cf assert_list_leaf_cfs_rq(rq); =20 /* Determine whether we need to wake up potentially idle CPU: */ - if (rq->curr =3D=3D rq->idle && rq->cfs.nr_queued) + if (rq->curr =3D=3D rq->idle && rq->cfs.h_nr_queued) resched_curr(rq); } =20 @@ -6685,7 +6592,7 @@ static void check_enqueue_throttle(struc return; =20 /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) + if (!cfs_rq->runtime_enabled || cfs_rq->h_curr) return; =20 /* ensure the group is not already throttled */ @@ -7080,7 +6987,7 @@ static void hrtick_start_fair(struct rq resched_curr(rq); return; } - delta =3D (se->load.weight * vdelta) / NICE_0_LOAD; + delta =3D (se->h_load.weight * vdelta) / NICE_0_LOAD; =20 /* * Correct for instantaneous load of other classes. @@ -7180,10 +7087,8 @@ static int choose_idle_cpu(int cpu, stru } =20 static void -requeue_delayed_entity(struct sched_entity *se) +requeue_delayed_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct cfs_rq *cfs_rq =3D cfs_rq_of(se); - /* * se->sched_delayed should imply: se->on_rq =3D=3D 1. 
* Because a delayed entity is one that is still on @@ -7195,14 +7100,14 @@ requeue_delayed_entity(struct sched_enti if (sched_feat(DELAY_ZERO)) { update_entity_lag(cfs_rq, se); if (se->vlag > 0) { - cfs_rq->nr_queued--; + cfs_rq->h_nr_queued--; if (se !=3D cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag =3D 0; place_entity(cfs_rq, se, 0); if (se !=3D cfs_rq->curr) __enqueue_entity(cfs_rq, se); - cfs_rq->nr_queued++; + cfs_rq->h_nr_queued++; } } =20 @@ -7210,6 +7115,47 @@ requeue_delayed_entity(struct sched_enti clear_delayed(se); } =20 +static unsigned long enqueue_hierarchy(struct task_struct *p, int flags) +{ + unsigned long weight =3D NICE_0_LOAD; + int task_new =3D !(flags & ENQUEUE_WAKEUP); + struct sched_entity *se =3D &p->se; + int h_nr_idle =3D task_has_idle_policy(p); + int h_nr_runnable =3D 1; + + if (task_new && se->sched_delayed) + h_nr_runnable =3D 0; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq =3D cfs_rq_of(se); + + update_curr(cfs_rq); + + if (!se->on_rq) { + enqueue_entity(cfs_rq, se, flags); + } else { + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + update_cfs_group(se); + } + + cfs_rq->h_nr_runnable +=3D h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle +=3D h_nr_idle; + + if (cfs_rq_is_idle(cfs_rq)) + h_nr_idle =3D 1; + + weight *=3D se->load.weight; + if (parent_entity(se)) + weight /=3D cfs_rq->load.weight; + + flags =3D ENQUEUE_WAKEUP; + } + + return weight / NICE_0_LOAD; +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -7218,13 +7164,12 @@ requeue_delayed_entity(struct sched_enti static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { - struct cfs_rq *cfs_rq; - struct sched_entity *se =3D &p->se; - int h_nr_idle =3D task_has_idle_policy(p); - int h_nr_runnable =3D 1; - int task_new =3D !(flags & ENQUEUE_WAKEUP); int rq_h_nr_queued =3D rq->cfs.h_nr_queued; - u64 slice =3D 0; + int task_new =3D !(flags & ENQUEUE_WAKEUP); + struct sched_entity *se =3D &p->se; + struct cfs_rq *cfs_rq =3D &rq->cfs; + unsigned long weight; + bool curr; =20 if (task_is_throttled(p) && enqueue_throttled_task(p)) return; @@ -7236,10 +7181,10 @@ enqueue_task_fair(struct rq *rq, struct * estimated utilization, before we update schedutil. */ if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED)) - util_est_enqueue(&rq->cfs, p); + util_est_enqueue(cfs_rq, p); =20 if (flags & ENQUEUE_DELAYED) { - requeue_delayed_entity(se); + requeue_delayed_entity(cfs_rq, se); return; } =20 @@ -7251,57 +7196,22 @@ enqueue_task_fair(struct rq *rq, struct if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); =20 - if (task_new && se->sched_delayed) - h_nr_runnable =3D 0; - - for_each_sched_entity(se) { - if (se->on_rq) { - if (se->sched_delayed) - requeue_delayed_entity(se); - break; - } - cfs_rq =3D cfs_rq_of(se); - - /* - * Basically set the slice of group entries to the min_slice of - * their respective cfs_rq. This ensures the group can service - * its entities in the desired time-frame. 
- */ - if (slice) { - se->slice =3D slice; - se->custom_slice =3D 1; - } - enqueue_entity(cfs_rq, se, flags); - slice =3D cfs_rq_min_slice(cfs_rq); - - cfs_rq->h_nr_runnable +=3D h_nr_runnable; - cfs_rq->h_nr_queued++; - cfs_rq->h_nr_idle +=3D h_nr_idle; - - if (cfs_rq_is_idle(cfs_rq)) - h_nr_idle =3D 1; - - flags =3D ENQUEUE_WAKEUP; - } - - for_each_sched_entity(se) { - cfs_rq =3D cfs_rq_of(se); - - update_load_avg(cfs_rq, se, UPDATE_TG); - se_update_runnable(se); - update_cfs_group(se); + /* + * XXX comment on the curr thing + */ + curr =3D (cfs_rq->curr =3D=3D se); + if (curr) + place_entity(cfs_rq, se, flags); =20 - se->slice =3D slice; - if (se !=3D cfs_rq->curr) - min_vruntime_cb_propagate(&se->run_node, NULL); - slice =3D cfs_rq_min_slice(cfs_rq); + if (se->on_rq && se->sched_delayed) + requeue_delayed_entity(cfs_rq, se); =20 - cfs_rq->h_nr_runnable +=3D h_nr_runnable; - cfs_rq->h_nr_queued++; - cfs_rq->h_nr_idle +=3D h_nr_idle; + weight =3D enqueue_hierarchy(p, flags); =20 - if (cfs_rq_is_idle(cfs_rq)) - h_nr_idle =3D 1; + if (!curr) { + reweight_eevdf(cfs_rq, se, weight, false); + place_entity(cfs_rq, se, flags | ENQUEUE_QUEUED); + __enqueue_entity(cfs_rq, se); } =20 if (!rq_h_nr_queued && rq->cfs.h_nr_queued) @@ -7332,105 +7242,107 @@ enqueue_task_fair(struct rq *rq, struct hrtick_update(rq); } =20 -/* - * Basically dequeue_task_fair(), except it can deal with dequeue_entity() - * failing half-way through and resume the dequeue later. - * - * Returns: - * -1 - dequeue delayed - * 0 - dequeue throttled - * 1 - dequeue complete - */ -static int dequeue_entities(struct rq *rq, struct sched_entity *se, int fl= ags) +static void dequeue_hierarchy(struct task_struct *p, int flags) { - bool was_sched_idle =3D sched_idle_rq(rq); + struct sched_entity *se =3D &p->se; bool task_sleep =3D flags & DEQUEUE_SLEEP; bool task_delayed =3D flags & DEQUEUE_DELAYED; bool task_throttled =3D flags & DEQUEUE_THROTTLE; - struct task_struct *p =3D NULL; - int h_nr_idle =3D 0; - int h_nr_queued =3D 0; int h_nr_runnable =3D 0; - struct cfs_rq *cfs_rq; - u64 slice =3D 0; + int h_nr_idle =3D task_has_idle_policy(p); + bool dequeue =3D true; =20 - if (entity_is_task(se)) { - p =3D task_of(se); - h_nr_queued =3D 1; - h_nr_idle =3D task_has_idle_policy(p); - if (task_sleep || task_delayed || !se->sched_delayed) - h_nr_runnable =3D 1; - } + if (task_sleep || task_delayed || !se->sched_delayed) + h_nr_runnable =3D 1; =20 for_each_sched_entity(se) { - cfs_rq =3D cfs_rq_of(se); + struct cfs_rq *cfs_rq =3D cfs_rq_of(se); =20 - if (!dequeue_entity(cfs_rq, se, flags)) { - if (p && &p->se =3D=3D se) - return -1; + update_curr(cfs_rq); =20 - slice =3D cfs_rq_min_slice(cfs_rq); - break; + if (dequeue) { + dequeue_entity(cfs_rq, se, flags); + /* Don't dequeue parent if it has other entities besides us */ + if (cfs_rq->load.weight) + dequeue =3D false; + } else { + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + update_cfs_group(se); } =20 cfs_rq->h_nr_runnable -=3D h_nr_runnable; - cfs_rq->h_nr_queued -=3D h_nr_queued; + cfs_rq->h_nr_queued--; cfs_rq->h_nr_idle -=3D h_nr_idle; =20 if (cfs_rq_is_idle(cfs_rq)) - h_nr_idle =3D h_nr_queued; + h_nr_idle =3D 1; =20 if (throttled_hierarchy(cfs_rq) && task_throttled) record_throttle_clock(cfs_rq); =20 - /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) { - slice =3D cfs_rq_min_slice(cfs_rq); - - /* Avoid re-evaluating load for this entity: */ - se =3D parent_entity(se); - /* - * Bias pick_next to pick a task from this 
cfs_rq, as - * p is sleeping when it is within its sched_slice. - */ - if (task_sleep && se) - set_next_buddy(se); - break; - } flags |=3D DEQUEUE_SLEEP; flags &=3D ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); } +} =20 - for_each_sched_entity(se) { - cfs_rq =3D cfs_rq_of(se); +/* + * The part of dequeue_task_fair() that is needed to dequeue delayed tasks. + * + * Returns: + * true - dequeued + * false - delayed + */ +static bool __dequeue_task(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_entity *se =3D &p->se; + struct cfs_rq *cfs_rq =3D &rq->cfs; + bool was_sched_idle =3D sched_idle_rq(rq); + bool task_sleep =3D flags & DEQUEUE_SLEEP; + bool task_delayed =3D flags & DEQUEUE_DELAYED; =20 - update_load_avg(cfs_rq, se, UPDATE_TG); - se_update_runnable(se); - update_cfs_group(se); + clear_buddies(cfs_rq, se); =20 - se->slice =3D slice; - if (se !=3D cfs_rq->curr) - min_vruntime_cb_propagate(&se->run_node, NULL); - slice =3D cfs_rq_min_slice(cfs_rq); + if (flags & DEQUEUE_DELAYED) { + WARN_ON_ONCE(!se->sched_delayed); + } else { + bool delay =3D task_sleep; + /* + * DELAY_DEQUEUE relies on spurious wakeups, special task + * states must not suffer spurious wakeups, exempt them. + */ + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) + delay =3D false; =20 - cfs_rq->h_nr_runnable -=3D h_nr_runnable; - cfs_rq->h_nr_queued -=3D h_nr_queued; - cfs_rq->h_nr_idle -=3D h_nr_idle; + WARN_ON_ONCE(delay && se->sched_delayed); =20 - if (cfs_rq_is_idle(cfs_rq)) - h_nr_idle =3D h_nr_queued; + if (sched_feat(DELAY_DEQUEUE) && delay && + !entity_eligible(cfs_rq, se)) { + update_load_avg(cfs_rq_of(se), se, 0); + set_delayed(se); + return false; + } + } =20 - if (throttled_hierarchy(cfs_rq) && task_throttled) - record_throttle_clock(cfs_rq); + dequeue_hierarchy(p, flags); + + update_entity_lag(cfs_rq, se); + if (sched_feat(PLACE_REL_DEADLINE) && !task_sleep) { + se->deadline -=3D se->vruntime; + se->rel_deadline =3D 1; } + if (se !=3D cfs_rq->curr) + __dequeue_entity(cfs_rq, se); =20 - sub_nr_running(rq, h_nr_queued); + sub_nr_running(rq, 1); =20 /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance =3D jiffies; =20 - if (p && task_delayed) { + if (task_delayed) { + finish_delayed_dequeue_entity(se); + WARN_ON_ONCE(!task_sleep); WARN_ON_ONCE(p->on_rq !=3D 1); =20 @@ -7442,7 +7354,7 @@ static int dequeue_entities(struct rq *r __block_task(rq, p); } =20 - return 1; + return true; } =20 /* @@ -7461,11 +7373,11 @@ static bool dequeue_task_fair(struct rq util_est_dequeue(&rq->cfs, p); =20 util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); - if (dequeue_entities(rq, &p->se, flags) < 0) + if (!__dequeue_task(rq, p, flags)) return false; =20 /* - * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). + * Must not reference @p after __dequeue_task(DEQUEUE_DELAYED).
*/ return true; } @@ -8953,19 +8865,6 @@ static void migrate_task_rq_fair(struct static void task_dead_fair(struct task_struct *p) { struct sched_entity *se =3D &p->se; - - if (se->sched_delayed) { - struct rq_flags rf; - struct rq *rq; - - rq =3D task_rq_lock(p, &rf); - if (se->sched_delayed) { - update_rq_clock(rq); - dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - } - task_rq_unlock(rq, p, &rf); - } - remove_entity_load_avg(se); } =20 @@ -8999,21 +8898,10 @@ static void set_cpus_allowed_fair(struct set_task_max_allowed_capacity(p); } =20 -static void set_next_buddy(struct sched_entity *se) -{ - for_each_sched_entity(se) { - if (WARN_ON_ONCE(!se->on_rq)) - return; - if (se_is_idle(se)) - return; - cfs_rq_of(se)->next =3D se; - } -} - enum preempt_wakeup_action { PREEMPT_WAKEUP_NONE, /* No preemption. */ PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */ - PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */ + PREEMPT_WAKEUP_PICK, /* Let pick_eevdf() decide. */ PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */ }; =20 @@ -9030,7 +8918,7 @@ set_preempt_buddy(struct cfs_rq *cfs_rq, if (cfs_rq->next && entity_before(cfs_rq->next, pse)) return false; =20 - set_next_buddy(pse); + set_next_buddy(cfs_rq, pse); return true; } =20 @@ -9083,8 +8971,8 @@ static void wakeup_preempt_fair(struct r enum preempt_wakeup_action preempt_action =3D PREEMPT_WAKEUP_PICK; struct task_struct *donor =3D rq->donor; struct sched_entity *se =3D &donor->se, *pse =3D &p->se; - struct cfs_rq *cfs_rq =3D task_cfs_rq(donor); int cse_is_idle, pse_is_idle; + struct cfs_rq *cfs_rq =3D &rq->cfs; =20 /* * XXX Getting preempted by higher class, try and find idle CPU? @@ -9120,7 +9008,6 @@ static void wakeup_preempt_fair(struct r if (!sched_feat(WAKEUP_PREEMPTION)) return; =20 - find_matching_se(&se, &pse); WARN_ON_ONCE(!pse); =20 cse_is_idle =3D se_is_idle(se); @@ -9148,8 +9035,7 @@ static void wakeup_preempt_fair(struct r if (unlikely(!normal_policy(p->policy))) return; =20 - cfs_rq =3D cfs_rq_of(se); - update_curr(cfs_rq); + update_curr_fair(rq); /* * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. @@ -9196,7 +9082,7 @@ static void wakeup_preempt_fair(struct r /* * If @p has become the most eligible task, force preemption. 
*/ - if (__pick_eevdf(cfs_rq, preempt_action !=3D PREEMPT_WAKEUP_SHORT) =3D=3D= pse) + if (pick_eevdf(cfs_rq, preempt_action !=3D PREEMPT_WAKEUP_SHORT) =3D=3D p= se) goto preempt; =20 if (sched_feat(RUN_TO_PARITY)) @@ -9214,35 +9100,34 @@ static void wakeup_preempt_fair(struct r struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) __must_hold(__rq_lockp(rq)) { + struct cfs_rq *cfs_rq =3D &rq->cfs; struct sched_entity *se; - struct cfs_rq *cfs_rq; struct task_struct *p; - bool throttled; int new_tasks; =20 again: - cfs_rq =3D &rq->cfs; - if (!cfs_rq->nr_queued) + if (!cfs_rq->h_nr_queued) goto idle; =20 - throttled =3D false; - - do { - /* Might not have done put_prev_entity() */ - if (cfs_rq->curr && cfs_rq->curr->on_rq) - update_curr(cfs_rq); - - throttled |=3D check_cfs_rq_runtime(cfs_rq); + /* Might not have done put_prev_entity() */ + if (cfs_rq->curr && cfs_rq->curr->on_rq) + update_curr(cfs_rq); =20 - se =3D pick_next_entity(rq, cfs_rq); - if (!se) - goto again; - cfs_rq =3D group_cfs_rq(se); - } while (cfs_rq); + se =3D pick_eevdf(cfs_rq, true); + if (WARN_ON_ONCE(!se)) + return NULL; =20 p =3D task_of(se); - if (unlikely(throttled)) + if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se)))) task_throttle_setup_work(p); + + if (se->sched_delayed) { + __dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + /* + * Must not reference @se again, see __block_task(). + */ + goto again; + } return p; =20 idle: @@ -9276,7 +9161,7 @@ void fair_server_init(struct rq *rq) static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, st= ruct task_struct *next) { struct sched_entity *se =3D &prev->se; - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq =3D &rq->cfs; struct sched_entity *nse =3D NULL; =20 #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9286,7 +9171,7 @@ static void put_prev_task_fair(struct rq =20 while (se) { cfs_rq =3D cfs_rq_of(se); - if (!nse || cfs_rq->curr) + if (!nse || cfs_rq->h_curr) put_prev_entity(cfs_rq, se); #ifdef CONFIG_FAIR_GROUP_SCHED if (nse) { @@ -9300,6 +9185,14 @@ static void put_prev_task_fair(struct rq #endif se =3D parent_entity(se); } + + /* Put 'current' back into the tree. */ + cfs_rq =3D &rq->cfs; + se =3D &prev->se; + WARN_ON_ONCE(cfs_rq->curr !=3D se); + cfs_rq->curr =3D NULL; + if (se->on_rq) + __enqueue_entity(cfs_rq, se); } =20 /* @@ -9308,8 +9201,8 @@ static void put_prev_task_fair(struct rq static void yield_task_fair(struct rq *rq) { struct task_struct *curr =3D rq->donor; - struct cfs_rq *cfs_rq =3D task_cfs_rq(curr); struct sched_entity *se =3D &curr->se; + struct cfs_rq *cfs_rq =3D &rq->cfs; =20 /* * Are we the only task in the tree? @@ -9350,11 +9243,11 @@ static bool yield_to_task_fair(struct rq struct sched_entity *se =3D &p->se; =20 /* !se->on_rq also covers throttled task */ - if (!se->on_rq) + if (!se->on_rq || se->sched_delayed) return false; =20 /* Tell the scheduler that we'd really like se to run next. 
*/ - set_next_buddy(se); + set_next_buddy(&task_rq(p)->cfs, se); =20 yield_task_fair(rq); =20 @@ -9680,15 +9573,10 @@ static inline long migrate_degrades_loca */ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int= dest_cpu) { - struct cfs_rq *dst_cfs_rq; + struct cfs_rq *dst_cfs_rq =3D &cpu_rq(dest_cpu)->cfs; =20 -#ifdef CONFIG_FAIR_GROUP_SCHED - dst_cfs_rq =3D task_group(p)->cfs_rq[dest_cpu]; -#else - dst_cfs_rq =3D &cpu_rq(dest_cpu)->cfs; -#endif - if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued && - !entity_eligible(task_cfs_rq(p), &p->se)) + if (sched_feat(PLACE_LAG) && dst_cfs_rq->h_nr_queued && + !entity_eligible(&task_rq(p)->cfs, &p->se)) return 1; =20 return 0; @@ -10184,7 +10072,7 @@ static void update_cfs_rq_h_load(struct while ((se =3D READ_ONCE(cfs_rq->h_load_next)) !=3D NULL) { load =3D cfs_rq->h_load; load =3D div64_ul(load * se->avg.load_avg, - cfs_rq_load_avg(cfs_rq) + 1); + cfs_rq_load_avg(cfs_rq) + 1); cfs_rq =3D group_cfs_rq(se); cfs_rq->h_load =3D load; cfs_rq->last_h_load_update =3D now; @@ -13405,7 +13293,7 @@ static inline void task_tick_core(struct * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check * if we need to give up the CPU. */ - if (rq->core->core_forceidle_count && rq->cfs.nr_queued =3D=3D 1 && + if (rq->core->core_forceidle_count && rq->cfs.h_nr_queued =3D=3D 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) resched_curr(rq); } @@ -13614,30 +13502,8 @@ bool cfs_prio_less(const struct task_str =20 WARN_ON_ONCE(task_rq(b)->core !=3D rq->core); =20 -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Find an se in the hierarchy for tasks a and b, such that the se's - * are immediate siblings. - */ - while (sea->cfs_rq->tg !=3D seb->cfs_rq->tg) { - int sea_depth =3D sea->depth; - int seb_depth =3D seb->depth; - - if (sea_depth >=3D seb_depth) - sea =3D parent_entity(sea); - if (sea_depth <=3D seb_depth) - seb =3D parent_entity(seb); - } - - se_fi_update(sea, rq->core->core_forceidle_seq, in_fi); - se_fi_update(seb, rq->core->core_forceidle_seq, in_fi); - - cfs_rqa =3D sea->cfs_rq; - cfs_rqb =3D seb->cfs_rq; -#else /* !CONFIG_FAIR_GROUP_SCHED: */ cfs_rqa =3D &task_rq(a)->cfs; cfs_rqb =3D &task_rq(b)->cfs; -#endif /* !CONFIG_FAIR_GROUP_SCHED */ =20 /* * Find delta after normalizing se's vruntime with its cfs_rq's @@ -13675,14 +13541,24 @@ static inline void task_tick_core(struct */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int qu= eued) { - struct cfs_rq *cfs_rq; struct sched_entity *se =3D &curr->se; + unsigned long weight =3D NICE_0_LOAD; + struct cfs_rq *cfs_rq; =20 for_each_sched_entity(se) { cfs_rq =3D cfs_rq_of(se); entity_tick(cfs_rq, se, queued); + + weight *=3D se->load.weight; + if (parent_entity(se)) + weight /=3D cfs_rq->load.weight; } =20 + weight /=3D NICE_0_LOAD; + + se =3D &curr->se; + reweight_eevdf(cfs_rq, se, weight, se->on_rq); + if (queued) return; =20 @@ -13718,7 +13594,7 @@ prio_changed_fair(struct rq *rq, struct if (p->prio =3D=3D oldprio) return; =20 - if (rq->cfs.nr_queued =3D=3D 1) + if (rq->cfs.h_nr_queued =3D=3D 1) return; =20 /* @@ -13847,29 +13723,43 @@ static void switched_to_fair(struct rq * } } =20 -/* - * Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. 
- */ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool = first) { struct sched_entity *se =3D &p->se; + struct cfs_rq *cfs_rq =3D &rq->cfs; + unsigned long weight =3D NICE_0_LOAD; + bool on_rq =3D se->on_rq; + + clear_buddies(cfs_rq, se); + + if (on_rq) + __dequeue_entity(cfs_rq, se); =20 for_each_sched_entity(se) { - struct cfs_rq *cfs_rq =3D cfs_rq_of(se); + cfs_rq =3D cfs_rq_of(se); =20 - if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) && - first && cfs_rq->curr) - break; + if (!IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) || + !first || !cfs_rq->h_curr) + set_next_entity(cfs_rq, se); =20 - set_next_entity(cfs_rq, se, true); /* ensure bandwidth has been allocated on our new cfs_rq */ account_cfs_rq_runtime(cfs_rq, 0); + + if (on_rq) { + weight *=3D se->load.weight; + if (parent_entity(se)) + weight /=3D cfs_rq->load.weight; + } } =20 se =3D &p->se; + cfs_rq->curr =3D se; + + if (on_rq) { + reweight_eevdf(cfs_rq, se, weight/NICE_0_LOAD, se->on_rq); + if (first) + set_protect_slice(cfs_rq, se); + } =20 if (task_on_rq_queued(p)) { /* @@ -14000,17 +13890,8 @@ void unregister_fair_sched_group(struct struct sched_entity *se =3D tg->se[cpu]; struct rq *rq =3D cpu_rq(cpu); =20 - if (se) { - if (se->sched_delayed) { - guard(rq_lock_irqsave)(rq); - if (se->sched_delayed) { - update_rq_clock(rq); - dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - } - list_del_leaf_cfs_rq(cfs_rq); - } + if (se) remove_entity_load_avg(se); - } =20 /* * Only empty task groups can be destroyed; so we can speculatively --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -707,6 +707,7 @@ struct cfs_rq { /* * CFS load tracking */ + struct sched_entity *h_curr; struct sched_avg avg; #ifndef CONFIG_64BIT u64 last_update_time_copy; @@ -2493,6 +2494,7 @@ extern const u32 sched_prio_to_wmult[40 #define ENQUEUE_MIGRATED 0x00040000 #define ENQUEUE_INITIAL 0x00080000 #define ENQUEUE_RQ_SELECTED 0x00100000 +#define ENQUEUE_QUEUED 0x00200000 =20 #define RETRY_TASK ((void *)-1UL)
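For reference, with a single level the pick in pick_task_fair() reduces
to one EEVDF selection over rq->cfs: among the entities whose vruntime
has not passed the weighted average (the eligible ones), choose the
earliest virtual deadline. The following is a self-contained toy of that
rule only; it uses a flat array instead of the kernel's augmented
rb-tree and ignores slice protection, relative entity keys and
delayed-dequeue handling (toy_se and pick_eevdf_flat are illustrative
names, not kernel code):

#include <stdio.h>

struct toy_se {
	const char *comm;
	long vruntime;        /* virtual runtime */
	long deadline;        /* virtual deadline */
	unsigned long weight; /* flat (h_load-style) weight */
};

/* Weighted average vruntime V over the queue; eligibility is v <= V. */
static long avg_vruntime_toy(const struct toy_se *q, int n)
{
	long num = 0, den = 0;
	int i;

	for (i = 0; i < n; i++) {
		num += q[i].vruntime * (long)q[i].weight;
		den += (long)q[i].weight;
	}
	return num / den;
}

/* EEVDF: earliest virtual deadline among the eligible entities. */
static const struct toy_se *pick_eevdf_flat(const struct toy_se *q, int n)
{
	long V = avg_vruntime_toy(q, n);
	const struct toy_se *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (q[i].vruntime > V)	/* not eligible */
			continue;
		if (!best || q[i].deadline < best->deadline)
			best = &q[i];
	}
	return best;
}

int main(void)
{
	struct toy_se q[] = {
		{ "T1", 100, 150, 256 },
		{ "T2",  90, 180, 256 },
		{ "T3", 120, 130, 512 },
	};
	const struct toy_se *p = pick_eevdf_flat(q, 3);

	printf("pick: %s\n", p ? p->comm : "none");	/* prints: pick: T1 */
	return 0;
}

Here T3 has the earliest deadline but is not eligible (vruntime 120 >
V = 107), so T1 wins on deadline among the eligible set. The kernel's
pick_eevdf() implements the same rule with entity keys relative to
min_vruntime and eligibility-based tree pruning, as visible in the
hunks above.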