Commit 7dc603c9028e ("sched/fair: Fix PELT integrity for new tasks")
introduced a TASK_NEW state and an unnecessary limitation that makes
changing the cgroup of a newly forked task fail.
That limitation existed because, at that time, we couldn't handle
task_change_group_fair() for a newly forked fair task that hasn't been
woken up by wake_up_new_task() yet, since that would detach the
sched_avg of a task that has never been attached.
This patch deletes this unnecessary limitation by adding a check before
doing the detach or attach in task_change_group_fair().
As a result, cpu_cgrp_subsys.can_attach() has nothing left to do for fair
tasks, so only define it under #ifdef CONFIG_RT_GROUP_SCHED.
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
include/linux/sched.h | 5 ++---
kernel/sched/core.c | 30 +++++++-----------------------
kernel/sched/fair.c | 7 +++++++
3 files changed, 16 insertions(+), 26 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 88b8817b827d..b504e55bbf7a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -95,10 +95,9 @@ struct task_group;
#define TASK_WAKEKILL 0x0100
#define TASK_WAKING 0x0200
#define TASK_NOLOAD 0x0400
-#define TASK_NEW 0x0800
/* RT specific auxilliary flag to mark RT lock waiters */
-#define TASK_RTLOCK_WAIT 0x1000
-#define TASK_STATE_MAX 0x2000
+#define TASK_RTLOCK_WAIT 0x0800
+#define TASK_STATE_MAX 0x1000
/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e74e79f783af..d5faa1700bd7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4500,11 +4500,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
__sched_fork(clone_flags, p);
/*
- * We mark the process as NEW here. This guarantees that
+ * We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->__state = TASK_NEW;
+ p->__state = TASK_RUNNING;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -4622,7 +4622,6 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- WRITE_ONCE(p->__state, TASK_RUNNING);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -10238,36 +10237,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_unregister_group(tg);
}
+#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
- int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
-#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#endif
- /*
- * Serialize against wake_up_new_task() such that if it's
- * running, we're sure to observe its full state.
- */
- raw_spin_lock_irq(&task->pi_lock);
- /*
- * Avoid calling sched_move_task() before wake_up_new_task()
- * has happened. This would lead to problems with PELT, due to
- * move wanting to detach+attach while we're not attached yet.
- */
- if (READ_ONCE(task->__state) == TASK_NEW)
- ret = -EINVAL;
- raw_spin_unlock_irq(&task->pi_lock);
-
- if (ret)
- break;
}
- return ret;
+ return 0;
}
+#endif
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
@@ -11103,7 +11085,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
+#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
+#endif
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4bc76d95a99d..90aba33a3780 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11669,6 +11669,13 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_change_group_fair(struct task_struct *p)
{
+ /*
+ * We couldn't detach or attach a forked task which
+ * hasn't been woken up by wake_up_new_task().
+ */
+ if (!p->on_rq && !se->sum_exec_runtime)
+ return;
+
detach_task_cfs_rq(p);
#ifdef CONFIG_SMP
--
2.36.1
On Mon, Aug 08, 2022 at 08:57:43PM +0800, Chengming Zhou wrote:
> commit 7dc603c9028e ("sched/fair: Fix PELT integrity for new tasks")
> introduce a TASK_NEW state and an unnessary limitation that would fail
> when changing cgroup of new forked task.
>
> Because at that time, we can't handle task_change_group_fair() for new
> forked fair task which hasn't been woken up by wake_up_new_task(),
> which will cause detach on an unattached task sched_avg problem.
>
> This patch delete this unnessary limitation by adding check before do
> detach or attach in task_change_group_fair().
>
> So cpu_cgrp_subsys.can_attach() has nothing to do for fair tasks,
> only define it in #ifdef CONFIG_RT_GROUP_SCHED.
>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
I don't know cfs enough to review this but it'd be really great to remove
this restriction.
Thanks.
--
tejun
On 2022/8/16 05:11, Tejun Heo wrote:
> On Mon, Aug 08, 2022 at 08:57:43PM +0800, Chengming Zhou wrote:
>> commit 7dc603c9028e ("sched/fair: Fix PELT integrity for new tasks")
>> introduce a TASK_NEW state and an unnessary limitation that would fail
>> when changing cgroup of new forked task.
>>
>> Because at that time, we can't handle task_change_group_fair() for new
>> forked fair task which hasn't been woken up by wake_up_new_task(),
>> which will cause detach on an unattached task sched_avg problem.
>>
>> This patch delete this unnessary limitation by adding check before do
>> detach or attach in task_change_group_fair().
>>
>> So cpu_cgrp_subsys.can_attach() has nothing to do for fair tasks,
>> only define it in #ifdef CONFIG_RT_GROUP_SCHED.
>>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>
> I don't know cfs enough to review this but it'd be really great to remove
> this restriction.
Thanks for your reply!
Friendly ping :-)
Hi Chengming,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on tip/sched/core]
[also build test ERROR on linus/master next-20220808]
[cannot apply to v5.19]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Chengming-Zhou/sched-fair-task-load-tracking-optimization-and-cleanup/20220808-210012
base: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 8648f92a66a323ed01903d2cbb248cdbe2f312d9
config: um-i386_defconfig (https://download.01.org/0day-ci/archive/20220809/202208090027.Lo1M3CoX-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-3) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/05baf61c579ea60e2b6447a012edcc5bf5f43835
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Chengming-Zhou/sched-fair-task-load-tracking-optimization-and-cleanup/20220808-210012
git checkout 05baf61c579ea60e2b6447a012edcc5bf5f43835
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=um SUBARCH=i386 SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
kernel/sched/fair.c:672:5: warning: no previous prototype for 'sched_update_scaling' [-Wmissing-prototypes]
672 | int sched_update_scaling(void)
| ^~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c: In function 'task_change_group_fair':
>> kernel/sched/fair.c:11676:27: error: 'se' undeclared (first use in this function); did you mean 'sem'?
11676 | if (!p->on_rq && !se->sum_exec_runtime)
| ^~
| sem
kernel/sched/fair.c:11676:27: note: each undeclared identifier is reported only once for each function it appears in
vim +11676 kernel/sched/fair.c
11668
11669 #ifdef CONFIG_FAIR_GROUP_SCHED
11670 static void task_change_group_fair(struct task_struct *p)
11671 {
11672 /*
11673 * We couldn't detach or attach a forked task which
11674 * hasn't been woken up by wake_up_new_task().
11675 */
11676 if (!p->on_rq && !se->sum_exec_runtime)
11677 return;
11678
11679 detach_task_cfs_rq(p);
11680
--
0-DAY CI Kernel Test Service
https://01.org/lkp
On 2022/8/8 20:57, Chengming Zhou wrote:
> commit 7dc603c9028e ("sched/fair: Fix PELT integrity for new tasks")
> introduce a TASK_NEW state and an unnessary limitation that would fail
> when changing cgroup of new forked task.
>
> Because at that time, we can't handle task_change_group_fair() for new
> forked fair task which hasn't been woken up by wake_up_new_task(),
> which will cause detach on an unattached task sched_avg problem.
>
> This patch delete this unnessary limitation by adding check before do
> detach or attach in task_change_group_fair().
>
> So cpu_cgrp_subsys.can_attach() has nothing to do for fair tasks,
> only define it in #ifdef CONFIG_RT_GROUP_SCHED.
>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> ---
> include/linux/sched.h | 5 ++---
> kernel/sched/core.c | 30 +++++++-----------------------
> kernel/sched/fair.c | 7 +++++++
> 3 files changed, 16 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 88b8817b827d..b504e55bbf7a 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -95,10 +95,9 @@ struct task_group;
> #define TASK_WAKEKILL 0x0100
> #define TASK_WAKING 0x0200
> #define TASK_NOLOAD 0x0400
> -#define TASK_NEW 0x0800
> /* RT specific auxilliary flag to mark RT lock waiters */
> -#define TASK_RTLOCK_WAIT 0x1000
> -#define TASK_STATE_MAX 0x2000
> +#define TASK_RTLOCK_WAIT 0x0800
> +#define TASK_STATE_MAX 0x1000
>
> /* Convenience macros for the sake of set_current_state: */
> #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e74e79f783af..d5faa1700bd7 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4500,11 +4500,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
> {
> __sched_fork(clone_flags, p);
> /*
> - * We mark the process as NEW here. This guarantees that
> + * We mark the process as running here. This guarantees that
> * nobody will actually run it, and a signal or other external
> * event cannot wake it up and insert it on the runqueue either.
> */
> - p->__state = TASK_NEW;
> + p->__state = TASK_RUNNING;
>
> /*
> * Make sure we do not leak PI boosting priority to the child.
> @@ -4622,7 +4622,6 @@ void wake_up_new_task(struct task_struct *p)
> struct rq *rq;
>
> raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
> - WRITE_ONCE(p->__state, TASK_RUNNING);
> #ifdef CONFIG_SMP
> /*
> * Fork balancing, do it here and not earlier because:
> @@ -10238,36 +10237,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
> sched_unregister_group(tg);
> }
>
> +#ifdef CONFIG_RT_GROUP_SCHED
> static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
> {
> struct task_struct *task;
> struct cgroup_subsys_state *css;
> - int ret = 0;
>
> cgroup_taskset_for_each(task, css, tset) {
> -#ifdef CONFIG_RT_GROUP_SCHED
> if (!sched_rt_can_attach(css_tg(css), task))
> return -EINVAL;
> -#endif
> - /*
> - * Serialize against wake_up_new_task() such that if it's
> - * running, we're sure to observe its full state.
> - */
> - raw_spin_lock_irq(&task->pi_lock);
> - /*
> - * Avoid calling sched_move_task() before wake_up_new_task()
> - * has happened. This would lead to problems with PELT, due to
> - * move wanting to detach+attach while we're not attached yet.
> - */
> - if (READ_ONCE(task->__state) == TASK_NEW)
> - ret = -EINVAL;
> - raw_spin_unlock_irq(&task->pi_lock);
> -
> - if (ret)
> - break;
> }
> - return ret;
> + return 0;
> }
> +#endif
>
> static void cpu_cgroup_attach(struct cgroup_taskset *tset)
> {
> @@ -11103,7 +11085,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
> .css_released = cpu_cgroup_css_released,
> .css_free = cpu_cgroup_css_free,
> .css_extra_stat_show = cpu_extra_stat_show,
> +#ifdef CONFIG_RT_GROUP_SCHED
> .can_attach = cpu_cgroup_can_attach,
> +#endif
> .attach = cpu_cgroup_attach,
> .legacy_cftypes = cpu_legacy_files,
> .dfl_cftypes = cpu_files,
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4bc76d95a99d..90aba33a3780 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -11669,6 +11669,13 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
> #ifdef CONFIG_FAIR_GROUP_SCHED
> static void task_change_group_fair(struct task_struct *p)
> {
> + /*
> + * We couldn't detach or attach a forked task which
> + * hasn't been woken up by wake_up_new_task().
> + */
> + if (!p->on_rq && !se->sum_exec_runtime)
This should be: if (!p->on_rq && !p->se.sum_exec_runtime)
Sorry for my carelessness...
> + return;
> +
> detach_task_cfs_rq(p);
>
> #ifdef CONFIG_SMP
© 2016 - 2026 Red Hat, Inc.