[PATCH v2] sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy

Tianchen Ding posted 1 patch 1 year, 5 months ago
kernel/sched/fair.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
[PATCH v2] sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy
Posted by Tianchen Ding 1 year, 5 months ago
Consider the following cgroup:

                       root
                        |
             ------------------------
             |                      |
       normal_cgroup            idle_cgroup
             |                      |
   SCHED_IDLE task_A           SCHED_NORMAL task_B

According to the cgroup hierarchy, A should preempt B. But current
check_preempt_wakeup_fair() treats cgroup se and task separately, so B
will preempt A unexpectedly.
Unify the wakeup logic by {c,p}se_is_idle only. This makes SCHED_IDLE of
a task a relative policy that is effective only within its own cgroup,
similar to the behavior of NICE.

Also fix se_is_idle() definition when !CONFIG_FAIR_GROUP_SCHED.

Fixes: 304000390f88 ("sched: Cgroup SCHED_IDLE support")
Signed-off-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Reviewed-by: Josh Don <joshdon@google.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
---
v2:
Use entity_is_task() to check whether pse is a task.
Improve comments and commit log.

v1: https://lore.kernel.org/all/20240624073900.10343-1-dtcccc@linux.alibaba.com/
---
 kernel/sched/fair.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 41b58387023d..f0b038de99ce 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -511,7 +511,7 @@ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
 
 static int se_is_idle(struct sched_entity *se)
 {
-	return 0;
+	return task_has_idle_policy(task_of(se));
 }
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
@@ -8382,16 +8382,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 	if (test_tsk_need_resched(curr))
 		return;
 
-	/* Idle tasks are by definition preempted by non-idle tasks. */
-	if (unlikely(task_has_idle_policy(curr)) &&
-	    likely(!task_has_idle_policy(p)))
-		goto preempt;
-
-	/*
-	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
-	 * is driven by the tick):
-	 */
-	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
+	if (!sched_feat(WAKEUP_PREEMPTION))
 		return;
 
 	find_matching_se(&se, &pse);
@@ -8401,7 +8392,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 	pse_is_idle = se_is_idle(pse);
 
 	/*
-	 * Preempt an idle group in favor of a non-idle group (and don't preempt
+	 * Preempt an idle entity in favor of a non-idle entity (and don't preempt
 	 * in the inverse case).
 	 */
 	if (cse_is_idle && !pse_is_idle)
@@ -8409,6 +8400,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 	if (cse_is_idle != pse_is_idle)
 		return;
 
+	/*
+	 * Batch tasks do not preempt non-idle tasks (their preemption
+	 * is driven by the tick).
+	 * We've done the check about "only one of the entities is idle",
+	 * so cse must be non-idle if p is a batch task.
+	 */
+	if (unlikely(entity_is_task(pse) && p->policy == SCHED_BATCH))
+		return;
+
 	cfs_rq = cfs_rq_of(se);
 	update_curr(cfs_rq);
 
-- 
2.39.3
Re: [PATCH v2] sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy
Posted by Peter Zijlstra 1 year, 5 months ago
On Wed, Jun 26, 2024 at 10:35:05AM +0800, Tianchen Ding wrote:
> Consider the following cgroup:
> 
>                        root
>                         |
>              ------------------------
>              |                      |
>        normal_cgroup            idle_cgroup
>              |                      |
>    SCHED_IDLE task_A           SCHED_NORMAL task_B
> 
> According to the cgroup hierarchy, A should preempt B. But current
> check_preempt_wakeup_fair() treats cgroup se and task separately, so B
> will preempt A unexpectedly.
> Unify the wakeup logic by {c,p}se_is_idle only. This makes SCHED_IDLE of
> a task a relative policy that is effective only within its own cgroup,
> similar to the behavior of NICE.
> 
> Also fix se_is_idle() definition when !CONFIG_FAIR_GROUP_SCHED.
> 
> Fixes: 304000390f88 ("sched: Cgroup SCHED_IDLE support")
> Signed-off-by: Tianchen Ding <dtcccc@linux.alibaba.com>
> Reviewed-by: Josh Don <joshdon@google.com>
> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> v2:
> Use entity_is_task() to check whether pse is a task.
> Improve comments and commit log.
> 
> v1: https://lore.kernel.org/all/20240624073900.10343-1-dtcccc@linux.alibaba.com/
> ---
>  kernel/sched/fair.c | 24 ++++++++++++------------
>  1 file changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 41b58387023d..f0b038de99ce 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -511,7 +511,7 @@ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
>  
>  static int se_is_idle(struct sched_entity *se)
>  {
> -	return 0;
> +	return task_has_idle_policy(task_of(se));
>  }
>  
>  #endif	/* CONFIG_FAIR_GROUP_SCHED */
> @@ -8382,16 +8382,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	if (test_tsk_need_resched(curr))
>  		return;
>  
> -	/* Idle tasks are by definition preempted by non-idle tasks. */
> -	if (unlikely(task_has_idle_policy(curr)) &&
> -	    likely(!task_has_idle_policy(p)))
> -		goto preempt;
> -
> -	/*
> -	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
> -	 * is driven by the tick):
> -	 */
> -	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
> +	if (!sched_feat(WAKEUP_PREEMPTION))
>  		return;
>  
>  	find_matching_se(&se, &pse);
> @@ -8401,7 +8392,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	pse_is_idle = se_is_idle(pse);
>  
>  	/*
> -	 * Preempt an idle group in favor of a non-idle group (and don't preempt
> +	 * Preempt an idle entity in favor of a non-idle entity (and don't preempt
>  	 * in the inverse case).
>  	 */
>  	if (cse_is_idle && !pse_is_idle)
> @@ -8409,6 +8400,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	if (cse_is_idle != pse_is_idle)
>  		return;
>  
> +	/*
> +	 * Batch tasks do not preempt non-idle tasks (their preemption
> +	 * is driven by the tick).
> +	 * We've done the check about "only one of the entities is idle",
> +	 * so cse must be non-idle if p is a batch task.
> +	 */
> +	if (unlikely(entity_is_task(pse) && p->policy == SCHED_BATCH))
> +		return;

I'm not convinced this condition is right. The current behaviour of
SCHED_BATCH doesn't care about pse, only p.

That is, if p is SCHED_BATCH it will not preempt -- except an
SCHED_IDLE.

So I'm tempted to delete this first part of your condition and have it
be:

	if (p->policy == SCHED_BATCH)
		return;

That is, suppose you have:

                        root
                         |
              ------------------------
              |                      |
        normal_cgroup          normal_cgroup
              |                      |
        SCHED_BATCH task_A     SCHED_BATCH task_B

Then the preemption crud will end up comparing the groups to one another
and still allow A to preempt B -- except we explicitly do not want this.

The 'problem' is that the whole BATCH thing isn't cgroup aware of course,
but I'm not sure we want to go fix that -- esp. not in this patch.

Hmm?
Re: [PATCH v2] sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy
Posted by Vincent Guittot 1 year, 5 months ago
On Mon, 8 Jul 2024 at 14:02, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Jun 26, 2024 at 10:35:05AM +0800, Tianchen Ding wrote:
> > Consider the following cgroup:
> >
> >                        root
> >                         |
> >              ------------------------
> >              |                      |
> >        normal_cgroup            idle_cgroup
> >              |                      |
> >    SCHED_IDLE task_A           SCHED_NORMAL task_B
> >
> > According to the cgroup hierarchy, A should preempt B. But current
> > check_preempt_wakeup_fair() treats cgroup se and task separately, so B
> > will preempt A unexpectedly.
> > Unify the wakeup logic by {c,p}se_is_idle only. This makes SCHED_IDLE of
> > a task a relative policy that is effective only within its own cgroup,
> > similar to the behavior of NICE.
> >
> > Also fix se_is_idle() definition when !CONFIG_FAIR_GROUP_SCHED.
> >
> > Fixes: 304000390f88 ("sched: Cgroup SCHED_IDLE support")
> > Signed-off-by: Tianchen Ding <dtcccc@linux.alibaba.com>
> > Reviewed-by: Josh Don <joshdon@google.com>
> > Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> > ---
> > v2:
> > Use entity_is_task() to check whether pse is a task.
> > Improve comments and commit log.
> >
> > v1: https://lore.kernel.org/all/20240624073900.10343-1-dtcccc@linux.alibaba.com/
> > ---
> >  kernel/sched/fair.c | 24 ++++++++++++------------
> >  1 file changed, 12 insertions(+), 12 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 41b58387023d..f0b038de99ce 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -511,7 +511,7 @@ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
> >
> >  static int se_is_idle(struct sched_entity *se)
> >  {
> > -     return 0;
> > +     return task_has_idle_policy(task_of(se));
> >  }
> >
> >  #endif       /* CONFIG_FAIR_GROUP_SCHED */
> > @@ -8382,16 +8382,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> >       if (test_tsk_need_resched(curr))
> >               return;
> >
> > -     /* Idle tasks are by definition preempted by non-idle tasks. */
> > -     if (unlikely(task_has_idle_policy(curr)) &&
> > -         likely(!task_has_idle_policy(p)))
> > -             goto preempt;
> > -
> > -     /*
> > -      * Batch and idle tasks do not preempt non-idle tasks (their preemption
> > -      * is driven by the tick):
> > -      */
> > -     if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
> > +     if (!sched_feat(WAKEUP_PREEMPTION))
> >               return;
> >
> >       find_matching_se(&se, &pse);
> > @@ -8401,7 +8392,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> >       pse_is_idle = se_is_idle(pse);
> >
> >       /*
> > -      * Preempt an idle group in favor of a non-idle group (and don't preempt
> > +      * Preempt an idle entity in favor of a non-idle entity (and don't preempt
> >        * in the inverse case).
> >        */
> >       if (cse_is_idle && !pse_is_idle)
> > @@ -8409,6 +8400,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> >       if (cse_is_idle != pse_is_idle)
> >               return;
> >
> > +     /*
> > +      * Batch tasks do not preempt non-idle tasks (their preemption
> > +      * is driven by the tick).
> > +      * We've done the check about "only one of the entities is idle",
> > +      * so cse must be non-idle if p is a batch task.
> > +      */
> > +     if (unlikely(entity_is_task(pse) && p->policy == SCHED_BATCH))
> > +             return;
>
> I'm not convinced this condition is right. The current behaviour of
> SCHED_BATCH doesn't care about pse, only p.
>
> That is, if p is SCHED_BATCH it will not preempt -- except an
> SCHED_IDLE.
>
> So I'm tempted to delete this first part of your condition and have it
> be:
>
>         if (p->policy == SCHED_BATCH)
>                 return;
>
> That is, suppose you have:
>
>                         root
>                          |
>               ------------------------
>               |                      |
>         normal_cgroup          normal_cgroup
>               |                      |
>         SCHED_BATCH task_A     SCHED_BATCH task_B
>
> Then the preemption crud will end up comparing the groups to one another
> and still allow A to preempt B -- except we explicitly do not want this.
>
> The 'problem' is that the whole BATCH thing isn't cgroup aware of course,
> but I'm not sure we want to go fix that -- esp. not in this patch.
>
> Hmm?

Good question, but do we want to make SCHED_BATCH tasks behave
differently than SCHED_IDLE tasks in a group in this case ?

With this patch, we don't want task_B to preempt task_A in this case,
but task_A will preempt task_B even though task_A is SCHED_IDLE

                         root
                          |
               ------------------------
               |                      |
         normal_cgroup          idle_cgroup
               |                      |
         SCHED_IDLE task_A     SCHED_NORMAL task_B

As we only look at the common level of hierarchy between the tasks,
the below will make A preempt B whereas both are SCHED_IDLE

                         root
                          |
               ------------------------
               |                      |
         normal_cgroup          normal_cgroup
               |                      |
         SCHED_IDLE task_A     SCHED_IDLE task_B
Re: [PATCH v2] sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy
Posted by Peter Zijlstra 1 year, 5 months ago
On Mon, Jul 08, 2024 at 02:47:34PM +0200, Vincent Guittot wrote:
> On Mon, 8 Jul 2024 at 14:02, Peter Zijlstra <peterz@infradead.org> wrote:

> > > @@ -8409,6 +8400,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> > >       if (cse_is_idle != pse_is_idle)
> > >               return;
> > >
> > > +     /*
> > > +      * Batch tasks do not preempt non-idle tasks (their preemption
> > > +      * is driven by the tick).
> > > +      * We've done the check about "only one of the entities is idle",
> > > +      * so cse must be non-idle if p is a batch task.
> > > +      */
> > > +     if (unlikely(entity_is_task(pse) && p->policy == SCHED_BATCH))
> > > +             return;
> >
> > I'm not convinced this condition is right. The current behaviour of
> > SCHED_BATCH doesn't care about pse, only p.
> >
> > That is, if p is SCHED_BATCH it will not preempt -- except an
> > SCHED_IDLE.
> >
> > So I'm tempted to delete this first part of your condition and have it
> > be:
> >
> >         if (p->policy == SCHED_BATCH)
> >                 return;
> >
> > That is, suppose you have:
> >
> >                         root
> >                          |
> >               ------------------------
> >               |                      |
> >         normal_cgroup          normal_cgroup
> >               |                      |
> >         SCHED_BATCH task_A     SCHED_BATCH task_B
> >
> > Then the preemption crud will end up comparing the groups to one another
> > and still allow A to preempt B -- except we explicitly do not want this.
> >
> > The 'problem' is that the whole BATCH thing isn't cgroup aware of course,
> > but I'm not sure we want to go fix that -- esp. not in this patch.
> >
> > Hmm?
> 
> Good question, but do we want to make SCHED_BATCH tasks behave
> differently than SCHED_IDLE tasks in a group in this case ?

I suspect we'll have to. People added the idle-cgroup thing, but never
did the same for batch. With the result that they're now fundamentally
different.

> With this patch, we don't want task_B to preempt task_A in this case,
> but task_A will preempt task_B even though task_A is SCHED_IDLE
> 
>                          root
>                           |
>                ------------------------
>                |                      |
>          normal_cgroup          idle_cgroup
>                |                      |
>          SCHED_IDLE task_A     SCHED_NORMAL task_B
> 
> As we only look at the common level of hierarchy between the tasks,
> the below will make A preempt B whereas both are SCHED_IDLE
> 
>                          root
>                           |
>                ------------------------
>                |                      |
>          normal_cgroup          normal_cgroup
>                |                      |
>          SCHED_IDLE task_A     SCHED_IDLE task_B

So we can make the last test be:

	if (unlikely(p->policy != SCHED_NORMAL))
		return;

Much like the original condition removed here:

-       if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
+       if (!sched_feat(WAKEUP_PREEMPTION))

Except now after all that cgroup nonsense. Then the OP case will preempt
because normal_cgroup vs idle_cgroup, my BATCH example will not preempt,
because BATCH != NORMAL, your IDLE example will not preempt either,
because IDLE != NORMAL.
Re: [PATCH v2] sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy
Posted by Josh Don 1 year, 5 months ago
On Mon, Jul 8, 2024 at 7:28 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Jul 08, 2024 at 02:47:34PM +0200, Vincent Guittot wrote:
> > On Mon, 8 Jul 2024 at 14:02, Peter Zijlstra <peterz@infradead.org> wrote:
<snip>
> > > The 'problem' is that the whole BATCH thing isn't cgroup aware of course,
> > > but I'm not sure we want to go fix that -- esp. not in this patch.
> > >
> > > Hmm?
> >
> > Good question, but do we want to make SCHED_BATCH tasks behave
> > differently than SCHED_IDLE tasks in a group in this case ?
>
> I suspect we'll have to. People added the idle-cgroup thing, but never
> did the same for batch. With the result that they're now fundamentally
> different.

It isn't clear to me that cgroup batch behavior is really a useful
thing that is worth adding. After the EEVDF changes, the only real
difference between normal and batch is that batch don't preempt normal
on wakeup. Contrast that to idle, where we have a pretty meaningful
difference from sched_normal, especially with sched_idle_cpu feeding
into wakeup placement and load balancing.

Happy to be proven wrong if there's a use case for batch wherein the
wakeup preempt behavior is useful at the group level as well. Honestly
it feels like it would make sense to revisit the cgroup batch question
when/if additional behaviors were added to further differentiate
batch. For example, maybe a batch cgroup hierarchy could internally
use longer slices and have a slower round-robin preemption rate
amongst its processes. The wakeup bit alone is limited, and the
supposed target workload of low-priority cpu intensive threads are
unlikely to have many wakeup edges anyway.
Re: [PATCH v2] sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy
Posted by Tianchen Ding 1 year, 5 months ago
On 2024/7/8 22:28, Peter Zijlstra wrote:
> On Mon, Jul 08, 2024 at 02:47:34PM +0200, Vincent Guittot wrote:
>> On Mon, 8 Jul 2024 at 14:02, Peter Zijlstra <peterz@infradead.org> wrote:
> 
>>>> @@ -8409,6 +8400,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>>>>        if (cse_is_idle != pse_is_idle)
>>>>                return;
>>>>
>>>> +     /*
>>>> +      * Batch tasks do not preempt non-idle tasks (their preemption
>>>> +      * is driven by the tick).
>>>> +      * We've done the check about "only one of the entities is idle",
>>>> +      * so cse must be non-idle if p is a batch task.
>>>> +      */
>>>> +     if (unlikely(entity_is_task(pse) && p->policy == SCHED_BATCH))
>>>> +             return;
>>>
>>> I'm not convinced this condition is right. The current behaviour of
>>> SCHED_BATCH doesn't care about pse, only p.
>>>
>>> That is, if p is SCHED_BATCH it will not preempt -- except an
>>> SCHED_IDLE.
>>>
>>> So I'm tempted to delete this first part of your condition and have it
>>> be:
>>>
>>>          if (p->policy == SCHED_BATCH)
>>>                  return;
>>>
>>> That is, suppose you have:
>>>
>>>                          root
>>>                           |
>>>                ------------------------
>>>                |                      |
>>>          normal_cgroup          normal_cgroup
>>>                |                      |
>>>          SCHED_BATCH task_A     SCHED_BATCH task_B
>>>
>>> Then the preemption crud will end up comparing the groups to one another
>>> and still allow A to preempt B -- except we explicitly do not want this.
>>>
>>> The 'problem' is that the whole BATCH thing isn't cgroup aware of course,

Agree.

>>> but I'm not sure we want to go fix that -- esp. not in this patch.
>>>
>>> Hmm?
>>
>> Good question, but do we want to make SCHED_BATCH tasks behave
>> differently than SCHED_IDLE tasks in a group in this case ?
> 
> I suspect we'll have to. People added the idle-cgroup thing, but never
> did the same for batch. With the result that they're now fundamentally
> different.
> 
>> With this patch, we don't want task_B to preempt task_A in this case,
>> but task_A will preempt task_B even though task_A is SCHED_IDLE
>>
>>                           root
>>                            |
>>                 ------------------------
>>                 |                      |
>>           normal_cgroup          idle_cgroup
>>                 |                      |
>>           SCHED_IDLE task_A     SCHED_NORMAL task_B
>>
>> As we only look at the common level of hierarchy between the tasks,
>> the below will make A preempt B whereas both are SCHED_IDLE
>>
>>                           root
>>                            |
>>                 ------------------------
>>                 |                      |
>>           normal_cgroup          normal_cgroup
>>                 |                      |
>>           SCHED_IDLE task_A     SCHED_IDLE task_B
> 
> So we can make the last test be:
> 
> 	if (unlikely(p->policy != SCHED_NORMAL))
> 		return;
> 
> Much like the original condition removed here:
> 
> -       if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
> +       if (!sched_feat(WAKEUP_PREEMPTION))
> 
> Except now after all that cgroup nonsense. Then the OP case will preempt
> because normal_cgroup vs idle_cgroup, my BATCH example will not preempt,
> because BATCH != NORMAL, your IDLE example will not preempt either,
> because IDLE != NORMAL.
> 

So there may be 2 interesting issues. Let's take the example of:

                            root
                             |
                  ------------------------
                  |                      |
            normal_cgroup          normal_cgroup
                  |                      |
            SCHED_IDLE task_A     SCHED_IDLE task_B


The first issue is the scope of task policy. My original proposal is to only 
compare the common level of hierarchy (pse and cse), and ignore all their 
children, so that the scope of task policy is limited to its own 
cgroup, and A may preempt B.

However, if we simply check

         if (p->policy == SCHED_BATCH)
                 return;

or

  	if (p->policy != SCHED_NORMAL)
  		return;

Then the scope of task policy will "overflow" and take effect "across" the 
cgroups. So A will not preempt B.

I don't know which one is better, and both are reasonable and acceptable for me.


The second issue is about SCHED_BATCH. Now let's make task_A be SCHED_BATCH.

In the vanilla kernel, A will preempt B because "Idle tasks are by definition 
preempted by non-idle tasks."

However, with my original patch, A may preempt B (depending on pse and cse).

With your modification, A will not preempt B.

Again, which one should be preferred?

Thanks.
Re: [PATCH v2] sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy
Posted by Tianchen Ding 1 year, 5 months ago
gentle ping...

On 2024/6/26 10:35, Tianchen Ding wrote:
> Consider the following cgroup:
> 
>                         root
>                          |
>               ------------------------
>               |                      |
>         normal_cgroup            idle_cgroup
>               |                      |
>     SCHED_IDLE task_A           SCHED_NORMAL task_B
> 
> According to the cgroup hierarchy, A should preempt B. But current
> check_preempt_wakeup_fair() treats cgroup se and task separately, so B
> will preempt A unexpectedly.
> Unify the wakeup logic by {c,p}se_is_idle only. This makes SCHED_IDLE of
> a task a relative policy that is effective only within its own cgroup,
> similar to the behavior of NICE.
> 
> Also fix se_is_idle() definition when !CONFIG_FAIR_GROUP_SCHED.
> 
> Fixes: 304000390f88 ("sched: Cgroup SCHED_IDLE support")
> Signed-off-by: Tianchen Ding <dtcccc@linux.alibaba.com>
> Reviewed-by: Josh Don <joshdon@google.com>
> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> v2:
> Use entity_is_task() to check whether pse is a task.
> Improve comments and commit log.
> 
> v1: https://lore.kernel.org/all/20240624073900.10343-1-dtcccc@linux.alibaba.com/
> ---
>   kernel/sched/fair.c | 24 ++++++++++++------------
>   1 file changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 41b58387023d..f0b038de99ce 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -511,7 +511,7 @@ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
>   
>   static int se_is_idle(struct sched_entity *se)
>   {
> -	return 0;
> +	return task_has_idle_policy(task_of(se));
>   }
>   
>   #endif	/* CONFIG_FAIR_GROUP_SCHED */
> @@ -8382,16 +8382,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>   	if (test_tsk_need_resched(curr))
>   		return;
>   
> -	/* Idle tasks are by definition preempted by non-idle tasks. */
> -	if (unlikely(task_has_idle_policy(curr)) &&
> -	    likely(!task_has_idle_policy(p)))
> -		goto preempt;
> -
> -	/*
> -	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
> -	 * is driven by the tick):
> -	 */
> -	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
> +	if (!sched_feat(WAKEUP_PREEMPTION))
>   		return;
>   
>   	find_matching_se(&se, &pse);
> @@ -8401,7 +8392,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>   	pse_is_idle = se_is_idle(pse);
>   
>   	/*
> -	 * Preempt an idle group in favor of a non-idle group (and don't preempt
> +	 * Preempt an idle entity in favor of a non-idle entity (and don't preempt
>   	 * in the inverse case).
>   	 */
>   	if (cse_is_idle && !pse_is_idle)
> @@ -8409,6 +8400,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>   	if (cse_is_idle != pse_is_idle)
>   		return;
>   
> +	/*
> +	 * Batch tasks do not preempt non-idle tasks (their preemption
> +	 * is driven by the tick).
> +	 * We've done the check about "only one of the entities is idle",
> +	 * so cse must be non-idle if p is a batch task.
> +	 */
> +	if (unlikely(entity_is_task(pse) && p->policy == SCHED_BATCH))
> +		return;
> +
>   	cfs_rq = cfs_rq_of(se);
>   	update_curr(cfs_rq);
>
[tip: sched/core] sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy
Posted by tip-bot2 for Tianchen Ding 1 year, 4 months ago
The following commit has been merged into the sched/core branch of tip:

Commit-ID:     faa42d29419def58d3c3e5b14ad4037f0af3b496
Gitweb:        https://git.kernel.org/tip/faa42d29419def58d3c3e5b14ad4037f0af3b496
Author:        Tianchen Ding <dtcccc@linux.alibaba.com>
AuthorDate:    Wed, 26 Jun 2024 10:35:05 +08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 29 Jul 2024 12:22:35 +02:00

sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy

Consider the following cgroup:

                       root
                        |
             ------------------------
             |                      |
       normal_cgroup            idle_cgroup
             |                      |
   SCHED_IDLE task_A           SCHED_NORMAL task_B

According to the cgroup hierarchy, A should preempt B. But current
check_preempt_wakeup_fair() treats cgroup se and task separately, so B
will preempt A unexpectedly.
Unify the wakeup logic by {c,p}se_is_idle only. This makes SCHED_IDLE of
a task a relative policy that is effective only within its own cgroup,
similar to the behavior of NICE.

Also fix se_is_idle() definition when !CONFIG_FAIR_GROUP_SCHED.

Fixes: 304000390f88 ("sched: Cgroup SCHED_IDLE support")
Signed-off-by: Tianchen Ding <dtcccc@linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josh Don <joshdon@google.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20240626023505.1332596-1-dtcccc@linux.alibaba.com
---
 kernel/sched/fair.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 02694fc..99c80ab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -511,7 +511,7 @@ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
 
 static int se_is_idle(struct sched_entity *se)
 {
-	return 0;
+	return task_has_idle_policy(task_of(se));
 }
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
@@ -8381,16 +8381,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int 
 	if (test_tsk_need_resched(curr))
 		return;
 
-	/* Idle tasks are by definition preempted by non-idle tasks. */
-	if (unlikely(task_has_idle_policy(curr)) &&
-	    likely(!task_has_idle_policy(p)))
-		goto preempt;
-
-	/*
-	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
-	 * is driven by the tick):
-	 */
-	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
+	if (!sched_feat(WAKEUP_PREEMPTION))
 		return;
 
 	find_matching_se(&se, &pse);
@@ -8400,7 +8391,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int 
 	pse_is_idle = se_is_idle(pse);
 
 	/*
-	 * Preempt an idle group in favor of a non-idle group (and don't preempt
+	 * Preempt an idle entity in favor of a non-idle entity (and don't preempt
 	 * in the inverse case).
 	 */
 	if (cse_is_idle && !pse_is_idle)
@@ -8408,9 +8399,14 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int 
 	if (cse_is_idle != pse_is_idle)
 		return;
 
+	/*
+	 * BATCH and IDLE tasks do not preempt others.
+	 */
+	if (unlikely(p->policy != SCHED_NORMAL))
+		return;
+
 	cfs_rq = cfs_rq_of(se);
 	update_curr(cfs_rq);
-
 	/*
 	 * XXX pick_eevdf(cfs_rq) != se ?
 	 */