[PATCH v2] sched/deadline: only set free_cpus for online runqueues

Doug Berger posted 1 patch 1 month, 2 weeks ago
kernel/sched/cpudeadline.c | 34 +++++++++-------------------------
kernel/sched/cpudeadline.h |  4 +---
kernel/sched/deadline.c    |  8 ++++----
3 files changed, 14 insertions(+), 32 deletions(-)
[PATCH v2] sched/deadline: only set free_cpus for online runqueues
Posted by Doug Berger 1 month, 2 weeks ago
Commit 16b269436b72 ("sched/deadline: Modify cpudl::free_cpus
to reflect rd->online") introduced the cpudl_set/clear_freecpu
functions to allow the cpu_dl::free_cpus mask to be manipulated
by the deadline scheduler class rq_on/offline callbacks so the
mask would also reflect this state.

Commit 9659e1eeee28 ("sched/deadline: Remove cpu_active_mask
from cpudl_find()") removed the check of the cpu_active_mask to
save some processing on the premise that the cpudl::free_cpus
mask already reflected the runqueue online state.

Unfortunately, there are cases where it is possible for the
cpudl_clear function to set the free_cpus bit for a CPU when the
deadline runqueue is offline. When this occurs while a CPU is
connected to the default root domain the flag may retain the bad
state after the CPU has been unplugged. Later, a different CPU
that is transitioning through the default root domain may push a
deadline task to the powered down CPU when cpudl_find sees its
free_cpus bit is set. If this happens the task will not have the
opportunity to run.

One example is outlined here:
https://lore.kernel.org/lkml/20250110233010.2339521-1-opendmb@gmail.com

Another occurs when the last deadline task is migrated from a
CPU that has an offlined runqueue. The dequeue_task member of
the deadline scheduler class will eventually call cpudl_clear
and set the free_cpus bit for the CPU.

This commit modifies the cpudl_clear function to be aware of the
online state of the deadline runqueue so that the free_cpus mask
can be updated appropriately.

It is no longer necessary to manage the mask outside of the
cpudl_set/clear functions so the cpudl_set/clear_freecpu
functions are removed. In addition, since the free_cpus mask is
now only updated under the cpudl lock the code was changed to
use the non-atomic __cpumask functions.

Signed-off-by: Doug Berger <opendmb@gmail.com>
---
Change in v2:
  Inverted the logic of the online check in cpudl_clear.

 kernel/sched/cpudeadline.c | 34 +++++++++-------------------------
 kernel/sched/cpudeadline.h |  4 +---
 kernel/sched/deadline.c    |  8 ++++----
 3 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index cdd740b3f774..37b572cc8aca 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -166,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
  * cpudl_clear - remove a CPU from the cpudl max-heap
  * @cp: the cpudl max-heap context
  * @cpu: the target CPU
+ * @online: the online state of the deadline runqueue
  *
  * Notes: assumes cpu_rq(cpu)->lock is locked
  *
  * Returns: (void)
  */
-void cpudl_clear(struct cpudl *cp, int cpu)
+void cpudl_clear(struct cpudl *cp, int cpu, bool online)
 {
 	int old_idx, new_cpu;
 	unsigned long flags;
@@ -184,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu)
 	if (old_idx == IDX_INVALID) {
 		/*
 		 * Nothing to remove if old_idx was invalid.
-		 * This could happen if a rq_offline_dl is
+		 * This could happen if rq_online_dl or rq_offline_dl is
 		 * called for a CPU without -dl tasks running.
 		 */
 	} else {
@@ -195,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu)
 		cp->elements[new_cpu].idx = old_idx;
 		cp->elements[cpu].idx = IDX_INVALID;
 		cpudl_heapify(cp, old_idx);
-
-		cpumask_set_cpu(cpu, cp->free_cpus);
 	}
+	if (likely(online))
+		__cpumask_set_cpu(cpu, cp->free_cpus);
+	else
+		__cpumask_clear_cpu(cpu, cp->free_cpus);
+
 	raw_spin_unlock_irqrestore(&cp->lock, flags);
 }
 
@@ -228,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
 		cp->elements[new_idx].cpu = cpu;
 		cp->elements[cpu].idx = new_idx;
 		cpudl_heapify_up(cp, new_idx);
-		cpumask_clear_cpu(cpu, cp->free_cpus);
+		__cpumask_clear_cpu(cpu, cp->free_cpus);
 	} else {
 		cp->elements[old_idx].dl = dl;
 		cpudl_heapify(cp, old_idx);
@@ -237,26 +241,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
 	raw_spin_unlock_irqrestore(&cp->lock, flags);
 }
 
-/*
- * cpudl_set_freecpu - Set the cpudl.free_cpus
- * @cp: the cpudl max-heap context
- * @cpu: rd attached CPU
- */
-void cpudl_set_freecpu(struct cpudl *cp, int cpu)
-{
-	cpumask_set_cpu(cpu, cp->free_cpus);
-}
-
-/*
- * cpudl_clear_freecpu - Clear the cpudl.free_cpus
- * @cp: the cpudl max-heap context
- * @cpu: rd attached CPU
- */
-void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
-{
-	cpumask_clear_cpu(cpu, cp->free_cpus);
-}
-
 /*
  * cpudl_init - initialize the cpudl structure
  * @cp: the cpudl max-heap context
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 11c0f1faa7e1..d7699468eedd 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -19,8 +19,6 @@ struct cpudl {
 
 int  cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
 void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
-void cpudl_clear(struct cpudl *cp, int cpu);
+void cpudl_clear(struct cpudl *cp, int cpu, bool online);
 int  cpudl_init(struct cpudl *cp);
-void cpudl_set_freecpu(struct cpudl *cp, int cpu);
-void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
 void cpudl_cleanup(struct cpudl *cp);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e2d51f4306b3..ef91871b14d8 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1832,7 +1832,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 	if (!dl_rq->dl_nr_running) {
 		dl_rq->earliest_dl.curr = 0;
 		dl_rq->earliest_dl.next = 0;
-		cpudl_clear(&rq->rd->cpudl, rq->cpu);
+		cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online);
 		cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
 	} else {
 		struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
@@ -2878,9 +2878,10 @@ static void rq_online_dl(struct rq *rq)
 	if (rq->dl.overloaded)
 		dl_set_overload(rq);
 
-	cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
 	if (rq->dl.dl_nr_running > 0)
 		cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
+	else
+		cpudl_clear(&rq->rd->cpudl, rq->cpu, true);
 }
 
 /* Assumes rq->lock is held */
@@ -2889,8 +2890,7 @@ static void rq_offline_dl(struct rq *rq)
 	if (rq->dl.overloaded)
 		dl_clear_overload(rq);
 
-	cpudl_clear(&rq->rd->cpudl, rq->cpu);
-	cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
+	cpudl_clear(&rq->rd->cpudl, rq->cpu, false);
 }
 
 void __init init_sched_dl_class(void)
-- 
2.34.1
Re: [PATCH v2] sched/deadline: only set free_cpus for online runqueues
Posted by Juri Lelli 1 month, 2 weeks ago
Hello,

On 14/08/25 18:22, Doug Berger wrote:
> Commit 16b269436b72 ("sched/deadline: Modify cpudl::free_cpus
> to reflect rd->online") introduced the cpudl_set/clear_freecpu
> functions to allow the cpu_dl::free_cpus mask to be manipulated
> by the deadline scheduler class rq_on/offline callbacks so the
> mask would also reflect this state.
> 
> Commit 9659e1eeee28 ("sched/deadline: Remove cpu_active_mask
> from cpudl_find()") removed the check of the cpu_active_mask to
> save some processing on the premise that the cpudl::free_cpus
> mask already reflected the runqueue online state.
> 
> Unfortunately, there are cases where it is possible for the
> cpudl_clear function to set the free_cpus bit for a CPU when the
> deadline runqueue is offline. When this occurs while a CPU is
> connected to the default root domain the flag may retain the bad
> state after the CPU has been unplugged. Later, a different CPU
> that is transitioning through the default root domain may push a
> deadline task to the powered down CPU when cpudl_find sees its
> free_cpus bit is set. If this happens the task will not have the
> opportunity to run.
> 
> One example is outlined here:
> https://lore.kernel.org/lkml/20250110233010.2339521-1-opendmb@gmail.com
> 
> Another occurs when the last deadline task is migrated from a
> CPU that has an offlined runqueue. The dequeue_task member of
> the deadline scheduler class will eventually call cpudl_clear
> and set the free_cpus bit for the CPU.
> 
> This commit modifies the cpudl_clear function to be aware of the
> online state of the deadline runqueue so that the free_cpus mask
> can be updated appropriately.
> 
> It is no longer necessary to manage the mask outside of the
> cpudl_set/clear functions so the cpudl_set/clear_freecpu
> functions are removed. In addition, since the free_cpus mask is
> now only updated under the cpudl lock the code was changed to
> use the non-atomic __cpumask functions.
> 
> Signed-off-by: Doug Berger <opendmb@gmail.com>
> ---

This looks now good to me.

Acked-by: Juri Lelli <juri.lelli@redhat.com>

Thanks,
Juri
Re: [PATCH v2] sched/deadline: only set free_cpus for online runqueues
Posted by Peter Zijlstra 1 month ago
On Mon, Aug 18, 2025 at 02:58:19PM +0200, Juri Lelli wrote:
> Hello,
> 
> On 14/08/25 18:22, Doug Berger wrote:
> > Commit 16b269436b72 ("sched/deadline: Modify cpudl::free_cpus
> > to reflect rd->online") introduced the cpudl_set/clear_freecpu
> > functions to allow the cpu_dl::free_cpus mask to be manipulated
> > by the deadline scheduler class rq_on/offline callbacks so the
> > mask would also reflect this state.
> > 
> > Commit 9659e1eeee28 ("sched/deadline: Remove cpu_active_mask
> > from cpudl_find()") removed the check of the cpu_active_mask to
> > save some processing on the premise that the cpudl::free_cpus
> > mask already reflected the runqueue online state.
> > 
> > Unfortunately, there are cases where it is possible for the
> > cpudl_clear function to set the free_cpus bit for a CPU when the
> > deadline runqueue is offline. When this occurs while a CPU is
> > connected to the default root domain the flag may retain the bad
> > state after the CPU has been unplugged. Later, a different CPU
> > that is transitioning through the default root domain may push a
> > deadline task to the powered down CPU when cpudl_find sees its
> > free_cpus bit is set. If this happens the task will not have the
> > opportunity to run.
> > 
> > One example is outlined here:
> > https://lore.kernel.org/lkml/20250110233010.2339521-1-opendmb@gmail.com
> > 
> > Another occurs when the last deadline task is migrated from a
> > CPU that has an offlined runqueue. The dequeue_task member of
> > the deadline scheduler class will eventually call cpudl_clear
> > and set the free_cpus bit for the CPU.
> > 
> > This commit modifies the cpudl_clear function to be aware of the
> > online state of the deadline runqueue so that the free_cpus mask
> > can be updated appropriately.
> > 
> > It is no longer necessary to manage the mask outside of the
> > cpudl_set/clear functions so the cpudl_set/clear_freecpu
> > functions are removed. In addition, since the free_cpus mask is
> > now only updated under the cpudl lock the code was changed to
> > use the non-atomic __cpumask functions.
> > 
> > Signed-off-by: Doug Berger <opendmb@gmail.com>
> > ---
> 
> This looks now good to me.
> 
> Acked-by: Juri Lelli <juri.lelli@redhat.com>

So I just had a look at said patch because Juri here poked me; and I
came away with the feeling that cpudl_clear() is now a misnomen, seeing
how it is called from rq_online_dl().

Would cpudl_update() be a better name?
Re: [PATCH v2] sched/deadline: only set free_cpus for online runqueues
Posted by Doug Berger 4 weeks, 1 day ago
On 9/3/2025 12:54 AM, Peter Zijlstra wrote:
> On Mon, Aug 18, 2025 at 02:58:19PM +0200, Juri Lelli wrote:
>> Hello,
>>
>> On 14/08/25 18:22, Doug Berger wrote:
>>> Commit 16b269436b72 ("sched/deadline: Modify cpudl::free_cpus
>>> to reflect rd->online") introduced the cpudl_set/clear_freecpu
>>> functions to allow the cpu_dl::free_cpus mask to be manipulated
>>> by the deadline scheduler class rq_on/offline callbacks so the
>>> mask would also reflect this state.
>>>
>>> Commit 9659e1eeee28 ("sched/deadline: Remove cpu_active_mask
>>> from cpudl_find()") removed the check of the cpu_active_mask to
>>> save some processing on the premise that the cpudl::free_cpus
>>> mask already reflected the runqueue online state.
>>>
>>> Unfortunately, there are cases where it is possible for the
>>> cpudl_clear function to set the free_cpus bit for a CPU when the
>>> deadline runqueue is offline. When this occurs while a CPU is
>>> connected to the default root domain the flag may retain the bad
>>> state after the CPU has been unplugged. Later, a different CPU
>>> that is transitioning through the default root domain may push a
>>> deadline task to the powered down CPU when cpudl_find sees its
>>> free_cpus bit is set. If this happens the task will not have the
>>> opportunity to run.
>>>
>>> One example is outlined here:
>>> https://lore.kernel.org/lkml/20250110233010.2339521-1-opendmb@gmail.com
>>>
>>> Another occurs when the last deadline task is migrated from a
>>> CPU that has an offlined runqueue. The dequeue_task member of
>>> the deadline scheduler class will eventually call cpudl_clear
>>> and set the free_cpus bit for the CPU.
>>>
>>> This commit modifies the cpudl_clear function to be aware of the
>>> online state of the deadline runqueue so that the free_cpus mask
>>> can be updated appropriately.
>>>
>>> It is no longer necessary to manage the mask outside of the
>>> cpudl_set/clear functions so the cpudl_set/clear_freecpu
>>> functions are removed. In addition, since the free_cpus mask is
>>> now only updated under the cpudl lock the code was changed to
>>> use the non-atomic __cpumask functions.
>>>
>>> Signed-off-by: Doug Berger <opendmb@gmail.com>
>>> ---
>>
>> This looks now good to me.
>>
>> Acked-by: Juri Lelli <juri.lelli@redhat.com>
> 
> So I just had a look at said patch because Juri here poked me; and I
> came away with the feeling that cpudl_clear() is now a misnomen, seeing
> how it is called from rq_online_dl().
> 
> Would cpudl_update() be a better name?
Thanks for taking a look.

I don't really have a dog in any fights over naming here, but it seems 
to me that cpudl_clear and cpudl_set are intended to be complementary 
functions and the naming reflects that. It would appear that these are 
primarily intended to maintain the cpudl max-heap entries which is what 
are being set and cleared.

rq_online_dl() would now call one or the other based on whether any 
deadline tasks are running on the queue when it is onlined to ensure 
that the max-heap is valid. This either clears a stale entry that may 
occur from scenarios like the ones I'm running into or set the entry to 
the current deadline. In this context the names seem appropriate to me.

Renaming cpudl_clear to cpudl_update may be more confusing since the 
comment for cpudl_set reads "cpudl_set - update the cpudl max-heap".

I don't feel that the name change is relevant to my patch, but if we 
want to do it concurrently maybe cpudl_clear_max_heap() and 
cpudl_set_max_heap() would be more meaningful.

Please let me know how you would like to proceed,
     Doug
Re: [PATCH v2] sched/deadline: only set free_cpus for online runqueues
Posted by Doug Berger 1 week, 3 days ago
On 9/4/2025 9:43 PM, Doug Berger wrote:
> On 9/3/2025 12:54 AM, Peter Zijlstra wrote:
>> On Mon, Aug 18, 2025 at 02:58:19PM +0200, Juri Lelli wrote:
>>> Hello,
>>>
>>> On 14/08/25 18:22, Doug Berger wrote:
>>>> Commit 16b269436b72 ("sched/deadline: Modify cpudl::free_cpus
>>>> to reflect rd->online") introduced the cpudl_set/clear_freecpu
>>>> functions to allow the cpu_dl::free_cpus mask to be manipulated
>>>> by the deadline scheduler class rq_on/offline callbacks so the
>>>> mask would also reflect this state.
>>>>
>>>> Commit 9659e1eeee28 ("sched/deadline: Remove cpu_active_mask
>>>> from cpudl_find()") removed the check of the cpu_active_mask to
>>>> save some processing on the premise that the cpudl::free_cpus
>>>> mask already reflected the runqueue online state.
>>>>
>>>> Unfortunately, there are cases where it is possible for the
>>>> cpudl_clear function to set the free_cpus bit for a CPU when the
>>>> deadline runqueue is offline. When this occurs while a CPU is
>>>> connected to the default root domain the flag may retain the bad
>>>> state after the CPU has been unplugged. Later, a different CPU
>>>> that is transitioning through the default root domain may push a
>>>> deadline task to the powered down CPU when cpudl_find sees its
>>>> free_cpus bit is set. If this happens the task will not have the
>>>> opportunity to run.
>>>>
>>>> One example is outlined here:
>>>> https://lore.kernel.org/lkml/20250110233010.2339521-1-opendmb@gmail.com
>>>>
>>>> Another occurs when the last deadline task is migrated from a
>>>> CPU that has an offlined runqueue. The dequeue_task member of
>>>> the deadline scheduler class will eventually call cpudl_clear
>>>> and set the free_cpus bit for the CPU.
>>>>
>>>> This commit modifies the cpudl_clear function to be aware of the
>>>> online state of the deadline runqueue so that the free_cpus mask
>>>> can be updated appropriately.
>>>>
>>>> It is no longer necessary to manage the mask outside of the
>>>> cpudl_set/clear functions so the cpudl_set/clear_freecpu
>>>> functions are removed. In addition, since the free_cpus mask is
>>>> now only updated under the cpudl lock the code was changed to
>>>> use the non-atomic __cpumask functions.
>>>>
>>>> Signed-off-by: Doug Berger <opendmb@gmail.com>
>>>> ---
>>>
>>> This looks now good to me.
>>>
>>> Acked-by: Juri Lelli <juri.lelli@redhat.com>
>>
>> So I just had a look at said patch because Juri here poked me; and I
>> came away with the feeling that cpudl_clear() is now a misnomen, seeing
>> how it is called from rq_online_dl().
>>
>> Would cpudl_update() be a better name?
> Thanks for taking a look.
> 
> I don't really have a dog in any fights over naming here, but it seems 
> to me that cpudl_clear and cpudl_set are intended to be complementary 
> functions and the naming reflects that. It would appear that these are 
> primarily intended to maintain the cpudl max-heap entries which is what 
> are being set and cleared.
> 
> rq_online_dl() would now call one or the other based on whether any 
> deadline tasks are running on the queue when it is onlined to ensure 
> that the max-heap is valid. This either clears a stale entry that may 
> occur from scenarios like the ones I'm running into or set the entry to 
> the current deadline. In this context the names seem appropriate to me.
> 
> Renaming cpudl_clear to cpudl_update may be more confusing since the 
> comment for cpudl_set reads "cpudl_set - update the cpudl max-heap".
> 
> I don't feel that the name change is relevant to my patch, but if we 
> want to do it concurrently maybe cpudl_clear_max_heap() and 
> cpudl_set_max_heap() would be more meaningful.
> 
> Please let me know how you would like to proceed,
>      Doug

Is there any way I can help to move this patch forward?

Thanks,
     Doug
Re: [PATCH v2] sched/deadline: only set free_cpus for online runqueues
Posted by Florian Fainelli 1 day, 17 hours ago

On 9/23/2025 11:03 AM, Doug Berger wrote:
> On 9/4/2025 9:43 PM, Doug Berger wrote:
>> On 9/3/2025 12:54 AM, Peter Zijlstra wrote:
>>> On Mon, Aug 18, 2025 at 02:58:19PM +0200, Juri Lelli wrote:
>>>> Hello,
>>>>
>>>> On 14/08/25 18:22, Doug Berger wrote:
>>>>> Commit 16b269436b72 ("sched/deadline: Modify cpudl::free_cpus
>>>>> to reflect rd->online") introduced the cpudl_set/clear_freecpu
>>>>> functions to allow the cpu_dl::free_cpus mask to be manipulated
>>>>> by the deadline scheduler class rq_on/offline callbacks so the
>>>>> mask would also reflect this state.
>>>>>
>>>>> Commit 9659e1eeee28 ("sched/deadline: Remove cpu_active_mask
>>>>> from cpudl_find()") removed the check of the cpu_active_mask to
>>>>> save some processing on the premise that the cpudl::free_cpus
>>>>> mask already reflected the runqueue online state.
>>>>>
>>>>> Unfortunately, there are cases where it is possible for the
>>>>> cpudl_clear function to set the free_cpus bit for a CPU when the
>>>>> deadline runqueue is offline. When this occurs while a CPU is
>>>>> connected to the default root domain the flag may retain the bad
>>>>> state after the CPU has been unplugged. Later, a different CPU
>>>>> that is transitioning through the default root domain may push a
>>>>> deadline task to the powered down CPU when cpudl_find sees its
>>>>> free_cpus bit is set. If this happens the task will not have the
>>>>> opportunity to run.
>>>>>
>>>>> One example is outlined here:
>>>>> https://lore.kernel.org/lkml/20250110233010.2339521-1- 
>>>>> opendmb@gmail.com
>>>>>
>>>>> Another occurs when the last deadline task is migrated from a
>>>>> CPU that has an offlined runqueue. The dequeue_task member of
>>>>> the deadline scheduler class will eventually call cpudl_clear
>>>>> and set the free_cpus bit for the CPU.
>>>>>
>>>>> This commit modifies the cpudl_clear function to be aware of the
>>>>> online state of the deadline runqueue so that the free_cpus mask
>>>>> can be updated appropriately.
>>>>>
>>>>> It is no longer necessary to manage the mask outside of the
>>>>> cpudl_set/clear functions so the cpudl_set/clear_freecpu
>>>>> functions are removed. In addition, since the free_cpus mask is
>>>>> now only updated under the cpudl lock the code was changed to
>>>>> use the non-atomic __cpumask functions.
>>>>>
>>>>> Signed-off-by: Doug Berger <opendmb@gmail.com>
>>>>> ---
>>>>
>>>> This looks now good to me.
>>>>
>>>> Acked-by: Juri Lelli <juri.lelli@redhat.com>
>>>
>>> So I just had a look at said patch because Juri here poked me; and I
>>> came away with the feeling that cpudl_clear() is now a misnomen, seeing
>>> how it is called from rq_online_dl().
>>>
>>> Would cpudl_update() be a better name?
>> Thanks for taking a look.
>>
>> I don't really have a dog in any fights over naming here, but it seems 
>> to me that cpudl_clear and cpudl_set are intended to be complementary 
>> functions and the naming reflects that. It would appear that these are 
>> primarily intended to maintain the cpudl max-heap entries which is 
>> what are being set and cleared.
>>
>> rq_online_dl() would now call one or the other based on whether any 
>> deadline tasks are running on the queue when it is onlined to ensure 
>> that the max-heap is valid. This either clears a stale entry that may 
>> occur from scenarios like the ones I'm running into or set the entry 
>> to the current deadline. In this context the names seem appropriate to 
>> me.
>>
>> Renaming cpudl_clear to cpudl_update may be more confusing since the 
>> comment for cpudl_set reads "cpudl_set - update the cpudl max-heap".
>>
>> I don't feel that the name change is relevant to my patch, but if we 
>> want to do it concurrently maybe cpudl_clear_max_heap() and 
>> cpudl_set_max_heap() would be more meaningful.
>>
>> Please let me know how you would like to proceed,
>>      Doug
> 
> Is there any way I can help to move this patch forward?

OK, so we are all busy and whatnot but the response time on this patch 
seems to be completely abysmal, and reminder that it fixes an actual bug 
that we have been running into on production systems. Can we just get it 
applied and move on?
-- 
Florian

Re: [PATCH v2] sched/deadline: only set free_cpus for online runqueues
Posted by Florian Fainelli 1 month ago
On 8/18/25 05:58, Juri Lelli wrote:
> Hello,
> 
> On 14/08/25 18:22, Doug Berger wrote:
>> Commit 16b269436b72 ("sched/deadline: Modify cpudl::free_cpus
>> to reflect rd->online") introduced the cpudl_set/clear_freecpu
>> functions to allow the cpu_dl::free_cpus mask to be manipulated
>> by the deadline scheduler class rq_on/offline callbacks so the
>> mask would also reflect this state.
>>
>> Commit 9659e1eeee28 ("sched/deadline: Remove cpu_active_mask
>> from cpudl_find()") removed the check of the cpu_active_mask to
>> save some processing on the premise that the cpudl::free_cpus
>> mask already reflected the runqueue online state.
>>
>> Unfortunately, there are cases where it is possible for the
>> cpudl_clear function to set the free_cpus bit for a CPU when the
>> deadline runqueue is offline. When this occurs while a CPU is
>> connected to the default root domain the flag may retain the bad
>> state after the CPU has been unplugged. Later, a different CPU
>> that is transitioning through the default root domain may push a
>> deadline task to the powered down CPU when cpudl_find sees its
>> free_cpus bit is set. If this happens the task will not have the
>> opportunity to run.
>>
>> One example is outlined here:
>> https://lore.kernel.org/lkml/20250110233010.2339521-1-opendmb@gmail.com
>>
>> Another occurs when the last deadline task is migrated from a
>> CPU that has an offlined runqueue. The dequeue_task member of
>> the deadline scheduler class will eventually call cpudl_clear
>> and set the free_cpus bit for the CPU.
>>
>> This commit modifies the cpudl_clear function to be aware of the
>> online state of the deadline runqueue so that the free_cpus mask
>> can be updated appropriately.
>>
>> It is no longer necessary to manage the mask outside of the
>> cpudl_set/clear functions so the cpudl_set/clear_freecpu
>> functions are removed. In addition, since the free_cpus mask is
>> now only updated under the cpudl lock the code was changed to
>> use the non-atomic __cpumask functions.
>>
>> Signed-off-by: Doug Berger <opendmb@gmail.com>
>> ---
> 
> This looks now good to me.
> 
> Acked-by: Juri Lelli <juri.lelli@redhat.com>

Thanks Juri, can we apply it?
-- 
Florian