[PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb

Shrikanth Hegde posted 2 patches 2 weeks, 4 days ago
[PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Posted by Shrikanth Hegde 2 weeks, 4 days ago
Current code around nohz_balancer_kick and kick_ilb:
1. Checks for nohz.idle_cpus_mask to see if idle load balance(ilb) is
   needed.
2. Does a few checks to see if any conditions meet the criteria.
3. Tries to find the idle CPU. But the idle CPU found should be part of
   housekeeping CPUs.

If there is no housekeeping idle CPU, then step 2 is done
unnecessarily, since step 3 bails out without doing the ilb.

Fix that by making the decision early and pass it on to find_new_ilb.
Use a percpu cpumask instead of allocating one every time, since this is
in the fastpath.

If flags is set to NOHZ_STATS_KICK (because the time is after nohz.next_blocked
but before nohz.next_balance) and there are idle CPUs which are part of
housekeeping, the same logic needs to be applied there too.

While there, fix the stale comments around nohz.nr_cpus.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
---

Didn't add the fixes tag since it addresses more than stale comments.

 kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b19aeaa51ebc..02cca2c7a98d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7392,6 +7392,7 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
 static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
+static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
 
 #ifdef CONFIG_NO_HZ_COMMON
 
@@ -12629,15 +12630,14 @@ static inline int on_null_domain(struct rq *rq)
  * - When one of the busy CPUs notices that there may be an idle rebalancing
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
+ *
+ *   @cpus idle CPUs in HK_TYPE_KERNEL_NOISE housekeeping
  */
-static inline int find_new_ilb(void)
+static inline int find_new_ilb(struct cpumask *cpus)
 {
-	const struct cpumask *hk_mask;
 	int ilb_cpu;
 
-	hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
-
-	for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
+	for_each_cpu(ilb_cpu, cpus) {
 
 		if (ilb_cpu == smp_processor_id())
 			continue;
@@ -12656,7 +12656,7 @@ static inline int find_new_ilb(void)
  * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
  * (if there is one).
  */
-static void kick_ilb(unsigned int flags)
+static void kick_ilb(unsigned int flags, struct cpumask *cpus)
 {
 	int ilb_cpu;
 
@@ -12667,7 +12667,7 @@ static void kick_ilb(unsigned int flags)
 	if (flags & NOHZ_BALANCE_KICK)
 		nohz.next_balance = jiffies+1;
 
-	ilb_cpu = find_new_ilb();
+	ilb_cpu = find_new_ilb(cpus);
 	if (ilb_cpu < 0)
 		return;
 
@@ -12700,6 +12700,7 @@ static void kick_ilb(unsigned int flags)
  */
 static void nohz_balancer_kick(struct rq *rq)
 {
+	struct cpumask *ilb_cpus = this_cpu_cpumask_var_ptr(kick_ilb_tmpmask);
 	unsigned long now = jiffies;
 	struct sched_domain_shared *sds;
 	struct sched_domain *sd;
@@ -12715,27 +12716,41 @@ static void nohz_balancer_kick(struct rq *rq)
 	 */
 	nohz_balance_exit_idle(rq);
 
+	/* ILB considers only HK_TYPE_KERNEL_NOISE housekeeping CPUs */
+
 	if (READ_ONCE(nohz.has_blocked_load) &&
-	    time_after(now, READ_ONCE(nohz.next_blocked)))
+	    time_after(now, READ_ONCE(nohz.next_blocked))) {
 		flags = NOHZ_STATS_KICK;
+		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
+			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+	}
 
 	/*
-	 * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
-	 * Skip the read if time is not due.
+	 * Most of the time system is not 100% busy. i.e there are idle
+	 * housekeeping CPUs.
+	 *
+	 * So, Skip the reading idle_cpus_mask if time is not due.
 	 *
 	 * If none are in tickless mode, there maybe a narrow window
 	 * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
 	 * But idle load balancing is not done as find_new_ilb fails.
-	 * That's very rare. So read nohz.nr_cpus only if time is due.
+	 * That's very rare. So check (idle_cpus_mask & HK_TYPE_KERNEL_NOISE)
+	 * only if time is due.
+	 *
 	 */
 	if (time_before(now, nohz.next_balance))
 		goto out;
 
+	/* Avoid the double computation */
+	if (flags != NOHZ_STATS_KICK)
+		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
+			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+
 	/*
 	 * None are in tickless mode and hence no need for NOHZ idle load
 	 * balancing
 	 */
-	if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
+	if (unlikely(cpumask_empty(ilb_cpus)))
 		return;
 
 	if (rq->nr_running >= 2) {
@@ -12767,7 +12782,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * When balancing between cores, all the SMT siblings of the
 		 * preferred CPU must be idle.
 		 */
-		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
+		for_each_cpu_and(i, sched_domain_span(sd), ilb_cpus) {
 			if (sched_asym(sd, i, cpu)) {
 				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 				goto unlock;
@@ -12820,7 +12835,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		flags |= NOHZ_NEXT_KICK;
 
 	if (flags)
-		kick_ilb(flags);
+		kick_ilb(flags, ilb_cpus);
 }
 
 static void set_cpu_sd_state_busy(int cpu)
@@ -14253,6 +14268,8 @@ __init void init_sched_fair_class(void)
 		zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
 		zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
 					GFP_KERNEL, cpu_to_node(i));
+		zalloc_cpumask_var_node(&per_cpu(kick_ilb_tmpmask, i),
+					GFP_KERNEL, cpu_to_node(i));
 
 #ifdef CONFIG_CFS_BANDWIDTH
 		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
-- 
2.43.0
Re: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Posted by K Prateek Nayak 2 weeks, 3 days ago
Hello Shrikanth,

On 3/19/2026 12:23 PM, Shrikanth Hegde wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b19aeaa51ebc..02cca2c7a98d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7392,6 +7392,7 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
>  static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
>  static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
>  static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
> +static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);

nit. We can rename and reuse select_rq_mask. Wakeups happen with IRQs
disabled and kick happens from the hrtimer handler so it should be safe
to reuse that and save some space.

Thoughts?

[..snip..]

> @@ -12715,27 +12716,41 @@ static void nohz_balancer_kick(struct rq *rq)
>  	 */
>  	nohz_balance_exit_idle(rq);
>  
> +	/* ILB considers only HK_TYPE_KERNEL_NOISE housekeeping CPUs */
> +
>  	if (READ_ONCE(nohz.has_blocked_load) &&
> -	    time_after(now, READ_ONCE(nohz.next_blocked)))
> +	    time_after(now, READ_ONCE(nohz.next_blocked))) {
>  		flags = NOHZ_STATS_KICK;
> +		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
> +			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
> +	}
>  
>  	/*
> -	 * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
> -	 * Skip the read if time is not due.
> +	 * Most of the time system is not 100% busy. i.e there are idle
> +	 * housekeeping CPUs.
> +	 *
> +	 * So, Skip the reading idle_cpus_mask if time is not due.
>  	 *
>  	 * If none are in tickless mode, there maybe a narrow window
>  	 * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
>  	 * But idle load balancing is not done as find_new_ilb fails.
> -	 * That's very rare. So read nohz.nr_cpus only if time is due.
> +	 * That's very rare. So check (idle_cpus_mask & HK_TYPE_KERNEL_NOISE)
> +	 * only if time is due.
> +	 *
>  	 */
>  	if (time_before(now, nohz.next_balance))
>  		goto out;
>  
> +	/* Avoid the double computation */
> +	if (flags != NOHZ_STATS_KICK)
> +		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
> +			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
> +
>  	/*
>  	 * None are in tickless mode and hence no need for NOHZ idle load
>  	 * balancing
>  	 */
> -	if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
> +	if (unlikely(cpumask_empty(ilb_cpus)))
>  		return;

We can just use the return value from the previous cpumask_and() for
this and save on another cpumask iteration.

>  
>  	if (rq->nr_running >= 2) {
-- 
Thanks and Regards,
Prateek
Re: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Posted by Shrikanth Hegde 2 weeks, 3 days ago

On 3/20/26 9:07 AM, K Prateek Nayak wrote:
> Hello Shrikanth,
> 
> On 3/19/2026 12:23 PM, Shrikanth Hegde wrote:
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index b19aeaa51ebc..02cca2c7a98d 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -7392,6 +7392,7 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
>>   static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
>>   static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
>>   static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
>> +static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
> 
> nit. We can rename and reuse select_rq_mask. Wakeups happen with IRQs
> disabled and kick happens from the hrtimer handler so it should be safe
> to reuse that and save some space.
> 
> Thoughts?

Maybe, but it could be a confusing name. sched_tmpmask?

We already do similar stuff with load_balance_mask, select_rq_mask.
So, I would prefer to keep it separate.

> 
> [..snip..]
> 
>> @@ -12715,27 +12716,41 @@ static void nohz_balancer_kick(struct rq *rq)
>>   	 */
>>   	nohz_balance_exit_idle(rq);
>>   
>> +	/* ILB considers only HK_TYPE_KERNEL_NOISE housekeeping CPUs */
>> +
>>   	if (READ_ONCE(nohz.has_blocked_load) &&
>> -	    time_after(now, READ_ONCE(nohz.next_blocked)))
>> +	    time_after(now, READ_ONCE(nohz.next_blocked))) {
>>   		flags = NOHZ_STATS_KICK;
>> +		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
>> +			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
>> +	}
>>   
>>   	/*
>> -	 * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
>> -	 * Skip the read if time is not due.
>> +	 * Most of the time system is not 100% busy. i.e there are idle
>> +	 * housekeeping CPUs.
>> +	 *
>> +	 * So, Skip the reading idle_cpus_mask if time is not due.
>>   	 *
>>   	 * If none are in tickless mode, there maybe a narrow window
>>   	 * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
>>   	 * But idle load balancing is not done as find_new_ilb fails.
>> -	 * That's very rare. So read nohz.nr_cpus only if time is due.
>> +	 * That's very rare. So check (idle_cpus_mask & HK_TYPE_KERNEL_NOISE)
>> +	 * only if time is due.
>> +	 *
>>   	 */
>>   	if (time_before(now, nohz.next_balance))
>>   		goto out;
>>   
>> +	/* Avoid the double computation */
>> +	if (flags != NOHZ_STATS_KICK)
>> +		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
>> +			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
>> +
>>   	/*
>>   	 * None are in tickless mode and hence no need for NOHZ idle load
>>   	 * balancing
>>   	 */
>> -	if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
>> +	if (unlikely(cpumask_empty(ilb_cpus)))
>>   		return;
> 
> We can just use the return value from the previous cpumask_and() for
> this and save on another cpumask iteration.
> 

Makes sense. Will do.

>>   
>>   	if (rq->nr_running >= 2) {
Re: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Posted by Peter Zijlstra 2 weeks, 3 days ago
On Fri, Mar 20, 2026 at 02:49:30PM +0530, Shrikanth Hegde wrote:
> 
> 
> On 3/20/26 9:07 AM, K Prateek Nayak wrote:
> > Hello Shrikanth,
> > 
> > On 3/19/2026 12:23 PM, Shrikanth Hegde wrote:
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index b19aeaa51ebc..02cca2c7a98d 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -7392,6 +7392,7 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
> > >   static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
> > >   static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
> > >   static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
> > > +static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
> > 
> > nit. We can rename and reuse select_rq_mask. Wakeups happen with IRQs
> > disabled and kick happens from the hrtimer handler so it should be safe
> > to reuse that and save some space.
> > 
> > Thoughts?
> 
> May be. but it could be a confusing name. sched_tmpmask?
> 
> We could similar stuff already to load_balance_mask, select_rq_mask.
> So, i would prefer to keep it separate.

But then we keep growing this ad infinitum.

The more sensible option is to name them after the context and have
get/put accessors that (for PROVE_LOCKING builds or so) verify the
context and maybe even 'lock' them to make sure nobody is trying to use
one for two things at the same time.

That should make it clearer whats what and improve reuse, no?
Re: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Posted by Shrikanth Hegde 2 weeks, 3 days ago

On 3/20/26 5:13 PM, Peter Zijlstra wrote:
> On Fri, Mar 20, 2026 at 02:49:30PM +0530, Shrikanth Hegde wrote:
>>
>>
>> On 3/20/26 9:07 AM, K Prateek Nayak wrote:
>>> Hello Shrikanth,
>>>
>>> On 3/19/2026 12:23 PM, Shrikanth Hegde wrote:
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index b19aeaa51ebc..02cca2c7a98d 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -7392,6 +7392,7 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
>>>>    static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
>>>>    static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
>>>>    static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
>>>> +static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
>>>
>>> nit. We can rename and reuse select_rq_mask. Wakeups happen with IRQs
>>> disabled and kick happens from the hrtimer handler so it should be safe
>>> to reuse that and save some space.
>>>
>>> Thoughts?
>>
>> May be. but it could be a confusing name. sched_tmpmask?
>>
>> We could similar stuff already to load_balance_mask, select_rq_mask.
>> So, i would prefer to keep it separate.
> 
> But then we keep growing this ad infinitum.
> 
> The more sensible option is to name them after the context and have
> get/put accessors that (for PROVE_LOCKING builds or so) verify the
> context and maybe even 'lock' them to make sure nobody is trying to use
> one for two things at the same time.
> 
> That should make it clearer whats what and improve reuse, no?
> 

We have these:

deadline.c:static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
ext_idle.c:static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
ext_idle.c:static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
ext_idle.c:static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
fair.c:static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
fair.c:static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
fair.c:static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
rt.c:static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask)

Here,
1. load_balance_mask and should_we_balance_tmpmask are used in the same context
    all it cares is - I need two cpumask to play with.

2. local_cpu_mask, local_cpu_mask_dl, select_rq_mask are used in independent contexts.
    all it cares is - I need one cpumask to play with.

3. ext_idle.c: local_* are used the same context.
    all it cares is - I need three cpumask to play with
    technically it is doable with one.(but that's a separate story)

1 and 2 are in interrupt disable section, 3 i am not completely sure.


I am wondering something like would make sense?


DEFINE_PER_CPU(cpumask_var_t, sched_tmp_cpumask1);
DEFINE_PER_CPU(cpumask_var_t, sched_tmp_cpumask2);
DEFINE_PER_CPU(cpumask_var_t, sched_tmp_cpumask3);

Request for a tmp cpumask with number?
i.e 2 would say sched_request_tmpmask(0)
     1 would say sched_request_tmpmask(0), and then sched_request_tmpmask(1)
     3 would say sched_request_tmpmask(0), sched_request_tmpmask(1) and sched_request_tmpmask(2)

Do this if interrupts are disabled.
if interrupt are enabled, then maybe do allocation/free instead?

That would give us the get/put routines for all cases.
Re: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Posted by Shrikanth Hegde 2 weeks, 3 days ago

On 3/20/26 7:42 PM, Shrikanth Hegde wrote:
> 
> 
> On 3/20/26 5:13 PM, Peter Zijlstra wrote:
>> On Fri, Mar 20, 2026 at 02:49:30PM +0530, Shrikanth Hegde wrote:
>>>
>>>
>>> On 3/20/26 9:07 AM, K Prateek Nayak wrote:
>>>> Hello Shrikanth,
>>>>
>>>> On 3/19/2026 12:23 PM, Shrikanth Hegde wrote:
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index b19aeaa51ebc..02cca2c7a98d 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -7392,6 +7392,7 @@ static inline unsigned int 
>>>>> cfs_h_nr_delayed(struct rq *rq)
>>>>>    static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
>>>>>    static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
>>>>>    static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
>>>>> +static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
>>>>
>>>> nit. We can rename and reuse select_rq_mask. Wakeups happen with IRQs
>>>> disabled and kick happens from the hrtimer handler so it should be safe
>>>> to reuse that and save some space.
>>>>
>>>> Thoughts?
>>>
>>> May be. but it could be a confusing name. sched_tmpmask?
>>>
>>> We could similar stuff already to load_balance_mask, select_rq_mask.
>>> So, i would prefer to keep it separate.
>>
>> But then we keep growing this ad infinitum.
>>
>> The more sensible option is to name them after the context and have
>> get/put accessors that (for PROVE_LOCKING builds or so) verify the
>> context and maybe even 'lock' them to make sure nobody is trying to use
>> one for two things at the same time.
>>


Maybe we can define one cpumask which all irq-disabled sections could use?
i.e for try_to_wake_up and nohz_balancer_kick?

Is that what you meant?

>> That should make it clearer whats what and improve reuse, no?
>>
> 
> We have these:
> 
> deadline.c:static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
> ext_idle.c:static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
> ext_idle.c:static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
> ext_idle.c:static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
> fair.c:static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
> fair.c:static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
> fair.c:static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
> rt.c:static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask)
> 
> Here,
> 1. load_balance_mask and should_we_balance_tmpmask are used in the same 
> context
>     all it cares is - I need two cpumask to play with.
> 
> 2. local_cpu_mask, local_cpu_mask_dl, select_rq_mask are used in 
> independent context - all it cares is
>     all it cares is - I need one cpumask to play with.
> 
> 3. ext_idle.c: local_* are used the same context.
>     all it cares is - I need three cpumask to play with
>     technically it is doable with one.(but that's a separate story)
> 
> 1 and 2 are in interrupt disable section, 3 i am not completely sure.
> 
> 
> I am wondering something like would make sense?
> 
> 
> DEFINE_PER_CPU(cpumask_var_t, sched_tmp_cpumask1);
> DEFINE_PER_CPU(cpumask_var_t, sched_tmp_cpumask2);
> DEFINE_PER_CPU(cpumask_var_t, sched_tmp_cpumask3);
> 
> Request for a tmp cpumask with number?
> i.e 2 would say sched_request_tmpmask(0)
>      1 would sat sched_request_tmpmask(0), and then 
> sched_request_tmpmask(1)
>      3 would say sched_request_tmpmask(0), sched_request_tmpmask(1) and 
> sched_request_tmpmask(2)
> 
> Do this if interrupts are disabled.
> if interrupt are enabled, then maybe do allocation/free instead?
> 

1 is in softirq. It need not have interrupts disabled.
So doing allocation there is too costly.
Re: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Posted by Shubhang Kaushik 2 weeks, 3 days ago
Hi Shrikanth,

On Thu, 19 Mar 2026, Shrikanth Hegde wrote:

> Current code around nohz_balancer_kick and kick_ilb:
> 1. Checks for nohz.idle_cpus_mask to see if idle load balance(ilb) is
>   needed.
> 2. Does a few checks to see if any conditions meet the criteria.
> 3. Tries to find the idle CPU. But the idle CPU found should be part of
>   housekeeping CPUs.
>
> If there is no housekeeping idle CPU, then step 2 is done
> un-necessarily, since 3 bails out without doing the ilb.
>
> Fix that by making the decision early and pass it on to find_new_ilb.
> Use a percpu cpumask instead of allocating it everytime since this is in
> fastpath.
>
> If flags is set to NOHZ_STATS_KICK since the time is after nohz.next_blocked
> but before nohz.next_balance and there are idle CPUs which are part of
> housekeeping, need to copy the same logic there too.
>
> While there, fix the stale comments around nohz.nr_cpus
>
> Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
> ---
>
> Didn't add the fixes tag since it addresses more than stale comments.
>
> kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++--------------
> 1 file changed, 31 insertions(+), 14 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b19aeaa51ebc..02cca2c7a98d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7392,6 +7392,7 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
> static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
> static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
> static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
> +static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
>
> #ifdef CONFIG_NO_HZ_COMMON
>
> @@ -12629,15 +12630,14 @@ static inline int on_null_domain(struct rq *rq)
>  * - When one of the busy CPUs notices that there may be an idle rebalancing
>  *   needed, they will kick the idle load balancer, which then does idle
>  *   load balancing for all the idle CPUs.
> + *
> + *   @cpus idle CPUs in HK_TYPE_KERNEL_NOISE housekeeping
>  */
> -static inline int find_new_ilb(void)
> +static inline int find_new_ilb(struct cpumask *cpus)
> {
> -	const struct cpumask *hk_mask;
> 	int ilb_cpu;
>
> -	hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
> -
> -	for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
> +	for_each_cpu(ilb_cpu, cpus) {
>
> 		if (ilb_cpu == smp_processor_id())
> 			continue;
> @@ -12656,7 +12656,7 @@ static inline int find_new_ilb(void)
>  * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
>  * (if there is one).
>  */
> -static void kick_ilb(unsigned int flags)
> +static void kick_ilb(unsigned int flags, struct cpumask *cpus)
> {
> 	int ilb_cpu;
>
> @@ -12667,7 +12667,7 @@ static void kick_ilb(unsigned int flags)
> 	if (flags & NOHZ_BALANCE_KICK)
> 		nohz.next_balance = jiffies+1;
>
> -	ilb_cpu = find_new_ilb();
> +	ilb_cpu = find_new_ilb(cpus);
> 	if (ilb_cpu < 0)
> 		return;
>
> @@ -12700,6 +12700,7 @@ static void kick_ilb(unsigned int flags)
>  */
> static void nohz_balancer_kick(struct rq *rq)
> {
> +	struct cpumask *ilb_cpus = this_cpu_cpumask_var_ptr(kick_ilb_tmpmask);
> 	unsigned long now = jiffies;
> 	struct sched_domain_shared *sds;
> 	struct sched_domain *sd;
> @@ -12715,27 +12716,41 @@ static void nohz_balancer_kick(struct rq *rq)
> 	 */
> 	nohz_balance_exit_idle(rq);
>
> +	/* ILB considers only HK_TYPE_KERNEL_NOISE housekeeping CPUs */
> +
> 	if (READ_ONCE(nohz.has_blocked_load) &&
> -	    time_after(now, READ_ONCE(nohz.next_blocked)))
> +	    time_after(now, READ_ONCE(nohz.next_blocked))) {
> 		flags = NOHZ_STATS_KICK;
> +		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
> +			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
> +	}
>
Moving cpumask_and() here would be taxing the busy/requesting CPU on every 
kick check, thereby breaking the lazy evaluation.

In a nohz_full scenario, the housekeeping mask is static (default CPU 0). If 
this HK CPU is busy, it is already running the tick and will handle load 
balancing itself. If it is idle, it is already in the idle_cpus_mask. 
Moving this intersection to the busy CPU disturbs lazy evaluation. Why 
force every busy worker to perform bitmask math just to handle a 
no-idle-HK case that the system handles naturally by being busy?

> 	/*
> -	 * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
> -	 * Skip the read if time is not due.
> +	 * Most of the time system is not 100% busy. i.e there are idle
> +	 * housekeeping CPUs.
> +	 *
> +	 * So, Skip the reading idle_cpus_mask if time is not due.
> 	 *
> 	 * If none are in tickless mode, there maybe a narrow window
> 	 * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
> 	 * But idle load balancing is not done as find_new_ilb fails.
> -	 * That's very rare. So read nohz.nr_cpus only if time is due.
> +	 * That's very rare. So check (idle_cpus_mask & HK_TYPE_KERNEL_NOISE)
> +	 * only if time is due.
> +	 *
> 	 */
> 	if (time_before(now, nohz.next_balance))
> 		goto out;
>
> +	/* Avoid the double computation */
> +	if (flags != NOHZ_STATS_KICK)
> +		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
> +			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
> +
> 	/*
> 	 * None are in tickless mode and hence no need for NOHZ idle load
> 	 * balancing
> 	 */
> -	if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
> +	if (unlikely(cpumask_empty(ilb_cpus)))
> 		return;
>
Checking ilb_cpus here is a broken fix because the underlying 
nohz.idle_cpus_mask is currently unreliable. Under 
CONFIG_NO_HZ_FULL, there is a documented visibility bug: 
tick_nohz_full_stop_tick() often stops the tick before idle entry, causing 
nohz_balance_enter_idle() to be skipped. This means many valid, tickless 
idle CPUs are never added to the mask in the first place.
https://lore.kernel.org/lkml/20260203-fix-nohz-idle-v1-1-ad05a5872080@os.amperecomputing.com/

> 	if (rq->nr_running >= 2) {
> @@ -12767,7 +12782,7 @@ static void nohz_balancer_kick(struct rq *rq)
> 		 * When balancing between cores, all the SMT siblings of the
> 		 * preferred CPU must be idle.
> 		 */
> -		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
> +		for_each_cpu_and(i, sched_domain_span(sd), ilb_cpus) {
> 			if (sched_asym(sd, i, cpu)) {
> 				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
> 				goto unlock;
> @@ -12820,7 +12835,7 @@ static void nohz_balancer_kick(struct rq *rq)
> 		flags |= NOHZ_NEXT_KICK;
>
> 	if (flags)
> -		kick_ilb(flags);
> +		kick_ilb(flags, ilb_cpus);
> }
>
> static void set_cpu_sd_state_busy(int cpu)
> @@ -14253,6 +14268,8 @@ __init void init_sched_fair_class(void)
> 		zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
> 		zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
> 					GFP_KERNEL, cpu_to_node(i));
> +		zalloc_cpumask_var_node(&per_cpu(kick_ilb_tmpmask, i),
> +					GFP_KERNEL, cpu_to_node(i));
>
> #ifdef CONFIG_CFS_BANDWIDTH
> 		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
> -- 
> 2.43.0
>
>

Regards,
Shubhang Kaushik
Re: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Posted by Shrikanth Hegde 2 weeks, 3 days ago
Hi Shubhang. Thanks for taking a look.

On 3/20/26 4:28 AM, Shubhang Kaushik wrote:
> Hi Shrikanth,
> 
> On Thu, 19 Mar 2026, Shrikanth Hegde wrote:
> 
>> Current code around nohz_balancer_kick and kick_ilb:
>> 1. Checks for nohz.idle_cpus_mask to see if idle load balance(ilb) is
>>   needed.
>> 2. Does a few checks to see if any conditions meet the criteria.
>> 3. Tries to find the idle CPU. But the idle CPU found should be part of
>>   housekeeping CPUs.
>>
>> If there is no housekeeping idle CPU, then step 2 is done
>> un-necessarily, since 3 bails out without doing the ilb.
>>
>> Fix that by making the decision early and pass it on to find_new_ilb.
>> Use a percpu cpumask instead of allocating it everytime since this is in
>> fastpath.
>>
>> If flags is set to NOHZ_STATS_KICK since the time is after 
>> nohz.next_blocked
>> but before nohz.next_balance and there are idle CPUs which are part of
>> housekeeping, need to copy the same logic there too.
>>
>> While there, fix the stale comments around nohz.nr_cpus
>>
>> Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
>> ---
>>
>> Didn't add the fixes tag since it addresses more than stale comments.
>>
>> kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++--------------
>> 1 file changed, 31 insertions(+), 14 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index b19aeaa51ebc..02cca2c7a98d 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -7392,6 +7392,7 @@ static inline unsigned int 
>> cfs_h_nr_delayed(struct rq *rq)
>> static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
>> static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
>> static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
>> +static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>>
>> @@ -12629,15 +12630,14 @@ static inline int on_null_domain(struct rq *rq)
>>  * - When one of the busy CPUs notices that there may be an idle 
>> rebalancing
>>  *   needed, they will kick the idle load balancer, which then does idle
>>  *   load balancing for all the idle CPUs.
>> + *
>> + *   @cpus idle CPUs in HK_TYPE_KERNEL_NOISE housekeeping
>>  */
>> -static inline int find_new_ilb(void)
>> +static inline int find_new_ilb(struct cpumask *cpus)
>> {
>> -    const struct cpumask *hk_mask;
>>     int ilb_cpu;
>>
>> -    hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
>> -
>> -    for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
>> +    for_each_cpu(ilb_cpu, cpus) {
>>
>>         if (ilb_cpu == smp_processor_id())
>>             continue;
>> @@ -12656,7 +12656,7 @@ static inline int find_new_ilb(void)
>>  * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping 
>> set
>>  * (if there is one).
>>  */
>> -static void kick_ilb(unsigned int flags)
>> +static void kick_ilb(unsigned int flags, struct cpumask *cpus)
>> {
>>     int ilb_cpu;
>>
>> @@ -12667,7 +12667,7 @@ static void kick_ilb(unsigned int flags)
>>     if (flags & NOHZ_BALANCE_KICK)
>>         nohz.next_balance = jiffies+1;
>>
>> -    ilb_cpu = find_new_ilb();
>> +    ilb_cpu = find_new_ilb(cpus);
>>     if (ilb_cpu < 0)
>>         return;
>>
>> @@ -12700,6 +12700,7 @@ static void kick_ilb(unsigned int flags)
>>  */
>> static void nohz_balancer_kick(struct rq *rq)
>> {
>> +    struct cpumask *ilb_cpus = 
>> this_cpu_cpumask_var_ptr(kick_ilb_tmpmask);
>>     unsigned long now = jiffies;
>>     struct sched_domain_shared *sds;
>>     struct sched_domain *sd;
>> @@ -12715,27 +12716,41 @@ static void nohz_balancer_kick(struct rq *rq)
>>      */
>>     nohz_balance_exit_idle(rq);
>>
>> +    /* ILB considers only HK_TYPE_KERNEL_NOISE housekeeping CPUs */
>> +
>>     if (READ_ONCE(nohz.has_blocked_load) &&
>> -        time_after(now, READ_ONCE(nohz.next_blocked)))
>> +        time_after(now, READ_ONCE(nohz.next_blocked))) {
>>         flags = NOHZ_STATS_KICK;
>> +        cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
>> +                housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
>> +    }
>>
> Moving cpumask_and() here would be taxing the busy/requesting CPU on 
> every kick check, thereby breaking the lazy evaluation.
> 

There are time checks, so not on every tick.

> In nohz_full scenario, the housekeeping mask is static (default CPU 0).

One could specify the mask. What do you mean CPU 0?

> If this HK CPU is busy, it is already running the tick and will handle 
> load balancing itself. If it is idle, it is already in the 
> idle_cpus_mask. Moving this intersection to the busy CPU disturbs lazy 
> evaluation. Why force every busy worker to perform bitmask math just to 
> handle a no-idle-HK case that the system handles naturally by being busy?
> 

Lazy evaluation is about NOHZ_BALANCE_KICK, which does the actual idle balance.
But NOHZ_STATS_KICK would be set more often on a normal system, at 32 msec intervals.

You do have a point. What if it was not due to nohz.next_blocked but due to later
checks such as nr_running > 1? I would say that's rare. I didn't want to put too many
checks, such as doing this only for nohz_full systems and doing lazy evaluation for
systems without it.

I do agree benefit of this patch is limited to case where one
specifies nohz_full=<small set of CPUS>.

>>     /*
>> -     * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
>> -     * Skip the read if time is not due.
>> +     * Most of the time system is not 100% busy. i.e there are idle
>> +     * housekeeping CPUs.
>> +     *
>> +     * So, Skip the reading idle_cpus_mask if time is not due.
>>      *
>>      * If none are in tickless mode, there maybe a narrow window
>>      * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
>>      * But idle load balancing is not done as find_new_ilb fails.
>> -     * That's very rare. So read nohz.nr_cpus only if time is due.
>> +     * That's very rare. So check (idle_cpus_mask & 
>> HK_TYPE_KERNEL_NOISE)
>> +     * only if time is due.
>> +     *
>>      */
>>     if (time_before(now, nohz.next_balance))
>>         goto out;
>>
>> +    /* Avoid the double computation */
>> +    if (flags != NOHZ_STATS_KICK)
>> +        cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
>> +                housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
>> +
>>     /*
>>      * None are in tickless mode and hence no need for NOHZ idle load
>>      * balancing
>>      */
>> -    if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
>> +    if (unlikely(cpumask_empty(ilb_cpus)))
>>         return;
>>
> Checking ilb_cpus here is a broken fix because the underlying 
> nohz.idle_cpus_mask is currently unreliable. Under CONFIG_NO_HZ_FULL, 
> there is a documented visibility bug: tick_nohz_full_stop_tick() often 
> stops the tick before idle entry, causing nohz_balance_enter_idle() to 
> be skipped. This means many valid, tickless idle CPUs are never added to 
> the mask in the first place.

This is not in the scope of this patch.

> https://lore.kernel.org/lkml/20260203-fix-nohz-idle-v1-1- 
> ad05a5872080@os.amperecomputing.com/
> 
>>     if (rq->nr_running >= 2) {
>> @@ -12767,7 +12782,7 @@ static void nohz_balancer_kick(struct rq *rq)
>>          * When balancing between cores, all the SMT siblings of the
>>          * preferred CPU must be idle.
>>          */
>> -        for_each_cpu_and(i, sched_domain_span(sd), 
>> nohz.idle_cpus_mask) {
>> +        for_each_cpu_and(i, sched_domain_span(sd), ilb_cpus) {
>>             if (sched_asym(sd, i, cpu)) {
>>                 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
>>                 goto unlock;
>> @@ -12820,7 +12835,7 @@ static void nohz_balancer_kick(struct rq *rq)
>>         flags |= NOHZ_NEXT_KICK;
>>
>>     if (flags)
>> -        kick_ilb(flags);
>> +        kick_ilb(flags, ilb_cpus);
>> }
>>
>> static void set_cpu_sd_state_busy(int cpu)
>> @@ -14253,6 +14268,8 @@ __init void init_sched_fair_class(void)
>>         zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), 
>> GFP_KERNEL, cpu_to_node(i));
>>         zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
>>                     GFP_KERNEL, cpu_to_node(i));
>> +        zalloc_cpumask_var_node(&per_cpu(kick_ilb_tmpmask, i),
>> +                    GFP_KERNEL, cpu_to_node(i));
>>
>> #ifdef CONFIG_CFS_BANDWIDTH
>>         INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
>> -- 
>> 2.43.0
>>
>>
> 
> Regards,
> Shubhang Kaushik

Re: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Posted by Mukesh Kumar Chaurasiya 2 weeks, 4 days ago
On Thu, Mar 19, 2026 at 12:23:13PM +0530, Shrikanth Hegde wrote:
> Current code around nohz_balancer_kick and kick_ilb:
> 1. Checks for nohz.idle_cpus_mask to see if idle load balance(ilb) is
>    needed.
> 2. Does a few checks to see if any conditions meet the criteria.
> 3. Tries to find the idle CPU. But the idle CPU found should be part of
>    housekeeping CPUs.
> 
> If there is no housekeeping idle CPU, then step 2 is done
> un-necessarily, since 3 bails out without doing the ilb.
> 
> Fix that by making the decision early and pass it on to find_new_ilb.
> Use a percpu cpumask instead of allocating it everytime since this is in
> fastpath.
> 
> If flags is set to NOHZ_STATS_KICK since the time is after nohz.next_blocked
> but before nohz.next_balance and there are idle CPUs which are part of
> housekeeping, need to copy the same logic there too.
> 
> While there, fix the stale comments around nohz.nr_cpus
> 
> Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
> ---
> 
> Didn't add the fixes tag since it addresses more than stale comments.
> 
>  kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++--------------
>  1 file changed, 31 insertions(+), 14 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b19aeaa51ebc..02cca2c7a98d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7392,6 +7392,7 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
>  static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
>  static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
>  static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
> +static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
>  
>  #ifdef CONFIG_NO_HZ_COMMON
>  
> @@ -12629,15 +12630,14 @@ static inline int on_null_domain(struct rq *rq)
>   * - When one of the busy CPUs notices that there may be an idle rebalancing
>   *   needed, they will kick the idle load balancer, which then does idle
>   *   load balancing for all the idle CPUs.
> + *
> + *   @cpus idle CPUs in HK_TYPE_KERNEL_NOISE housekeeping
>   */
> -static inline int find_new_ilb(void)
> +static inline int find_new_ilb(struct cpumask *cpus)
>  {
> -	const struct cpumask *hk_mask;
>  	int ilb_cpu;
>  
> -	hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
> -
> -	for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
> +	for_each_cpu(ilb_cpu, cpus) {
>  
>  		if (ilb_cpu == smp_processor_id())
>  			continue;
> @@ -12656,7 +12656,7 @@ static inline int find_new_ilb(void)
>   * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
>   * (if there is one).
>   */
> -static void kick_ilb(unsigned int flags)
> +static void kick_ilb(unsigned int flags, struct cpumask *cpus)
>  {
>  	int ilb_cpu;
>  
> @@ -12667,7 +12667,7 @@ static void kick_ilb(unsigned int flags)
>  	if (flags & NOHZ_BALANCE_KICK)
>  		nohz.next_balance = jiffies+1;
>  
> -	ilb_cpu = find_new_ilb();
> +	ilb_cpu = find_new_ilb(cpus);
>  	if (ilb_cpu < 0)
>  		return;
>  
> @@ -12700,6 +12700,7 @@ static void kick_ilb(unsigned int flags)
>   */
>  static void nohz_balancer_kick(struct rq *rq)
>  {
> +	struct cpumask *ilb_cpus = this_cpu_cpumask_var_ptr(kick_ilb_tmpmask);
>  	unsigned long now = jiffies;
>  	struct sched_domain_shared *sds;
>  	struct sched_domain *sd;
> @@ -12715,27 +12716,41 @@ static void nohz_balancer_kick(struct rq *rq)
>  	 */
>  	nohz_balance_exit_idle(rq);
>  
> +	/* ILB considers only HK_TYPE_KERNEL_NOISE housekeeping CPUs */
> +
>  	if (READ_ONCE(nohz.has_blocked_load) &&
> -	    time_after(now, READ_ONCE(nohz.next_blocked)))
> +	    time_after(now, READ_ONCE(nohz.next_blocked))) {
>  		flags = NOHZ_STATS_KICK;
> +		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
> +			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
> +	}
>  
>  	/*
> -	 * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
> -	 * Skip the read if time is not due.
> +	 * Most of the time system is not 100% busy. i.e there are idle
> +	 * housekeeping CPUs.
> +	 *
> +	 * So, Skip the reading idle_cpus_mask if time is not due.
>  	 *
>  	 * If none are in tickless mode, there maybe a narrow window
>  	 * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
>  	 * But idle load balancing is not done as find_new_ilb fails.
> -	 * That's very rare. So read nohz.nr_cpus only if time is due.
> +	 * That's very rare. So check (idle_cpus_mask & HK_TYPE_KERNEL_NOISE)
> +	 * only if time is due.
> +	 *
>  	 */
>  	if (time_before(now, nohz.next_balance))
>  		goto out;
>  
> +	/* Avoid the double computation */
> +	if (flags != NOHZ_STATS_KICK)
> +		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
> +			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
> +
There is no usage of ilb_cpus until this point. We could avoid this if-condition
and compute ilb_cpus here instead of earlier.
>  	/*
>  	 * None are in tickless mode and hence no need for NOHZ idle load
>  	 * balancing
>  	 */
> -	if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
> +	if (unlikely(cpumask_empty(ilb_cpus)))
>  		return;
>  
>  	if (rq->nr_running >= 2) {
> @@ -12767,7 +12782,7 @@ static void nohz_balancer_kick(struct rq *rq)
>  		 * When balancing between cores, all the SMT siblings of the
>  		 * preferred CPU must be idle.
>  		 */
> -		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
> +		for_each_cpu_and(i, sched_domain_span(sd), ilb_cpus) {
>  			if (sched_asym(sd, i, cpu)) {
>  				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
>  				goto unlock;
> @@ -12820,7 +12835,7 @@ static void nohz_balancer_kick(struct rq *rq)
>  		flags |= NOHZ_NEXT_KICK;
>  
>  	if (flags)
> -		kick_ilb(flags);
> +		kick_ilb(flags, ilb_cpus);
>  }
>  
>  static void set_cpu_sd_state_busy(int cpu)
> @@ -14253,6 +14268,8 @@ __init void init_sched_fair_class(void)
>  		zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
>  		zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
>  					GFP_KERNEL, cpu_to_node(i));
> +		zalloc_cpumask_var_node(&per_cpu(kick_ilb_tmpmask, i),
> +					GFP_KERNEL, cpu_to_node(i));
>  
>  #ifdef CONFIG_CFS_BANDWIDTH
>  		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
> -- 
> 2.43.0
> 
Rest LGTM

Regards,
Mukesh
Re: [PATCH 1/2] sched/fair: consider hk_mask early in triggering ilb
Posted by Shrikanth Hegde 2 weeks, 4 days ago
Hi Mukesh.

On 3/19/26 1:45 PM, Mukesh Kumar Chaurasiya wrote:
> On Thu, Mar 19, 2026 at 12:23:13PM +0530, Shrikanth Hegde wrote:
>> Current code around nohz_balancer_kick and kick_ilb:
>> 1. Checks for nohz.idle_cpus_mask to see if idle load balance(ilb) is
>>     needed.
>> 2. Does a few checks to see if any conditions meet the criteria.
>> 3. Tries to find the idle CPU. But the idle CPU found should be part of
>>     housekeeping CPUs.
>>
>> If there is no housekeeping idle CPU, then step 2 is done
>> un-necessarily, since 3 bails out without doing the ilb.
>>
>> Fix that by making the decision early and pass it on to find_new_ilb.
>> Use a percpu cpumask instead of allocating it everytime since this is in
>> fastpath.
>>
>> If flags is set to NOHZ_STATS_KICK since the time is after nohz.next_blocked
>> but before nohz.next_balance and there are idle CPUs which are part of
>> housekeeping, need to copy the same logic there too.
>>
>> While there, fix the stale comments around nohz.nr_cpus
>>
>> Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
>> ---
>>
>> Didn't add the fixes tag since it addresses more than stale comments.
>>
>>   kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++--------------
>>   1 file changed, 31 insertions(+), 14 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index b19aeaa51ebc..02cca2c7a98d 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -7392,6 +7392,7 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
>>   static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
>>   static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
>>   static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
>> +static DEFINE_PER_CPU(cpumask_var_t, kick_ilb_tmpmask);
>>   
>>   #ifdef CONFIG_NO_HZ_COMMON
>>   
>> @@ -12629,15 +12630,14 @@ static inline int on_null_domain(struct rq *rq)
>>    * - When one of the busy CPUs notices that there may be an idle rebalancing
>>    *   needed, they will kick the idle load balancer, which then does idle
>>    *   load balancing for all the idle CPUs.
>> + *
>> + *   @cpus idle CPUs in HK_TYPE_KERNEL_NOISE housekeeping
>>    */
>> -static inline int find_new_ilb(void)
>> +static inline int find_new_ilb(struct cpumask *cpus)
>>   {
>> -	const struct cpumask *hk_mask;
>>   	int ilb_cpu;
>>   
>> -	hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
>> -
>> -	for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
>> +	for_each_cpu(ilb_cpu, cpus) {
>>   
>>   		if (ilb_cpu == smp_processor_id())
>>   			continue;
>> @@ -12656,7 +12656,7 @@ static inline int find_new_ilb(void)
>>    * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
>>    * (if there is one).
>>    */
>> -static void kick_ilb(unsigned int flags)
>> +static void kick_ilb(unsigned int flags, struct cpumask *cpus)
>>   {
>>   	int ilb_cpu;
>>   
>> @@ -12667,7 +12667,7 @@ static void kick_ilb(unsigned int flags)
>>   	if (flags & NOHZ_BALANCE_KICK)
>>   		nohz.next_balance = jiffies+1;
>>   
>> -	ilb_cpu = find_new_ilb();
>> +	ilb_cpu = find_new_ilb(cpus);
>>   	if (ilb_cpu < 0)
>>   		return;
>>   
>> @@ -12700,6 +12700,7 @@ static void kick_ilb(unsigned int flags)
>>    */
>>   static void nohz_balancer_kick(struct rq *rq)
>>   {
>> +	struct cpumask *ilb_cpus = this_cpu_cpumask_var_ptr(kick_ilb_tmpmask);
>>   	unsigned long now = jiffies;
>>   	struct sched_domain_shared *sds;
>>   	struct sched_domain *sd;
>> @@ -12715,27 +12716,41 @@ static void nohz_balancer_kick(struct rq *rq)
>>   	 */
>>   	nohz_balance_exit_idle(rq);
>>   
>> +	/* ILB considers only HK_TYPE_KERNEL_NOISE housekeeping CPUs */
>> +
>>   	if (READ_ONCE(nohz.has_blocked_load) &&
>> -	    time_after(now, READ_ONCE(nohz.next_blocked)))
>> +	    time_after(now, READ_ONCE(nohz.next_blocked))) {
>>   		flags = NOHZ_STATS_KICK;
>> +		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
>> +			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
>> +	}
>>   
>>   	/*
>> -	 * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
>> -	 * Skip the read if time is not due.
>> +	 * Most of the time system is not 100% busy. i.e there are idle
>> +	 * housekeeping CPUs.
>> +	 *
>> +	 * So, Skip the reading idle_cpus_mask if time is not due.
>>   	 *
>>   	 * If none are in tickless mode, there maybe a narrow window
>>   	 * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
>>   	 * But idle load balancing is not done as find_new_ilb fails.
>> -	 * That's very rare. So read nohz.nr_cpus only if time is due.
>> +	 * That's very rare. So check (idle_cpus_mask & HK_TYPE_KERNEL_NOISE)
>> +	 * only if time is due.
>> +	 *
>>   	 */
>>   	if (time_before(now, nohz.next_balance))
>>   		goto out;
>>   
>> +	/* Avoid the double computation */
>> +	if (flags != NOHZ_STATS_KICK)
>> +		cpumask_and(ilb_cpus, nohz.idle_cpus_mask,
>> +			    housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
>> +
> There is no usage of ilb_cpus till this point. We can avoid this if
> condition and get the ilb_cpus here itself instead of earlier.

No, computing it earlier is needed. Here is why:

struct cpumask *ilb_cpus = this_cpu_cpumask_var_ptr(kick_ilb_tmpmask) << this is just a variable.

if (READ_ONCE(nohz.has_blocked_load) && time_after(now, READ_ONCE(nohz.next_blocked)))
	flags = NOHZ_STATS_KICK

if (time_before(now, nohz.next_balance))
   	goto out;

If there are idle CPUs, nohz.has_blocked_load is set to 1 on idle entry, which
could be after the previous nohz idle balance. After 32 jiffies, 'now' points
past next_blocked. But nohz.next_balance is typically set 60 jiffies ahead.
So it goes to out with flags set, and that passes ilb_cpus which is not set yet.
Hence setting ilb_cpus in both places is necessary.

I kept it in both places and added the flags check since it is difficult to
predict the movement of nohz.next_balance and nohz.next_blocked, as there are
multiple CPUs involved which may be doing idle entry/exit. On the first tick
after idle exit, nohz_balancer_kick would be called.

>>   	/*
>>   	 * None are in tickless mode and hence no need for NOHZ idle load
>>   	 * balancing
>>   	 */
>> -	if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
>> +	if (unlikely(cpumask_empty(ilb_cpus)))
>>   		return;
>>   
>>   	if (rq->nr_running >= 2) {
>> @@ -12767,7 +12782,7 @@ static void nohz_balancer_kick(struct rq *rq)
>>   		 * When balancing between cores, all the SMT siblings of the
>>   		 * preferred CPU must be idle.
>>   		 */
>> -		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
>> +		for_each_cpu_and(i, sched_domain_span(sd), ilb_cpus) {
>>   			if (sched_asym(sd, i, cpu)) {
>>   				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
>>   				goto unlock;
>> @@ -12820,7 +12835,7 @@ static void nohz_balancer_kick(struct rq *rq)
>>   		flags |= NOHZ_NEXT_KICK;
>>   
>>   	if (flags)
>> -		kick_ilb(flags);
>> +		kick_ilb(flags, ilb_cpus);
>>   }
>>   
>>   static void set_cpu_sd_state_busy(int cpu)
>> @@ -14253,6 +14268,8 @@ __init void init_sched_fair_class(void)
>>   		zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
>>   		zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
>>   					GFP_KERNEL, cpu_to_node(i));
>> +		zalloc_cpumask_var_node(&per_cpu(kick_ilb_tmpmask, i),
>> +					GFP_KERNEL, cpu_to_node(i));
>>   
>>   #ifdef CONFIG_CFS_BANDWIDTH
>>   		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
>> -- 
>> 2.43.0
>>
> Rest LGTM
> 

Thank you for going through the patch.