[PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers

Frederic Weisbecker posted 33 patches 5 months, 3 weeks ago
There is a newer version of this series
[PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers
Posted by Frederic Weisbecker 5 months, 3 weeks ago
HK_TYPE_DOMAIN's cpumask will soon be made modifyable by cpuset.
A synchronization mechanism is then needed to synchronize the updates
with the housekeeping cpumask readers.

Turn the housekeeping cpumasks into RCU pointers. Once a housekeeping
cpumask will be modified, the update side will wait for an RCU grace
period and propagate the change to interested subsystem when deemed
necessary.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/sched/isolation.c | 58 +++++++++++++++++++++++++---------------
 kernel/sched/sched.h     |  1 +
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 8690fb705089..b46c20b5437f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -21,7 +21,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
 EXPORT_SYMBOL_GPL(housekeeping_overridden);
 
 struct housekeeping {
-	cpumask_var_t cpumasks[HK_TYPE_MAX];
+	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
 	unsigned long flags;
 };
 
@@ -33,17 +33,28 @@ bool housekeeping_enabled(enum hk_type type)
 }
 EXPORT_SYMBOL_GPL(housekeeping_enabled);
 
+const struct cpumask *housekeeping_cpumask(enum hk_type type)
+{
+	if (static_branch_unlikely(&housekeeping_overridden)) {
+		if (housekeeping.flags & BIT(type)) {
+			return rcu_dereference_check(housekeeping.cpumasks[type], 1);
+		}
+	}
+	return cpu_possible_mask;
+}
+EXPORT_SYMBOL_GPL(housekeeping_cpumask);
+
 int housekeeping_any_cpu(enum hk_type type)
 {
 	int cpu;
 
 	if (static_branch_unlikely(&housekeeping_overridden)) {
 		if (housekeeping.flags & BIT(type)) {
-			cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
+			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
 			if (cpu < nr_cpu_ids)
 				return cpu;
 
-			cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
+			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
 			if (likely(cpu < nr_cpu_ids))
 				return cpu;
 			/*
@@ -59,28 +70,18 @@ int housekeeping_any_cpu(enum hk_type type)
 }
 EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
 
-const struct cpumask *housekeeping_cpumask(enum hk_type type)
-{
-	if (static_branch_unlikely(&housekeeping_overridden))
-		if (housekeeping.flags & BIT(type))
-			return housekeeping.cpumasks[type];
-	return cpu_possible_mask;
-}
-EXPORT_SYMBOL_GPL(housekeeping_cpumask);
-
 void housekeeping_affine(struct task_struct *t, enum hk_type type)
 {
 	if (static_branch_unlikely(&housekeeping_overridden))
 		if (housekeeping.flags & BIT(type))
-			set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
+			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
 }
 EXPORT_SYMBOL_GPL(housekeeping_affine);
 
 bool housekeeping_test_cpu(int cpu, enum hk_type type)
 {
-	if (static_branch_unlikely(&housekeeping_overridden))
-		if (housekeeping.flags & BIT(type))
-			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
+	if (housekeeping.flags & BIT(type))
+		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
 	return true;
 }
 EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
@@ -96,20 +97,33 @@ void __init housekeeping_init(void)
 
 	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
 		sched_tick_offload_init();
-
+	/*
+	 * Realloc with a proper allocator so that any cpumask update
+	 * can indifferently free the old version with kfree().
+	 */
 	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
+		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
+
+		if (WARN_ON_ONCE(!nmask))
+			return;
+
+		omask = rcu_dereference(housekeeping.cpumasks[type]);
+
 		/* We need at least one CPU to handle housekeeping work */
-		WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
+		WARN_ON_ONCE(cpumask_empty(omask));
+		cpumask_copy(nmask, omask);
+		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
+		memblock_free(omask, cpumask_size());
 	}
 }
 
 static void __init housekeeping_setup_type(enum hk_type type,
 					   cpumask_var_t housekeeping_staging)
 {
+	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
 
-	alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
-	cpumask_copy(housekeeping.cpumasks[type],
-		     housekeeping_staging);
+	cpumask_copy(mask, housekeeping_staging);
+	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
 }
 
 static int __init housekeeping_setup(char *str, unsigned long flags)
@@ -162,7 +176,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
 
 		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
 			if (!cpumask_equal(housekeeping_staging,
-					   housekeeping.cpumasks[type])) {
+					   housekeeping_cpumask(type))) {
 				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
 				goto free_housekeeping_staging;
 			}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1f5d07067f60..0c0ef8999fd6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -42,6 +42,7 @@
 #include <linux/ktime_api.h>
 #include <linux/lockdep_api.h>
 #include <linux/lockdep.h>
+#include <linux/memblock.h>
 #include <linux/minmax.h>
 #include <linux/mm.h>
 #include <linux/module.h>
-- 
2.51.0
Re: [PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers
Posted by Waiman Long 5 months, 2 weeks ago
On 10/13/25 4:31 PM, Frederic Weisbecker wrote:
> HK_TYPE_DOMAIN's cpumask will soon be made modifyable by cpuset.
> A synchronization mechanism is then needed to synchronize the updates
> with the housekeeping cpumask readers.
>
> Turn the housekeeping cpumasks into RCU pointers. Once a housekeeping
> cpumask will be modified, the update side will wait for an RCU grace
> period and propagate the change to interested subsystem when deemed
> necessary.
>
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> ---
>   kernel/sched/isolation.c | 58 +++++++++++++++++++++++++---------------
>   kernel/sched/sched.h     |  1 +
>   2 files changed, 37 insertions(+), 22 deletions(-)
>
> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
> index 8690fb705089..b46c20b5437f 100644
> --- a/kernel/sched/isolation.c
> +++ b/kernel/sched/isolation.c
> @@ -21,7 +21,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
>   EXPORT_SYMBOL_GPL(housekeeping_overridden);
>   
>   struct housekeeping {
> -	cpumask_var_t cpumasks[HK_TYPE_MAX];
> +	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
>   	unsigned long flags;
>   };
>   
> @@ -33,17 +33,28 @@ bool housekeeping_enabled(enum hk_type type)
>   }
>   EXPORT_SYMBOL_GPL(housekeeping_enabled);
>   
> +const struct cpumask *housekeeping_cpumask(enum hk_type type)
> +{
> +	if (static_branch_unlikely(&housekeeping_overridden)) {
> +		if (housekeeping.flags & BIT(type)) {
> +			return rcu_dereference_check(housekeeping.cpumasks[type], 1);
> +		}
> +	}
> +	return cpu_possible_mask;
> +}
> +EXPORT_SYMBOL_GPL(housekeeping_cpumask);
> +
>   int housekeeping_any_cpu(enum hk_type type)
>   {
>   	int cpu;
>   
>   	if (static_branch_unlikely(&housekeeping_overridden)) {
>   		if (housekeeping.flags & BIT(type)) {
> -			cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
> +			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
>   			if (cpu < nr_cpu_ids)
>   				return cpu;
>   
> -			cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
> +			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
>   			if (likely(cpu < nr_cpu_ids))
>   				return cpu;
>   			/*
> @@ -59,28 +70,18 @@ int housekeeping_any_cpu(enum hk_type type)
>   }
>   EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
>   
> -const struct cpumask *housekeeping_cpumask(enum hk_type type)
> -{
> -	if (static_branch_unlikely(&housekeeping_overridden))
> -		if (housekeeping.flags & BIT(type))
> -			return housekeeping.cpumasks[type];
> -	return cpu_possible_mask;
> -}
> -EXPORT_SYMBOL_GPL(housekeeping_cpumask);
> -
>   void housekeeping_affine(struct task_struct *t, enum hk_type type)
>   {
>   	if (static_branch_unlikely(&housekeeping_overridden))
>   		if (housekeeping.flags & BIT(type))
> -			set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
> +			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
>   }
>   EXPORT_SYMBOL_GPL(housekeeping_affine);
>   
>   bool housekeeping_test_cpu(int cpu, enum hk_type type)
>   {
> -	if (static_branch_unlikely(&housekeeping_overridden))
> -		if (housekeeping.flags & BIT(type))
> -			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
> +	if (housekeeping.flags & BIT(type))
> +		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
>   	return true;
>   }

The housekeeping_overridden static key check is kept in other places 
except this one. Should we keep it for consistency?

Cheers,
Longman
Re: [PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers
Posted by Frederic Weisbecker 5 months ago
Le Mon, Oct 20, 2025 at 11:49:53PM -0400, Waiman Long a écrit :
> >   bool housekeeping_test_cpu(int cpu, enum hk_type type)
> >   {
> > -	if (static_branch_unlikely(&housekeeping_overridden))
> > -		if (housekeeping.flags & BIT(type))
> > -			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
> > +	if (housekeeping.flags & BIT(type))
> > +		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
> >   	return true;
> >   }
> 
> The housekeeping_overridden static key check is kept in other places except
> this one. Should we keep it for consistency?

Indeed!

Thanks.

-- 
Frederic Weisbecker
SUSE Labs
Re: [PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers
Posted by Chen Ridong 5 months, 2 weeks ago

On 2025/10/14 4:31, Frederic Weisbecker wrote:
> HK_TYPE_DOMAIN's cpumask will soon be made modifyable by cpuset.
> A synchronization mechanism is then needed to synchronize the updates
> with the housekeeping cpumask readers.
> 
> Turn the housekeeping cpumasks into RCU pointers. Once a housekeeping
> cpumask will be modified, the update side will wait for an RCU grace
> period and propagate the change to interested subsystem when deemed
> necessary.
> 
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> ---
>  kernel/sched/isolation.c | 58 +++++++++++++++++++++++++---------------
>  kernel/sched/sched.h     |  1 +
>  2 files changed, 37 insertions(+), 22 deletions(-)
> 
> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
> index 8690fb705089..b46c20b5437f 100644
> --- a/kernel/sched/isolation.c
> +++ b/kernel/sched/isolation.c
> @@ -21,7 +21,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
>  EXPORT_SYMBOL_GPL(housekeeping_overridden);
>  
>  struct housekeeping {
> -	cpumask_var_t cpumasks[HK_TYPE_MAX];
> +	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
>  	unsigned long flags;
>  };
>  
> @@ -33,17 +33,28 @@ bool housekeeping_enabled(enum hk_type type)
>  }
>  EXPORT_SYMBOL_GPL(housekeeping_enabled);
>  
> +const struct cpumask *housekeeping_cpumask(enum hk_type type)
> +{
> +	if (static_branch_unlikely(&housekeeping_overridden)) {
> +		if (housekeeping.flags & BIT(type)) {
> +			return rcu_dereference_check(housekeeping.cpumasks[type], 1);
> +		}
> +	}
> +	return cpu_possible_mask;
> +}
> +EXPORT_SYMBOL_GPL(housekeeping_cpumask);
> +
>  int housekeeping_any_cpu(enum hk_type type)
>  {
>  	int cpu;
>  
>  	if (static_branch_unlikely(&housekeeping_overridden)) {
>  		if (housekeeping.flags & BIT(type)) {
> -			cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
> +			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
>  			if (cpu < nr_cpu_ids)
>  				return cpu;
>  
> -			cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
> +			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
>  			if (likely(cpu < nr_cpu_ids))
>  				return cpu;
>  			/*
> @@ -59,28 +70,18 @@ int housekeeping_any_cpu(enum hk_type type)
>  }
>  EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
>  
> -const struct cpumask *housekeeping_cpumask(enum hk_type type)
> -{
> -	if (static_branch_unlikely(&housekeeping_overridden))
> -		if (housekeeping.flags & BIT(type))
> -			return housekeeping.cpumasks[type];
> -	return cpu_possible_mask;
> -}
> -EXPORT_SYMBOL_GPL(housekeeping_cpumask);
> -
>  void housekeeping_affine(struct task_struct *t, enum hk_type type)
>  {
>  	if (static_branch_unlikely(&housekeeping_overridden))
>  		if (housekeeping.flags & BIT(type))
> -			set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
> +			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
>  }
>  EXPORT_SYMBOL_GPL(housekeeping_affine);
>  
>  bool housekeeping_test_cpu(int cpu, enum hk_type type)
>  {
> -	if (static_branch_unlikely(&housekeeping_overridden))
> -		if (housekeeping.flags & BIT(type))
> -			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
> +	if (housekeeping.flags & BIT(type))
> +		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
>  	return true;
>  }
>  EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
> @@ -96,20 +97,33 @@ void __init housekeeping_init(void)
>  
>  	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
>  		sched_tick_offload_init();
> -
> +	/*
> +	 * Realloc with a proper allocator so that any cpumask update
> +	 * can indifferently free the old version with kfree().
> +	 */
>  	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
> +		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
> +
> +		if (WARN_ON_ONCE(!nmask))
> +			return;
> +
> +		omask = rcu_dereference(housekeeping.cpumasks[type]);
> +
>  		/* We need at least one CPU to handle housekeeping work */
> -		WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
> +		WARN_ON_ONCE(cpumask_empty(omask));
> +		cpumask_copy(nmask, omask);
> +		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
> +		memblock_free(omask, cpumask_size());
>  	}
>  }
>  
>  static void __init housekeeping_setup_type(enum hk_type type,
>  					   cpumask_var_t housekeeping_staging)
>  {
> +	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
>  
> -	alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
> -	cpumask_copy(housekeeping.cpumasks[type],
> -		     housekeeping_staging);
> +	cpumask_copy(mask, housekeeping_staging);
> +	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
>  }
>  
>  static int __init housekeeping_setup(char *str, unsigned long flags)
> @@ -162,7 +176,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
>  
>  		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
>  			if (!cpumask_equal(housekeeping_staging,
> -					   housekeeping.cpumasks[type])) {
> +					   housekeeping_cpumask(type))) {
>  				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
>  				goto free_housekeeping_staging;
>  			}
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 1f5d07067f60..0c0ef8999fd6 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -42,6 +42,7 @@
>  #include <linux/ktime_api.h>
>  #include <linux/lockdep_api.h>
>  #include <linux/lockdep.h>
> +#include <linux/memblock.h>
>  #include <linux/minmax.h>
>  #include <linux/mm.h>
>  #include <linux/module.h>

A warning was detected:

=============================
WARNING: suspicious RCU usage
6.17.0-next-20251009-00033-g4444da88969b #808 Not tainted
-----------------------------
kernel/sched/isolation.c:60 suspicious rcu_dereference_check() usage!

other info that might help us debug this:


rcu_scheduler_active = 2, debug_locks = 1
1 lock held by swapper/0/1:
 #0: ffff888100600ce0 (&type->i_mutex_dir_key#3){++++}-{4:4}, at: walk_compone

stack backtrace:
CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-next-20251009-00033-g4
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239
Call Trace:
 <TASK>
 dump_stack_lvl+0x68/0xa0
 lockdep_rcu_suspicious+0x148/0x1b0
 housekeeping_cpumask+0xaa/0xb0
 housekeeping_test_cpu+0x25/0x40
 find_get_block_common+0x41/0x3e0
 bdev_getblk+0x28/0xa0
 ext4_getblk+0xba/0x2d0
 ext4_bread_batch+0x56/0x170
 __ext4_find_entry+0x17c/0x410
 ? lock_release+0xc6/0x290
 ext4_lookup+0x7a/0x1d0
 __lookup_slow+0xf9/0x1b0
 walk_component+0xe0/0x150
 link_path_walk+0x201/0x3e0
 path_openat+0xb1/0xb30
 ? stack_depot_save_flags+0x41e/0xa00
 do_filp_open+0xbc/0x170
 ? _raw_spin_unlock_irqrestore+0x2c/0x50
 ? __create_object+0x59/0x80
 ? trace_kmem_cache_alloc+0x1d/0xa0
 ? vprintk_emit+0x2b2/0x360
 do_open_execat+0x56/0x100
 alloc_bprm+0x1a/0x200
 ? __pfx_kernel_init+0x10/0x10
 kernel_execve+0x4b/0x160
 kernel_init+0xe5/0x1c0
 ret_from_fork+0x185/0x1d0
 ? __pfx_kernel_init+0x10/0x10
 ret_from_fork_asm+0x1a/0x30
 </TASK>
random: crng init done

-- 
Best regards,
Ridong
Re: [PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers
Posted by Waiman Long 5 months, 2 weeks ago
On 10/20/25 9:46 PM, Chen Ridong wrote:
>
> On 2025/10/14 4:31, Frederic Weisbecker wrote:
>> HK_TYPE_DOMAIN's cpumask will soon be made modifyable by cpuset.
>> A synchronization mechanism is then needed to synchronize the updates
>> with the housekeeping cpumask readers.
>>
>> Turn the housekeeping cpumasks into RCU pointers. Once a housekeeping
>> cpumask will be modified, the update side will wait for an RCU grace
>> period and propagate the change to interested subsystem when deemed
>> necessary.
>>
>> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
>> ---
>>   kernel/sched/isolation.c | 58 +++++++++++++++++++++++++---------------
>>   kernel/sched/sched.h     |  1 +
>>   2 files changed, 37 insertions(+), 22 deletions(-)
>>
>> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
>> index 8690fb705089..b46c20b5437f 100644
>> --- a/kernel/sched/isolation.c
>> +++ b/kernel/sched/isolation.c
>> @@ -21,7 +21,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
>>   EXPORT_SYMBOL_GPL(housekeeping_overridden);
>>   
>>   struct housekeeping {
>> -	cpumask_var_t cpumasks[HK_TYPE_MAX];
>> +	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
>>   	unsigned long flags;
>>   };
>>   
>> @@ -33,17 +33,28 @@ bool housekeeping_enabled(enum hk_type type)
>>   }
>>   EXPORT_SYMBOL_GPL(housekeeping_enabled);
>>   
>> +const struct cpumask *housekeeping_cpumask(enum hk_type type)
>> +{
>> +	if (static_branch_unlikely(&housekeeping_overridden)) {
>> +		if (housekeeping.flags & BIT(type)) {
>> +			return rcu_dereference_check(housekeeping.cpumasks[type], 1);
>> +		}
>> +	}
>> +	return cpu_possible_mask;
>> +}
>> +EXPORT_SYMBOL_GPL(housekeeping_cpumask);
>> +
>>   int housekeeping_any_cpu(enum hk_type type)
>>   {
>>   	int cpu;
>>   
>>   	if (static_branch_unlikely(&housekeeping_overridden)) {
>>   		if (housekeeping.flags & BIT(type)) {
>> -			cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
>> +			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
>>   			if (cpu < nr_cpu_ids)
>>   				return cpu;
>>   
>> -			cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
>> +			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
>>   			if (likely(cpu < nr_cpu_ids))
>>   				return cpu;
>>   			/*
>> @@ -59,28 +70,18 @@ int housekeeping_any_cpu(enum hk_type type)
>>   }
>>   EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
>>   
>> -const struct cpumask *housekeeping_cpumask(enum hk_type type)
>> -{
>> -	if (static_branch_unlikely(&housekeeping_overridden))
>> -		if (housekeeping.flags & BIT(type))
>> -			return housekeeping.cpumasks[type];
>> -	return cpu_possible_mask;
>> -}
>> -EXPORT_SYMBOL_GPL(housekeeping_cpumask);
>> -
>>   void housekeeping_affine(struct task_struct *t, enum hk_type type)
>>   {
>>   	if (static_branch_unlikely(&housekeeping_overridden))
>>   		if (housekeeping.flags & BIT(type))
>> -			set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
>> +			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
>>   }
>>   EXPORT_SYMBOL_GPL(housekeeping_affine);
>>   
>>   bool housekeeping_test_cpu(int cpu, enum hk_type type)
>>   {
>> -	if (static_branch_unlikely(&housekeeping_overridden))
>> -		if (housekeeping.flags & BIT(type))
>> -			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
>> +	if (housekeeping.flags & BIT(type))
>> +		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
>>   	return true;
>>   }
>>   EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
>> @@ -96,20 +97,33 @@ void __init housekeeping_init(void)
>>   
>>   	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
>>   		sched_tick_offload_init();
>> -
>> +	/*
>> +	 * Realloc with a proper allocator so that any cpumask update
>> +	 * can indifferently free the old version with kfree().
>> +	 */
>>   	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
>> +		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
>> +
>> +		if (WARN_ON_ONCE(!nmask))
>> +			return;
>> +
>> +		omask = rcu_dereference(housekeeping.cpumasks[type]);
>> +
>>   		/* We need at least one CPU to handle housekeeping work */
>> -		WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
>> +		WARN_ON_ONCE(cpumask_empty(omask));
>> +		cpumask_copy(nmask, omask);
>> +		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
>> +		memblock_free(omask, cpumask_size());
>>   	}
>>   }
>>   
>>   static void __init housekeeping_setup_type(enum hk_type type,
>>   					   cpumask_var_t housekeeping_staging)
>>   {
>> +	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
>>   
>> -	alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
>> -	cpumask_copy(housekeeping.cpumasks[type],
>> -		     housekeeping_staging);
>> +	cpumask_copy(mask, housekeeping_staging);
>> +	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
>>   }
>>   
>>   static int __init housekeeping_setup(char *str, unsigned long flags)
>> @@ -162,7 +176,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
>>   
>>   		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
>>   			if (!cpumask_equal(housekeeping_staging,
>> -					   housekeeping.cpumasks[type])) {
>> +					   housekeeping_cpumask(type))) {
>>   				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
>>   				goto free_housekeeping_staging;
>>   			}
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 1f5d07067f60..0c0ef8999fd6 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -42,6 +42,7 @@
>>   #include <linux/ktime_api.h>
>>   #include <linux/lockdep_api.h>
>>   #include <linux/lockdep.h>
>> +#include <linux/memblock.h>
>>   #include <linux/minmax.h>
>>   #include <linux/mm.h>
>>   #include <linux/module.h>
> A warning was detected:
>
> =============================
> WARNING: suspicious RCU usage
> 6.17.0-next-20251009-00033-g4444da88969b #808 Not tainted
> -----------------------------
> kernel/sched/isolation.c:60 suspicious rcu_dereference_check() usage!
>
> other info that might help us debug this:
>
>
> rcu_scheduler_active = 2, debug_locks = 1
> 1 lock held by swapper/0/1:
>   #0: ffff888100600ce0 (&type->i_mutex_dir_key#3){++++}-{4:4}, at: walk_compone
>
> stack backtrace:
> CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-next-20251009-00033-g4
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239
> Call Trace:
>   <TASK>
>   dump_stack_lvl+0x68/0xa0
>   lockdep_rcu_suspicious+0x148/0x1b0
>   housekeeping_cpumask+0xaa/0xb0
>   housekeeping_test_cpu+0x25/0x40
>   find_get_block_common+0x41/0x3e0
>   bdev_getblk+0x28/0xa0
>   ext4_getblk+0xba/0x2d0
>   ext4_bread_batch+0x56/0x170
>   __ext4_find_entry+0x17c/0x410
>   ? lock_release+0xc6/0x290
>   ext4_lookup+0x7a/0x1d0
>   __lookup_slow+0xf9/0x1b0
>   walk_component+0xe0/0x150
>   link_path_walk+0x201/0x3e0
>   path_openat+0xb1/0xb30
>   ? stack_depot_save_flags+0x41e/0xa00
>   do_filp_open+0xbc/0x170
>   ? _raw_spin_unlock_irqrestore+0x2c/0x50
>   ? __create_object+0x59/0x80
>   ? trace_kmem_cache_alloc+0x1d/0xa0
>   ? vprintk_emit+0x2b2/0x360
>   do_open_execat+0x56/0x100
>   alloc_bprm+0x1a/0x200
>   ? __pfx_kernel_init+0x10/0x10
>   kernel_execve+0x4b/0x160
>   kernel_init+0xe5/0x1c0
>   ret_from_fork+0x185/0x1d0
>   ? __pfx_kernel_init+0x10/0x10
>   ret_from_fork_asm+0x1a/0x30
>   </TASK>
> random: crng init done
>
It is because bh_lru_install() of fs/buffer.c calls cpu_is_isolated() 
without holding a rcu_read_lock. Will need to add a rcu_read_lock() there.

Cheers,
Longman
Re: [PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers
Posted by Frederic Weisbecker 5 months ago
Le Tue, Oct 21, 2025 at 12:03:05AM -0400, Waiman Long a écrit :
> On 10/20/25 9:46 PM, Chen Ridong wrote:
> > 
> > On 2025/10/14 4:31, Frederic Weisbecker wrote:
> > > HK_TYPE_DOMAIN's cpumask will soon be made modifyable by cpuset.
> > > A synchronization mechanism is then needed to synchronize the updates
> > > with the housekeeping cpumask readers.
> > > 
> > > Turn the housekeeping cpumasks into RCU pointers. Once a housekeeping
> > > cpumask will be modified, the update side will wait for an RCU grace
> > > period and propagate the change to interested subsystem when deemed
> > > necessary.
> > > 
> > > Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> > > ---
> > >   kernel/sched/isolation.c | 58 +++++++++++++++++++++++++---------------
> > >   kernel/sched/sched.h     |  1 +
> > >   2 files changed, 37 insertions(+), 22 deletions(-)
> > > 
> > > diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
> > > index 8690fb705089..b46c20b5437f 100644
> > > --- a/kernel/sched/isolation.c
> > > +++ b/kernel/sched/isolation.c
> > > @@ -21,7 +21,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
> > >   EXPORT_SYMBOL_GPL(housekeeping_overridden);
> > >   struct housekeeping {
> > > -	cpumask_var_t cpumasks[HK_TYPE_MAX];
> > > +	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
> > >   	unsigned long flags;
> > >   };
> > > @@ -33,17 +33,28 @@ bool housekeeping_enabled(enum hk_type type)
> > >   }
> > >   EXPORT_SYMBOL_GPL(housekeeping_enabled);
> > > +const struct cpumask *housekeeping_cpumask(enum hk_type type)
> > > +{
> > > +	if (static_branch_unlikely(&housekeeping_overridden)) {
> > > +		if (housekeeping.flags & BIT(type)) {
> > > +			return rcu_dereference_check(housekeeping.cpumasks[type], 1);
> > > +		}
> > > +	}
> > > +	return cpu_possible_mask;
> > > +}
> > > +EXPORT_SYMBOL_GPL(housekeeping_cpumask);
> > > +
> > >   int housekeeping_any_cpu(enum hk_type type)
> > >   {
> > >   	int cpu;
> > >   	if (static_branch_unlikely(&housekeeping_overridden)) {
> > >   		if (housekeeping.flags & BIT(type)) {
> > > -			cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
> > > +			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
> > >   			if (cpu < nr_cpu_ids)
> > >   				return cpu;
> > > -			cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
> > > +			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
> > >   			if (likely(cpu < nr_cpu_ids))
> > >   				return cpu;
> > >   			/*
> > > @@ -59,28 +70,18 @@ int housekeeping_any_cpu(enum hk_type type)
> > >   }
> > >   EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
> > > -const struct cpumask *housekeeping_cpumask(enum hk_type type)
> > > -{
> > > -	if (static_branch_unlikely(&housekeeping_overridden))
> > > -		if (housekeeping.flags & BIT(type))
> > > -			return housekeeping.cpumasks[type];
> > > -	return cpu_possible_mask;
> > > -}
> > > -EXPORT_SYMBOL_GPL(housekeeping_cpumask);
> > > -
> > >   void housekeeping_affine(struct task_struct *t, enum hk_type type)
> > >   {
> > >   	if (static_branch_unlikely(&housekeeping_overridden))
> > >   		if (housekeeping.flags & BIT(type))
> > > -			set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
> > > +			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
> > >   }
> > >   EXPORT_SYMBOL_GPL(housekeeping_affine);
> > >   bool housekeeping_test_cpu(int cpu, enum hk_type type)
> > >   {
> > > -	if (static_branch_unlikely(&housekeeping_overridden))
> > > -		if (housekeeping.flags & BIT(type))
> > > -			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
> > > +	if (housekeeping.flags & BIT(type))
> > > +		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
> > >   	return true;
> > >   }
> > >   EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
> > > @@ -96,20 +97,33 @@ void __init housekeeping_init(void)
> > >   	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
> > >   		sched_tick_offload_init();
> > > -
> > > +	/*
> > > +	 * Realloc with a proper allocator so that any cpumask update
> > > +	 * can indifferently free the old version with kfree().
> > > +	 */
> > >   	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
> > > +		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
> > > +
> > > +		if (WARN_ON_ONCE(!nmask))
> > > +			return;
> > > +
> > > +		omask = rcu_dereference(housekeeping.cpumasks[type]);
> > > +
> > >   		/* We need at least one CPU to handle housekeeping work */
> > > -		WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
> > > +		WARN_ON_ONCE(cpumask_empty(omask));
> > > +		cpumask_copy(nmask, omask);
> > > +		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
> > > +		memblock_free(omask, cpumask_size());
> > >   	}
> > >   }
> > >   static void __init housekeeping_setup_type(enum hk_type type,
> > >   					   cpumask_var_t housekeeping_staging)
> > >   {
> > > +	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
> > > -	alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
> > > -	cpumask_copy(housekeeping.cpumasks[type],
> > > -		     housekeeping_staging);
> > > +	cpumask_copy(mask, housekeeping_staging);
> > > +	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
> > >   }
> > >   static int __init housekeeping_setup(char *str, unsigned long flags)
> > > @@ -162,7 +176,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
> > >   		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
> > >   			if (!cpumask_equal(housekeeping_staging,
> > > -					   housekeeping.cpumasks[type])) {
> > > +					   housekeeping_cpumask(type))) {
> > >   				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
> > >   				goto free_housekeeping_staging;
> > >   			}
> > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > > index 1f5d07067f60..0c0ef8999fd6 100644
> > > --- a/kernel/sched/sched.h
> > > +++ b/kernel/sched/sched.h
> > > @@ -42,6 +42,7 @@
> > >   #include <linux/ktime_api.h>
> > >   #include <linux/lockdep_api.h>
> > >   #include <linux/lockdep.h>
> > > +#include <linux/memblock.h>
> > >   #include <linux/minmax.h>
> > >   #include <linux/mm.h>
> > >   #include <linux/module.h>
> > A warning was detected:
> > 
> > =============================
> > WARNING: suspicious RCU usage
> > 6.17.0-next-20251009-00033-g4444da88969b #808 Not tainted
> > -----------------------------
> > kernel/sched/isolation.c:60 suspicious rcu_dereference_check() usage!
> > 
> > other info that might help us debug this:
> > 
> > 
> > rcu_scheduler_active = 2, debug_locks = 1
> > 1 lock held by swapper/0/1:
> >   #0: ffff888100600ce0 (&type->i_mutex_dir_key#3){++++}-{4:4}, at: walk_compone
> > 
> > stack backtrace:
> > CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-next-20251009-00033-g4
> > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239
> > Call Trace:
> >   <TASK>
> >   dump_stack_lvl+0x68/0xa0
> >   lockdep_rcu_suspicious+0x148/0x1b0
> >   housekeeping_cpumask+0xaa/0xb0
> >   housekeeping_test_cpu+0x25/0x40
> >   find_get_block_common+0x41/0x3e0
> >   bdev_getblk+0x28/0xa0
> >   ext4_getblk+0xba/0x2d0
> >   ext4_bread_batch+0x56/0x170
> >   __ext4_find_entry+0x17c/0x410
> >   ? lock_release+0xc6/0x290
> >   ext4_lookup+0x7a/0x1d0
> >   __lookup_slow+0xf9/0x1b0
> >   walk_component+0xe0/0x150
> >   link_path_walk+0x201/0x3e0
> >   path_openat+0xb1/0xb30
> >   ? stack_depot_save_flags+0x41e/0xa00
> >   do_filp_open+0xbc/0x170
> >   ? _raw_spin_unlock_irqrestore+0x2c/0x50
> >   ? __create_object+0x59/0x80
> >   ? trace_kmem_cache_alloc+0x1d/0xa0
> >   ? vprintk_emit+0x2b2/0x360
> >   do_open_execat+0x56/0x100
> >   alloc_bprm+0x1a/0x200
> >   ? __pfx_kernel_init+0x10/0x10
> >   kernel_execve+0x4b/0x160
> >   kernel_init+0xe5/0x1c0
> >   ret_from_fork+0x185/0x1d0
> >   ? __pfx_kernel_init+0x10/0x10
> >   ret_from_fork_asm+0x1a/0x30
> >   </TASK>
> > random: crng init done
> > 
> It is because bh_lru_install() of fs/buffer.c calls cpu_is_isolated()
> without holding a rcu_read_lock. Will need to add a rcu_read_lock() there.

But this is called within bh_lru_lock() which should have either disabled
IRQs or preemption off. I would expect rcu_dereference_check() to automatically
verify those implied RCU read-side critical sections.

Let's see, lockdep_assert_in_rcu_reader() checks preemptible(), which is:

#define preemptible()	(preempt_count() == 0 && !irqs_disabled())

Ah but if !CONFIG_PREEMPT_COUNT:

#define preemptible()	0

Chen did you have !CONFIG_PREEMPT_COUNT ?

Probably lockdep_assert_in_rcu_reader() should be fixed accordingly and consider
preemption always disabled whenever !CONFIG_PREEMPT_COUNT. Let me check that...

Thanks.

-- 
Frederic Weisbecker
SUSE Labs
Re: [PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers
Posted by Chen Ridong 5 months ago

On 2025/11/1 0:17, Frederic Weisbecker wrote:
> Le Tue, Oct 21, 2025 at 12:03:05AM -0400, Waiman Long a écrit :
>> On 10/20/25 9:46 PM, Chen Ridong wrote:
>>>
>>> On 2025/10/14 4:31, Frederic Weisbecker wrote:
>>>> HK_TYPE_DOMAIN's cpumask will soon be made modifyable by cpuset.
>>>> A synchronization mechanism is then needed to synchronize the updates
>>>> with the housekeeping cpumask readers.
>>>>
>>>> Turn the housekeeping cpumasks into RCU pointers. Once a housekeeping
>>>> cpumask will be modified, the update side will wait for an RCU grace
>>>> period and propagate the change to interested subsystem when deemed
>>>> necessary.
>>>>
>>>> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
>>>> ---
>>>>   kernel/sched/isolation.c | 58 +++++++++++++++++++++++++---------------
>>>>   kernel/sched/sched.h     |  1 +
>>>>   2 files changed, 37 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
>>>> index 8690fb705089..b46c20b5437f 100644
>>>> --- a/kernel/sched/isolation.c
>>>> +++ b/kernel/sched/isolation.c
>>>> @@ -21,7 +21,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
>>>>   EXPORT_SYMBOL_GPL(housekeeping_overridden);
>>>>   struct housekeeping {
>>>> -	cpumask_var_t cpumasks[HK_TYPE_MAX];
>>>> +	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
>>>>   	unsigned long flags;
>>>>   };
>>>> @@ -33,17 +33,28 @@ bool housekeeping_enabled(enum hk_type type)
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(housekeeping_enabled);
>>>> +const struct cpumask *housekeeping_cpumask(enum hk_type type)
>>>> +{
>>>> +	if (static_branch_unlikely(&housekeeping_overridden)) {
>>>> +		if (housekeeping.flags & BIT(type)) {
>>>> +			return rcu_dereference_check(housekeeping.cpumasks[type], 1);
>>>> +		}
>>>> +	}
>>>> +	return cpu_possible_mask;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(housekeeping_cpumask);
>>>> +
>>>>   int housekeeping_any_cpu(enum hk_type type)
>>>>   {
>>>>   	int cpu;
>>>>   	if (static_branch_unlikely(&housekeeping_overridden)) {
>>>>   		if (housekeeping.flags & BIT(type)) {
>>>> -			cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
>>>> +			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
>>>>   			if (cpu < nr_cpu_ids)
>>>>   				return cpu;
>>>> -			cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
>>>> +			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
>>>>   			if (likely(cpu < nr_cpu_ids))
>>>>   				return cpu;
>>>>   			/*
>>>> @@ -59,28 +70,18 @@ int housekeeping_any_cpu(enum hk_type type)
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
>>>> -const struct cpumask *housekeeping_cpumask(enum hk_type type)
>>>> -{
>>>> -	if (static_branch_unlikely(&housekeeping_overridden))
>>>> -		if (housekeeping.flags & BIT(type))
>>>> -			return housekeeping.cpumasks[type];
>>>> -	return cpu_possible_mask;
>>>> -}
>>>> -EXPORT_SYMBOL_GPL(housekeeping_cpumask);
>>>> -
>>>>   void housekeeping_affine(struct task_struct *t, enum hk_type type)
>>>>   {
>>>>   	if (static_branch_unlikely(&housekeeping_overridden))
>>>>   		if (housekeeping.flags & BIT(type))
>>>> -			set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
>>>> +			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(housekeeping_affine);
>>>>   bool housekeeping_test_cpu(int cpu, enum hk_type type)
>>>>   {
>>>> -	if (static_branch_unlikely(&housekeeping_overridden))
>>>> -		if (housekeeping.flags & BIT(type))
>>>> -			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
>>>> +	if (housekeeping.flags & BIT(type))
>>>> +		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
>>>>   	return true;
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
>>>> @@ -96,20 +97,33 @@ void __init housekeeping_init(void)
>>>>   	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
>>>>   		sched_tick_offload_init();
>>>> -
>>>> +	/*
>>>> +	 * Realloc with a proper allocator so that any cpumask update
>>>> +	 * can indifferently free the old version with kfree().
>>>> +	 */
>>>>   	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
>>>> +		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
>>>> +
>>>> +		if (WARN_ON_ONCE(!nmask))
>>>> +			return;
>>>> +
>>>> +		omask = rcu_dereference(housekeeping.cpumasks[type]);
>>>> +
>>>>   		/* We need at least one CPU to handle housekeeping work */
>>>> -		WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
>>>> +		WARN_ON_ONCE(cpumask_empty(omask));
>>>> +		cpumask_copy(nmask, omask);
>>>> +		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
>>>> +		memblock_free(omask, cpumask_size());
>>>>   	}
>>>>   }
>>>>   static void __init housekeeping_setup_type(enum hk_type type,
>>>>   					   cpumask_var_t housekeeping_staging)
>>>>   {
>>>> +	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
>>>> -	alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
>>>> -	cpumask_copy(housekeeping.cpumasks[type],
>>>> -		     housekeeping_staging);
>>>> +	cpumask_copy(mask, housekeeping_staging);
>>>> +	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
>>>>   }
>>>>   static int __init housekeeping_setup(char *str, unsigned long flags)
>>>> @@ -162,7 +176,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
>>>>   		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
>>>>   			if (!cpumask_equal(housekeeping_staging,
>>>> -					   housekeeping.cpumasks[type])) {
>>>> +					   housekeeping_cpumask(type))) {
>>>>   				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
>>>>   				goto free_housekeeping_staging;
>>>>   			}
>>>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>>>> index 1f5d07067f60..0c0ef8999fd6 100644
>>>> --- a/kernel/sched/sched.h
>>>> +++ b/kernel/sched/sched.h
>>>> @@ -42,6 +42,7 @@
>>>>   #include <linux/ktime_api.h>
>>>>   #include <linux/lockdep_api.h>
>>>>   #include <linux/lockdep.h>
>>>> +#include <linux/memblock.h>
>>>>   #include <linux/minmax.h>
>>>>   #include <linux/mm.h>
>>>>   #include <linux/module.h>
>>> A warning was detected:
>>>
>>> =============================
>>> WARNING: suspicious RCU usage
>>> 6.17.0-next-20251009-00033-g4444da88969b #808 Not tainted
>>> -----------------------------
>>> kernel/sched/isolation.c:60 suspicious rcu_dereference_check() usage!
>>>
>>> other info that might help us debug this:
>>>
>>>
>>> rcu_scheduler_active = 2, debug_locks = 1
>>> 1 lock held by swapper/0/1:
>>>   #0: ffff888100600ce0 (&type->i_mutex_dir_key#3){++++}-{4:4}, at: walk_compone
>>>
>>> stack backtrace:
>>> CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-next-20251009-00033-g4
>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239
>>> Call Trace:
>>>   <TASK>
>>>   dump_stack_lvl+0x68/0xa0
>>>   lockdep_rcu_suspicious+0x148/0x1b0
>>>   housekeeping_cpumask+0xaa/0xb0
>>>   housekeeping_test_cpu+0x25/0x40
>>>   find_get_block_common+0x41/0x3e0
>>>   bdev_getblk+0x28/0xa0
>>>   ext4_getblk+0xba/0x2d0
>>>   ext4_bread_batch+0x56/0x170
>>>   __ext4_find_entry+0x17c/0x410
>>>   ? lock_release+0xc6/0x290
>>>   ext4_lookup+0x7a/0x1d0
>>>   __lookup_slow+0xf9/0x1b0
>>>   walk_component+0xe0/0x150
>>>   link_path_walk+0x201/0x3e0
>>>   path_openat+0xb1/0xb30
>>>   ? stack_depot_save_flags+0x41e/0xa00
>>>   do_filp_open+0xbc/0x170
>>>   ? _raw_spin_unlock_irqrestore+0x2c/0x50
>>>   ? __create_object+0x59/0x80
>>>   ? trace_kmem_cache_alloc+0x1d/0xa0
>>>   ? vprintk_emit+0x2b2/0x360
>>>   do_open_execat+0x56/0x100
>>>   alloc_bprm+0x1a/0x200
>>>   ? __pfx_kernel_init+0x10/0x10
>>>   kernel_execve+0x4b/0x160
>>>   kernel_init+0xe5/0x1c0
>>>   ret_from_fork+0x185/0x1d0
>>>   ? __pfx_kernel_init+0x10/0x10
>>>   ret_from_fork_asm+0x1a/0x30
>>>   </TASK>
>>> random: crng init done
>>>
>> It is because bh_lru_install() of fs/buffer.c calls cpu_is_isolated()
>> without holding a rcu_read_lock. Will need to add a rcu_read_lock() there.
> 
> But this is called within bh_lru_lock() which should have either disabled
> IRQs or preemption off. I would expect rcu_dereference_check() to automatically
> verify those implied RCU read-side critical sections.
> 
> Let's see, lockdep_assert_in_rcu_reader() checks preemptible(), which is:
> 
> #define preemptible()	(preempt_count() == 0 && !irqs_disabled())
> 
> Ah but if !CONFIG_PREEMPT_COUNT:
> 
> #define preemptible()	0
> 
> Chen did you have !CONFIG_PREEMPT_COUNT ?
> 
> Probably lockdep_assert_in_rcu_reader() should be fixed accordingly and consider
> preemption always disabled whenever !CONFIG_PREEMPT_COUNT. Let me check that...
> 
> Thanks.
> 

I compiled with CONFIG_PREEMPT_COUNT=y and CONFIG_SMP=y.

-- 
Best regards,
Ridong

Re: [PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers
Posted by Frederic Weisbecker 5 months ago
Le Mon, Nov 03, 2025 at 10:22:47AM +0800, Chen Ridong a écrit :
> 
> 
> On 2025/11/1 0:17, Frederic Weisbecker wrote:
> > Le Tue, Oct 21, 2025 at 12:03:05AM -0400, Waiman Long a écrit :
> >> On 10/20/25 9:46 PM, Chen Ridong wrote:
> >>>
> >>> On 2025/10/14 4:31, Frederic Weisbecker wrote:
> >>>> HK_TYPE_DOMAIN's cpumask will soon be made modifyable by cpuset.
> >>>> A synchronization mechanism is then needed to synchronize the updates
> >>>> with the housekeeping cpumask readers.
> >>>>
> >>>> Turn the housekeeping cpumasks into RCU pointers. Once a housekeeping
> >>>> cpumask will be modified, the update side will wait for an RCU grace
> >>>> period and propagate the change to interested subsystem when deemed
> >>>> necessary.
> >>>>
> >>>> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> >>>> ---
> >>>>   kernel/sched/isolation.c | 58 +++++++++++++++++++++++++---------------
> >>>>   kernel/sched/sched.h     |  1 +
> >>>>   2 files changed, 37 insertions(+), 22 deletions(-)
> >>>>
> >>>> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
> >>>> index 8690fb705089..b46c20b5437f 100644
> >>>> --- a/kernel/sched/isolation.c
> >>>> +++ b/kernel/sched/isolation.c
> >>>> @@ -21,7 +21,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
> >>>>   EXPORT_SYMBOL_GPL(housekeeping_overridden);
> >>>>   struct housekeeping {
> >>>> -	cpumask_var_t cpumasks[HK_TYPE_MAX];
> >>>> +	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
> >>>>   	unsigned long flags;
> >>>>   };
> >>>> @@ -33,17 +33,28 @@ bool housekeeping_enabled(enum hk_type type)
> >>>>   }
> >>>>   EXPORT_SYMBOL_GPL(housekeeping_enabled);
> >>>> +const struct cpumask *housekeeping_cpumask(enum hk_type type)
> >>>> +{
> >>>> +	if (static_branch_unlikely(&housekeeping_overridden)) {
> >>>> +		if (housekeeping.flags & BIT(type)) {
> >>>> +			return rcu_dereference_check(housekeeping.cpumasks[type], 1);
> >>>> +		}
> >>>> +	}
> >>>> +	return cpu_possible_mask;
> >>>> +}
> >>>> +EXPORT_SYMBOL_GPL(housekeeping_cpumask);
> >>>> +
> >>>>   int housekeeping_any_cpu(enum hk_type type)
> >>>>   {
> >>>>   	int cpu;
> >>>>   	if (static_branch_unlikely(&housekeeping_overridden)) {
> >>>>   		if (housekeeping.flags & BIT(type)) {
> >>>> -			cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
> >>>> +			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
> >>>>   			if (cpu < nr_cpu_ids)
> >>>>   				return cpu;
> >>>> -			cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
> >>>> +			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
> >>>>   			if (likely(cpu < nr_cpu_ids))
> >>>>   				return cpu;
> >>>>   			/*
> >>>> @@ -59,28 +70,18 @@ int housekeeping_any_cpu(enum hk_type type)
> >>>>   }
> >>>>   EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
> >>>> -const struct cpumask *housekeeping_cpumask(enum hk_type type)
> >>>> -{
> >>>> -	if (static_branch_unlikely(&housekeeping_overridden))
> >>>> -		if (housekeeping.flags & BIT(type))
> >>>> -			return housekeeping.cpumasks[type];
> >>>> -	return cpu_possible_mask;
> >>>> -}
> >>>> -EXPORT_SYMBOL_GPL(housekeeping_cpumask);
> >>>> -
> >>>>   void housekeeping_affine(struct task_struct *t, enum hk_type type)
> >>>>   {
> >>>>   	if (static_branch_unlikely(&housekeeping_overridden))
> >>>>   		if (housekeeping.flags & BIT(type))
> >>>> -			set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
> >>>> +			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
> >>>>   }
> >>>>   EXPORT_SYMBOL_GPL(housekeeping_affine);
> >>>>   bool housekeeping_test_cpu(int cpu, enum hk_type type)
> >>>>   {
> >>>> -	if (static_branch_unlikely(&housekeeping_overridden))
> >>>> -		if (housekeeping.flags & BIT(type))
> >>>> -			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
> >>>> +	if (housekeeping.flags & BIT(type))
> >>>> +		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
> >>>>   	return true;
> >>>>   }
> >>>>   EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
> >>>> @@ -96,20 +97,33 @@ void __init housekeeping_init(void)
> >>>>   	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
> >>>>   		sched_tick_offload_init();
> >>>> -
> >>>> +	/*
> >>>> +	 * Realloc with a proper allocator so that any cpumask update
> >>>> +	 * can indifferently free the old version with kfree().
> >>>> +	 */
> >>>>   	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
> >>>> +		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
> >>>> +
> >>>> +		if (WARN_ON_ONCE(!nmask))
> >>>> +			return;
> >>>> +
> >>>> +		omask = rcu_dereference(housekeeping.cpumasks[type]);
> >>>> +
> >>>>   		/* We need at least one CPU to handle housekeeping work */
> >>>> -		WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
> >>>> +		WARN_ON_ONCE(cpumask_empty(omask));
> >>>> +		cpumask_copy(nmask, omask);
> >>>> +		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
> >>>> +		memblock_free(omask, cpumask_size());
> >>>>   	}
> >>>>   }
> >>>>   static void __init housekeeping_setup_type(enum hk_type type,
> >>>>   					   cpumask_var_t housekeeping_staging)
> >>>>   {
> >>>> +	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
> >>>> -	alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
> >>>> -	cpumask_copy(housekeeping.cpumasks[type],
> >>>> -		     housekeeping_staging);
> >>>> +	cpumask_copy(mask, housekeeping_staging);
> >>>> +	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
> >>>>   }
> >>>>   static int __init housekeeping_setup(char *str, unsigned long flags)
> >>>> @@ -162,7 +176,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
> >>>>   		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
> >>>>   			if (!cpumask_equal(housekeeping_staging,
> >>>> -					   housekeeping.cpumasks[type])) {
> >>>> +					   housekeeping_cpumask(type))) {
> >>>>   				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
> >>>>   				goto free_housekeeping_staging;
> >>>>   			}
> >>>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> >>>> index 1f5d07067f60..0c0ef8999fd6 100644
> >>>> --- a/kernel/sched/sched.h
> >>>> +++ b/kernel/sched/sched.h
> >>>> @@ -42,6 +42,7 @@
> >>>>   #include <linux/ktime_api.h>
> >>>>   #include <linux/lockdep_api.h>
> >>>>   #include <linux/lockdep.h>
> >>>> +#include <linux/memblock.h>
> >>>>   #include <linux/minmax.h>
> >>>>   #include <linux/mm.h>
> >>>>   #include <linux/module.h>
> >>> A warning was detected:
> >>>
> >>> =============================
> >>> WARNING: suspicious RCU usage
> >>> 6.17.0-next-20251009-00033-g4444da88969b #808 Not tainted
> >>> -----------------------------
> >>> kernel/sched/isolation.c:60 suspicious rcu_dereference_check() usage!
> >>>
> >>> other info that might help us debug this:
> >>>
> >>>
> >>> rcu_scheduler_active = 2, debug_locks = 1
> >>> 1 lock held by swapper/0/1:
> >>>   #0: ffff888100600ce0 (&type->i_mutex_dir_key#3){++++}-{4:4}, at: walk_compone
> >>>
> >>> stack backtrace:
> >>> CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-next-20251009-00033-g4
> >>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239
> >>> Call Trace:
> >>>   <TASK>
> >>>   dump_stack_lvl+0x68/0xa0
> >>>   lockdep_rcu_suspicious+0x148/0x1b0
> >>>   housekeeping_cpumask+0xaa/0xb0
> >>>   housekeeping_test_cpu+0x25/0x40
> >>>   find_get_block_common+0x41/0x3e0
> >>>   bdev_getblk+0x28/0xa0
> >>>   ext4_getblk+0xba/0x2d0
> >>>   ext4_bread_batch+0x56/0x170
> >>>   __ext4_find_entry+0x17c/0x410
> >>>   ? lock_release+0xc6/0x290
> >>>   ext4_lookup+0x7a/0x1d0
> >>>   __lookup_slow+0xf9/0x1b0
> >>>   walk_component+0xe0/0x150
> >>>   link_path_walk+0x201/0x3e0
> >>>   path_openat+0xb1/0xb30
> >>>   ? stack_depot_save_flags+0x41e/0xa00
> >>>   do_filp_open+0xbc/0x170
> >>>   ? _raw_spin_unlock_irqrestore+0x2c/0x50
> >>>   ? __create_object+0x59/0x80
> >>>   ? trace_kmem_cache_alloc+0x1d/0xa0
> >>>   ? vprintk_emit+0x2b2/0x360
> >>>   do_open_execat+0x56/0x100
> >>>   alloc_bprm+0x1a/0x200
> >>>   ? __pfx_kernel_init+0x10/0x10
> >>>   kernel_execve+0x4b/0x160
> >>>   kernel_init+0xe5/0x1c0
> >>>   ret_from_fork+0x185/0x1d0
> >>>   ? __pfx_kernel_init+0x10/0x10
> >>>   ret_from_fork_asm+0x1a/0x30
> >>>   </TASK>
> >>> random: crng init done
> >>>
> >> It is because bh_lru_install() of fs/buffer.c calls cpu_is_isolated()
> >> without holding a rcu_read_lock. Will need to add a rcu_read_lock() there.
> > 
> > But this is called within bh_lru_lock() which should have either disabled
> > IRQs or preemption off. I would expect rcu_dereference_check() to automatically
> > verify those implied RCU read-side critical sections.
> > 
> > Let's see, lockdep_assert_in_rcu_reader() checks preemptible(), which is:
> > 
> > #define preemptible()	(preempt_count() == 0 && !irqs_disabled())
> > 
> > Ah but if !CONFIG_PREEMPT_COUNT:
> > 
> > #define preemptible()	0
> > 
> > Chen did you have !CONFIG_PREEMPT_COUNT ?
> > 
> > Probably lockdep_assert_in_rcu_reader() should be fixed accordingly and consider
> > preemption always disabled whenever !CONFIG_PREEMPT_COUNT. Let me check that...
> > 
> > Thanks.
> > 
> 
> I compiled with CONFIG_PREEMPT_COUNT=y and CONFIG_SMP=y.

Oh actually I think it's my fault. This should be fixed with this:

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 95d69c2102f6..b2cb75513336 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -56,8 +56,8 @@ static bool housekeeping_dereference_check(enum hk_type type)
 
 static inline struct cpumask *housekeeping_cpumask_dereference(enum hk_type type)
 {
-	return rcu_dereference_check(housekeeping.cpumasks[type],
-				     housekeeping_dereference_check(type));
+	return rcu_dereference_all_check(housekeeping.cpumasks[type],
+					 housekeeping_dereference_check(type));
 }
 
 const struct cpumask *housekeeping_cpumask(enum hk_type type)
Re: [PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers
Posted by Waiman Long 5 months ago
On 10/31/25 12:17 PM, Frederic Weisbecker wrote:
> Le Tue, Oct 21, 2025 at 12:03:05AM -0400, Waiman Long a écrit :
>> On 10/20/25 9:46 PM, Chen Ridong wrote:
>>> On 2025/10/14 4:31, Frederic Weisbecker wrote:
>>>> HK_TYPE_DOMAIN's cpumask will soon be made modifyable by cpuset.
>>>> A synchronization mechanism is then needed to synchronize the updates
>>>> with the housekeeping cpumask readers.
>>>>
>>>> Turn the housekeeping cpumasks into RCU pointers. Once a housekeeping
>>>> cpumask will be modified, the update side will wait for an RCU grace
>>>> period and propagate the change to interested subsystem when deemed
>>>> necessary.
>>>>
>>>> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
>>>> ---
>>>>    kernel/sched/isolation.c | 58 +++++++++++++++++++++++++---------------
>>>>    kernel/sched/sched.h     |  1 +
>>>>    2 files changed, 37 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
>>>> index 8690fb705089..b46c20b5437f 100644
>>>> --- a/kernel/sched/isolation.c
>>>> +++ b/kernel/sched/isolation.c
>>>> @@ -21,7 +21,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
>>>>    EXPORT_SYMBOL_GPL(housekeeping_overridden);
>>>>    struct housekeeping {
>>>> -	cpumask_var_t cpumasks[HK_TYPE_MAX];
>>>> +	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
>>>>    	unsigned long flags;
>>>>    };
>>>> @@ -33,17 +33,28 @@ bool housekeeping_enabled(enum hk_type type)
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(housekeeping_enabled);
>>>> +const struct cpumask *housekeeping_cpumask(enum hk_type type)
>>>> +{
>>>> +	if (static_branch_unlikely(&housekeeping_overridden)) {
>>>> +		if (housekeeping.flags & BIT(type)) {
>>>> +			return rcu_dereference_check(housekeeping.cpumasks[type], 1);
>>>> +		}
>>>> +	}
>>>> +	return cpu_possible_mask;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(housekeeping_cpumask);
>>>> +
>>>>    int housekeeping_any_cpu(enum hk_type type)
>>>>    {
>>>>    	int cpu;
>>>>    	if (static_branch_unlikely(&housekeeping_overridden)) {
>>>>    		if (housekeeping.flags & BIT(type)) {
>>>> -			cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
>>>> +			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
>>>>    			if (cpu < nr_cpu_ids)
>>>>    				return cpu;
>>>> -			cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
>>>> +			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
>>>>    			if (likely(cpu < nr_cpu_ids))
>>>>    				return cpu;
>>>>    			/*
>>>> @@ -59,28 +70,18 @@ int housekeeping_any_cpu(enum hk_type type)
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
>>>> -const struct cpumask *housekeeping_cpumask(enum hk_type type)
>>>> -{
>>>> -	if (static_branch_unlikely(&housekeeping_overridden))
>>>> -		if (housekeeping.flags & BIT(type))
>>>> -			return housekeeping.cpumasks[type];
>>>> -	return cpu_possible_mask;
>>>> -}
>>>> -EXPORT_SYMBOL_GPL(housekeeping_cpumask);
>>>> -
>>>>    void housekeeping_affine(struct task_struct *t, enum hk_type type)
>>>>    {
>>>>    	if (static_branch_unlikely(&housekeeping_overridden))
>>>>    		if (housekeeping.flags & BIT(type))
>>>> -			set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
>>>> +			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(housekeeping_affine);
>>>>    bool housekeeping_test_cpu(int cpu, enum hk_type type)
>>>>    {
>>>> -	if (static_branch_unlikely(&housekeeping_overridden))
>>>> -		if (housekeeping.flags & BIT(type))
>>>> -			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
>>>> +	if (housekeeping.flags & BIT(type))
>>>> +		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
>>>>    	return true;
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
>>>> @@ -96,20 +97,33 @@ void __init housekeeping_init(void)
>>>>    	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
>>>>    		sched_tick_offload_init();
>>>> -
>>>> +	/*
>>>> +	 * Realloc with a proper allocator so that any cpumask update
>>>> +	 * can indifferently free the old version with kfree().
>>>> +	 */
>>>>    	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
>>>> +		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
>>>> +
>>>> +		if (WARN_ON_ONCE(!nmask))
>>>> +			return;
>>>> +
>>>> +		omask = rcu_dereference(housekeeping.cpumasks[type]);
>>>> +
>>>>    		/* We need at least one CPU to handle housekeeping work */
>>>> -		WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
>>>> +		WARN_ON_ONCE(cpumask_empty(omask));
>>>> +		cpumask_copy(nmask, omask);
>>>> +		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
>>>> +		memblock_free(omask, cpumask_size());
>>>>    	}
>>>>    }
>>>>    static void __init housekeeping_setup_type(enum hk_type type,
>>>>    					   cpumask_var_t housekeeping_staging)
>>>>    {
>>>> +	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
>>>> -	alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
>>>> -	cpumask_copy(housekeeping.cpumasks[type],
>>>> -		     housekeeping_staging);
>>>> +	cpumask_copy(mask, housekeeping_staging);
>>>> +	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
>>>>    }
>>>>    static int __init housekeeping_setup(char *str, unsigned long flags)
>>>> @@ -162,7 +176,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
>>>>    		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
>>>>    			if (!cpumask_equal(housekeeping_staging,
>>>> -					   housekeeping.cpumasks[type])) {
>>>> +					   housekeeping_cpumask(type))) {
>>>>    				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
>>>>    				goto free_housekeeping_staging;
>>>>    			}
>>>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>>>> index 1f5d07067f60..0c0ef8999fd6 100644
>>>> --- a/kernel/sched/sched.h
>>>> +++ b/kernel/sched/sched.h
>>>> @@ -42,6 +42,7 @@
>>>>    #include <linux/ktime_api.h>
>>>>    #include <linux/lockdep_api.h>
>>>>    #include <linux/lockdep.h>
>>>> +#include <linux/memblock.h>
>>>>    #include <linux/minmax.h>
>>>>    #include <linux/mm.h>
>>>>    #include <linux/module.h>
>>> A warning was detected:
>>>
>>> =============================
>>> WARNING: suspicious RCU usage
>>> 6.17.0-next-20251009-00033-g4444da88969b #808 Not tainted
>>> -----------------------------
>>> kernel/sched/isolation.c:60 suspicious rcu_dereference_check() usage!
>>>
>>> other info that might help us debug this:
>>>
>>>
>>> rcu_scheduler_active = 2, debug_locks = 1
>>> 1 lock held by swapper/0/1:
>>>    #0: ffff888100600ce0 (&type->i_mutex_dir_key#3){++++}-{4:4}, at: walk_compone
>>>
>>> stack backtrace:
>>> CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-next-20251009-00033-g4
>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239
>>> Call Trace:
>>>    <TASK>
>>>    dump_stack_lvl+0x68/0xa0
>>>    lockdep_rcu_suspicious+0x148/0x1b0
>>>    housekeeping_cpumask+0xaa/0xb0
>>>    housekeeping_test_cpu+0x25/0x40
>>>    find_get_block_common+0x41/0x3e0
>>>    bdev_getblk+0x28/0xa0
>>>    ext4_getblk+0xba/0x2d0
>>>    ext4_bread_batch+0x56/0x170
>>>    __ext4_find_entry+0x17c/0x410
>>>    ? lock_release+0xc6/0x290
>>>    ext4_lookup+0x7a/0x1d0
>>>    __lookup_slow+0xf9/0x1b0
>>>    walk_component+0xe0/0x150
>>>    link_path_walk+0x201/0x3e0
>>>    path_openat+0xb1/0xb30
>>>    ? stack_depot_save_flags+0x41e/0xa00
>>>    do_filp_open+0xbc/0x170
>>>    ? _raw_spin_unlock_irqrestore+0x2c/0x50
>>>    ? __create_object+0x59/0x80
>>>    ? trace_kmem_cache_alloc+0x1d/0xa0
>>>    ? vprintk_emit+0x2b2/0x360
>>>    do_open_execat+0x56/0x100
>>>    alloc_bprm+0x1a/0x200
>>>    ? __pfx_kernel_init+0x10/0x10
>>>    kernel_execve+0x4b/0x160
>>>    kernel_init+0xe5/0x1c0
>>>    ret_from_fork+0x185/0x1d0
>>>    ? __pfx_kernel_init+0x10/0x10
>>>    ret_from_fork_asm+0x1a/0x30
>>>    </TASK>
>>> random: crng init done
>>>
>> It is because bh_lru_install() of fs/buffer.c calls cpu_is_isolated()
>> without holding a rcu_read_lock. Will need to add a rcu_read_lock() there.
> But this is called within bh_lru_lock() which should have either disabled
> IRQs or preemption off. I would expect rcu_dereference_check() to automatically
> verify those implied RCU read-side critical sections.
>
> Let's see, lockdep_assert_in_rcu_reader() checks preemptible(), which is:
>
> #define preemptible()	(preempt_count() == 0 && !irqs_disabled())
>
> Ah but if !CONFIG_PREEMPT_COUNT:
>
> #define preemptible()	0
>
> Chen did you have !CONFIG_PREEMPT_COUNT ?
>
> Probably lockdep_assert_in_rcu_reader() should be fixed accordingly and consider
> preemption always disabled whenever !CONFIG_PREEMPT_COUNT. Let me check that...

Yes, !CONFIG_PREEMPT_COUNT could be the problem here. I thought it was 
missing rcu_read_lock(), but I didn't really check the code to ensure that.

Cheers, Longman

Re: [PATCH 12/33] sched/isolation: Convert housekeeping cpumasks to rcu pointers
Posted by Chen Ridong 5 months, 2 weeks ago

On 2025/10/21 9:46, Chen Ridong wrote:
> 
> 
> On 2025/10/14 4:31, Frederic Weisbecker wrote:
>> HK_TYPE_DOMAIN's cpumask will soon be made modifyable by cpuset.
>> A synchronization mechanism is then needed to synchronize the updates
>> with the housekeeping cpumask readers.
>>
>> Turn the housekeeping cpumasks into RCU pointers. Once a housekeeping
>> cpumask will be modified, the update side will wait for an RCU grace
>> period and propagate the change to interested subsystem when deemed
>> necessary.
>>
>> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
>> ---
>>  kernel/sched/isolation.c | 58 +++++++++++++++++++++++++---------------
>>  kernel/sched/sched.h     |  1 +
>>  2 files changed, 37 insertions(+), 22 deletions(-)
>>
>> diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
>> index 8690fb705089..b46c20b5437f 100644
>> --- a/kernel/sched/isolation.c
>> +++ b/kernel/sched/isolation.c
>> @@ -21,7 +21,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
>>  EXPORT_SYMBOL_GPL(housekeeping_overridden);
>>  
>>  struct housekeeping {
>> -	cpumask_var_t cpumasks[HK_TYPE_MAX];
>> +	struct cpumask __rcu *cpumasks[HK_TYPE_MAX];
>>  	unsigned long flags;
>>  };
>>  
>> @@ -33,17 +33,28 @@ bool housekeeping_enabled(enum hk_type type)
>>  }
>>  EXPORT_SYMBOL_GPL(housekeeping_enabled);
>>  
>> +const struct cpumask *housekeeping_cpumask(enum hk_type type)
>> +{
>> +	if (static_branch_unlikely(&housekeeping_overridden)) {
>> +		if (housekeeping.flags & BIT(type)) {
>> +			return rcu_dereference_check(housekeeping.cpumasks[type], 1);
>> +		}
>> +	}
>> +	return cpu_possible_mask;
>> +}
>> +EXPORT_SYMBOL_GPL(housekeeping_cpumask);
>> +
>>  int housekeeping_any_cpu(enum hk_type type)
>>  {
>>  	int cpu;
>>  
>>  	if (static_branch_unlikely(&housekeeping_overridden)) {
>>  		if (housekeeping.flags & BIT(type)) {
>> -			cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
>> +			cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id());
>>  			if (cpu < nr_cpu_ids)
>>  				return cpu;
>>  
>> -			cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
>> +			cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask);
>>  			if (likely(cpu < nr_cpu_ids))
>>  				return cpu;
>>  			/*
>> @@ -59,28 +70,18 @@ int housekeeping_any_cpu(enum hk_type type)
>>  }
>>  EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
>>  
>> -const struct cpumask *housekeeping_cpumask(enum hk_type type)
>> -{
>> -	if (static_branch_unlikely(&housekeeping_overridden))
>> -		if (housekeeping.flags & BIT(type))
>> -			return housekeeping.cpumasks[type];
>> -	return cpu_possible_mask;
>> -}
>> -EXPORT_SYMBOL_GPL(housekeeping_cpumask);
>> -
>>  void housekeeping_affine(struct task_struct *t, enum hk_type type)
>>  {
>>  	if (static_branch_unlikely(&housekeeping_overridden))
>>  		if (housekeeping.flags & BIT(type))
>> -			set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
>> +			set_cpus_allowed_ptr(t, housekeeping_cpumask(type));
>>  }
>>  EXPORT_SYMBOL_GPL(housekeeping_affine);
>>  
>>  bool housekeeping_test_cpu(int cpu, enum hk_type type)
>>  {
>> -	if (static_branch_unlikely(&housekeeping_overridden))
>> -		if (housekeeping.flags & BIT(type))
>> -			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
>> +	if (housekeeping.flags & BIT(type))
>> +		return cpumask_test_cpu(cpu, housekeeping_cpumask(type));
>>  	return true;
>>  }
>>  EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
>> @@ -96,20 +97,33 @@ void __init housekeeping_init(void)
>>  
>>  	if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
>>  		sched_tick_offload_init();
>> -
>> +	/*
>> +	 * Realloc with a proper allocator so that any cpumask update
>> +	 * can indifferently free the old version with kfree().
>> +	 */
>>  	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
>> +		struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL);
>> +
>> +		if (WARN_ON_ONCE(!nmask))
>> +			return;
>> +
>> +		omask = rcu_dereference(housekeeping.cpumasks[type]);
>> +
>>  		/* We need at least one CPU to handle housekeeping work */
>> -		WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
>> +		WARN_ON_ONCE(cpumask_empty(omask));
>> +		cpumask_copy(nmask, omask);
>> +		RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask);
>> +		memblock_free(omask, cpumask_size());
>>  	}
>>  }
>>  
>>  static void __init housekeeping_setup_type(enum hk_type type,
>>  					   cpumask_var_t housekeeping_staging)
>>  {
>> +	struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
>>  
>> -	alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
>> -	cpumask_copy(housekeeping.cpumasks[type],
>> -		     housekeeping_staging);
>> +	cpumask_copy(mask, housekeeping_staging);
>> +	RCU_INIT_POINTER(housekeeping.cpumasks[type], mask);
>>  }
>>  
>>  static int __init housekeeping_setup(char *str, unsigned long flags)
>> @@ -162,7 +176,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
>>  
>>  		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
>>  			if (!cpumask_equal(housekeeping_staging,
>> -					   housekeeping.cpumasks[type])) {
>> +					   housekeeping_cpumask(type))) {
>>  				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
>>  				goto free_housekeeping_staging;
>>  			}
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 1f5d07067f60..0c0ef8999fd6 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -42,6 +42,7 @@
>>  #include <linux/ktime_api.h>
>>  #include <linux/lockdep_api.h>
>>  #include <linux/lockdep.h>
>> +#include <linux/memblock.h>
>>  #include <linux/minmax.h>
>>  #include <linux/mm.h>
>>  #include <linux/module.h>
> 
> A warning was detected:
> 
> =============================
> WARNING: suspicious RCU usage
> 6.17.0-next-20251009-00033-g4444da88969b #808 Not tainted
> -----------------------------
> kernel/sched/isolation.c:60 suspicious rcu_dereference_check() usage!
> 
> other info that might help us debug this:
> 
> 
> rcu_scheduler_active = 2, debug_locks = 1
> 1 lock held by swapper/0/1:
>  #0: ffff888100600ce0 (&type->i_mutex_dir_key#3){++++}-{4:4}, at: walk_compone
> 
> stack backtrace:
> CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-next-20251009-00033-g4
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239
> Call Trace:
>  <TASK>
>  dump_stack_lvl+0x68/0xa0
>  lockdep_rcu_suspicious+0x148/0x1b0
>  housekeeping_cpumask+0xaa/0xb0
>  housekeeping_test_cpu+0x25/0x40
>  find_get_block_common+0x41/0x3e0
>  bdev_getblk+0x28/0xa0
>  ext4_getblk+0xba/0x2d0
>  ext4_bread_batch+0x56/0x170
>  __ext4_find_entry+0x17c/0x410
>  ? lock_release+0xc6/0x290
>  ext4_lookup+0x7a/0x1d0
>  __lookup_slow+0xf9/0x1b0
>  walk_component+0xe0/0x150
>  link_path_walk+0x201/0x3e0
>  path_openat+0xb1/0xb30
>  ? stack_depot_save_flags+0x41e/0xa00
>  do_filp_open+0xbc/0x170
>  ? _raw_spin_unlock_irqrestore+0x2c/0x50
>  ? __create_object+0x59/0x80
>  ? trace_kmem_cache_alloc+0x1d/0xa0
>  ? vprintk_emit+0x2b2/0x360
>  do_open_execat+0x56/0x100
>  alloc_bprm+0x1a/0x200
>  ? __pfx_kernel_init+0x10/0x10
>  kernel_execve+0x4b/0x160
>  kernel_init+0xe5/0x1c0
>  ret_from_fork+0x185/0x1d0
>  ? __pfx_kernel_init+0x10/0x10
>  ret_from_fork_asm+0x1a/0x30
>  </TASK>
> random: crng init done
> 

This warning was likely introduced by patch 13, which added the housekeeping_dereference_check
condition, and is not caused by the current patch.

-- 
Best regards,
Ridong