Allow preemption during IPI completion waiting to improve real-time performance

[PATCH 04/11] smp: Use on-stack cpumask in smp_call_function_many_cond

Posted by Chuyi Zhou 6 days, 11 hours ago

This patch uses an on-stack cpumask to replace the percpu cfd cpumask in
smp_call_function_many_cond(). alloc_cpumask_var() may fail when
CONFIG_CPUMASK_OFFSTACK is enabled. In such an extreme case, fall back to
cfd->cpumask. This is a preparation for the next patch.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
---
 kernel/smp.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index f572716c3c7d..35948afced2e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -805,11 +805,17 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 	int cpu, last_cpu, this_cpu = smp_processor_id();
 	struct call_function_data *cfd;
 	bool wait = scf_flags & SCF_WAIT;
+	bool preemptible_wait = true;
+	cpumask_var_t cpumask_stack;
+	struct cpumask *cpumask;
 	int nr_cpus = 0;
 	bool run_remote = false;
 
 	lockdep_assert_preemption_disabled();
 
+	if (!alloc_cpumask_var(&cpumask_stack, GFP_ATOMIC))
+		preemptible_wait = false;
+
 	/*
 	 * Can deadlock when called with interrupts disabled.
 	 * We allow cpu's that are not yet online though, as no one else can
@@ -831,15 +837,18 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 	/* Check if we need remote execution, i.e., any CPU excluding this one. */
 	if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) {
 		cfd = this_cpu_ptr(&cfd_data);
-		cpumask_and(cfd->cpumask, mask, cpu_online_mask);
-		__cpumask_clear_cpu(this_cpu, cfd->cpumask);
+
+		cpumask = preemptible_wait ? cpumask_stack : cfd->cpumask;
+
+		cpumask_and(cpumask, mask, cpu_online_mask);
+		__cpumask_clear_cpu(this_cpu, cpumask);
 
 		cpumask_clear(cfd->cpumask_ipi);
-		for_each_cpu(cpu, cfd->cpumask) {
+		for_each_cpu(cpu, cpumask) {
 			call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
 
 			if (cond_func && !cond_func(cpu, info)) {
-				__cpumask_clear_cpu(cpu, cfd->cpumask);
+				__cpumask_clear_cpu(cpu, cpumask);
 				continue;
 			}
 
@@ -890,13 +899,16 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 	}
 
 	if (run_remote && wait) {
-		for_each_cpu(cpu, cfd->cpumask) {
+		for_each_cpu(cpu, cpumask) {
 			call_single_data_t *csd;
 
 			csd = per_cpu_ptr(cfd->csd, cpu);
 			csd_lock_wait(csd);
 		}
 	}
+
+	if (preemptible_wait)
+		free_cpumask_var(cpumask_stack);
 }
 
 /**
-- 
2.20.1

Re: [PATCH 04/11] smp: Use on-stack cpumask in smp_call_function_many_cond

Posted by Peter Zijlstra 4 days, 13 hours ago

On Tue, Feb 03, 2026 at 07:23:54PM +0800, Chuyi Zhou wrote:
> This patch use on-stack cpumask to replace percpu cfd cpumask in
> smp_call_function_many_cond(). alloc_cpumask_var() may fail when
> CONFIG_CPUMASK_OFFSTACK is enabled. In such extreme case, fall back to
> cfd->cpumask. This is a preparation for the next patch.
> 
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
>  kernel/smp.c | 22 +++++++++++++++++-----
>  1 file changed, 17 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/smp.c b/kernel/smp.c
> index f572716c3c7d..35948afced2e 100644
> --- a/kernel/smp.c
> +++ b/kernel/smp.c
> @@ -805,11 +805,17 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
>  	int cpu, last_cpu, this_cpu = smp_processor_id();
>  	struct call_function_data *cfd;
>  	bool wait = scf_flags & SCF_WAIT;
> +	bool preemptible_wait = true;
> +	cpumask_var_t cpumask_stack;
> +	struct cpumask *cpumask;
>  	int nr_cpus = 0;
>  	bool run_remote = false;
>  
>  	lockdep_assert_preemption_disabled();
>  
> +	if (!alloc_cpumask_var(&cpumask_stack, GFP_ATOMIC))
> +		preemptible_wait = false;
> +
>  	/*
>  	 * Can deadlock when called with interrupts disabled.
>  	 * We allow cpu's that are not yet online though, as no one else can
> @@ -831,15 +837,18 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
>  	/* Check if we need remote execution, i.e., any CPU excluding this one. */
>  	if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) {
>  		cfd = this_cpu_ptr(&cfd_data);
> -		cpumask_and(cfd->cpumask, mask, cpu_online_mask);
> -		__cpumask_clear_cpu(this_cpu, cfd->cpumask);
> +
> +		cpumask = preemptible_wait ? cpumask_stack : cfd->cpumask;
> +
> +		cpumask_and(cpumask, mask, cpu_online_mask);
> +		__cpumask_clear_cpu(this_cpu, cpumask);
>  
>  		cpumask_clear(cfd->cpumask_ipi);
> -		for_each_cpu(cpu, cfd->cpumask) {
> +		for_each_cpu(cpu, cpumask) {
>  			call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
>  
>  			if (cond_func && !cond_func(cpu, info)) {
> -				__cpumask_clear_cpu(cpu, cfd->cpumask);
> +				__cpumask_clear_cpu(cpu, cpumask);
>  				continue;
>  			}
>  
> @@ -890,13 +899,16 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
>  	}
>  
>  	if (run_remote && wait) {
> -		for_each_cpu(cpu, cfd->cpumask) {
> +		for_each_cpu(cpu, cpumask) {
>  			call_single_data_t *csd;
>  
>  			csd = per_cpu_ptr(cfd->csd, cpu);
>  			csd_lock_wait(csd);
>  		}
>  	}
> +
> +	if (preemptible_wait)
> +		free_cpumask_var(cpumask_stack);
>  }

*sigh*, even if you don't break RT, this is quite terrible, what is
wrong with something like so?

---
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -802,19 +802,18 @@ static void smp_call_function_many_cond(
 					unsigned int scf_flags,
 					smp_cond_func_t cond_func)
 {
+	struct call_function_data *cfd = this_cpu_ptr(&cfd_data);
 	int cpu, last_cpu, this_cpu = smp_processor_id();
-	struct call_function_data *cfd;
+	struct cpumask *cpumask = cfd->cpumask;
 	bool wait = scf_flags & SCF_WAIT;
-	bool preemptible_wait = true;
 	cpumask_var_t cpumask_stack;
-	struct cpumask *cpumask;
 	int nr_cpus = 0;
 	bool run_remote = false;
 
 	lockdep_assert_preemption_disabled();
 
 	if (!alloc_cpumask_var(&cpumask_stack, GFP_ATOMIC))
-		preemptible_wait = false;
+		cpumask = cpumask_stack;
 
 	/*
 	 * Can deadlock when called with interrupts disabled.
@@ -836,10 +835,6 @@ static void smp_call_function_many_cond(
 
 	/* Check if we need remote execution, i.e., any CPU excluding this one. */
 	if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) {
-		cfd = this_cpu_ptr(&cfd_data);
-
-		cpumask = preemptible_wait ? cpumask_stack : cfd->cpumask;
-
 		cpumask_and(cpumask, mask, cpu_online_mask);
 		__cpumask_clear_cpu(this_cpu, cpumask);
 
@@ -907,8 +902,7 @@ static void smp_call_function_many_cond(
 		}
 	}
 
-	if (preemptible_wait)
-		free_cpumask_var(cpumask_stack);
+	free_cpumask_var(cpumask_stack);
 }
 
 /**

Re: [PATCH 04/11] smp: Use on-stack cpumask in smp_call_function_many_cond

Posted by Peter Zijlstra 4 days, 13 hours ago

On Tue, Feb 03, 2026 at 07:23:54PM +0800, Chuyi Zhou wrote:
> This patch use on-stack cpumask to replace percpu cfd cpumask in
> smp_call_function_many_cond(). alloc_cpumask_var() may fail when
> CONFIG_CPUMASK_OFFSTACK is enabled. In such extreme case, fall back to
> cfd->cpumask. This is a preparation for the next patch.
> 
> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
> ---
>  kernel/smp.c | 22 +++++++++++++++++-----
>  1 file changed, 17 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/smp.c b/kernel/smp.c
> index f572716c3c7d..35948afced2e 100644
> --- a/kernel/smp.c
> +++ b/kernel/smp.c
> @@ -805,11 +805,17 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
>  	int cpu, last_cpu, this_cpu = smp_processor_id();
>  	struct call_function_data *cfd;
>  	bool wait = scf_flags & SCF_WAIT;
> +	bool preemptible_wait = true;
> +	cpumask_var_t cpumask_stack;
> +	struct cpumask *cpumask;
>  	int nr_cpus = 0;
>  	bool run_remote = false;
>  
>  	lockdep_assert_preemption_disabled();
>  
> +	if (!alloc_cpumask_var(&cpumask_stack, GFP_ATOMIC))
> +		preemptible_wait = false;

IIRC this breaks RT; we must not allocate with preemption disabled.

Re: [PATCH 04/11] smp: Use on-stack cpumask in smp_call_function_many_cond

Posted by Chuyi Zhou 3 days, 14 hours ago

在 2026/2/5 17:44, Peter Zijlstra 写道:
> On Tue, Feb 03, 2026 at 07:23:54PM +0800, Chuyi Zhou wrote:
>> This patch use on-stack cpumask to replace percpu cfd cpumask in
>> smp_call_function_many_cond(). alloc_cpumask_var() may fail when
>> CONFIG_CPUMASK_OFFSTACK is enabled. In such extreme case, fall back to
>> cfd->cpumask. This is a preparation for the next patch.
>>
>> Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
>> ---
>>   kernel/smp.c | 22 +++++++++++++++++-----
>>   1 file changed, 17 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/smp.c b/kernel/smp.c
>> index f572716c3c7d..35948afced2e 100644
>> --- a/kernel/smp.c
>> +++ b/kernel/smp.c
>> @@ -805,11 +805,17 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
>>   	int cpu, last_cpu, this_cpu = smp_processor_id();
>>   	struct call_function_data *cfd;
>>   	bool wait = scf_flags & SCF_WAIT;
>> +	bool preemptible_wait = true;
>> +	cpumask_var_t cpumask_stack;
>> +	struct cpumask *cpumask;
>>   	int nr_cpus = 0;
>>   	bool run_remote = false;
>>   
>>   	lockdep_assert_preemption_disabled();
>>   
>> +	if (!alloc_cpumask_var(&cpumask_stack, GFP_ATOMIC))
>> +		preemptible_wait = false;
> 
> IIRC this breaks RT, must not allocate with preemption disabled.

Thank you for the reminder.

Perhaps another feasible approach is to only consider
CONFIG_CPUMASK_OFFSTACK=n.

Of course, if we used cpus_read_lock() and ensured that the caller’s context
is sleepable, this issue would also be eliminated.