Switch the implementation of cpumask_any_but() to cpumask_first_but().
Eliminate the need for looping and make cpumask_any_but() a macro that
wraps cpumask_first_but(), just like cpumask_any() does, to make it
more consistent.
The change brings some benefit in terms of vmlinux code size: for
NR_CPUS=64 it reduces the size by 78 bytes in total, and for
NR_CPUS=4096 it reduces the size by 2 bytes in total. The details are
shown in table [1].
A performance test was done using the test script [2]. Running the test
10000 times, the original implementation of cpumask_any_but() uses 19665287
nanoseconds in total, while the new version, which is a wrapper around
cpumask_first_but(), uses 19545574 nanoseconds. The difference is 119713
nanoseconds.
Co-developed-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Signed-off-by: I Hsin Cheng <richard120310@gmail.com>
---
[1]:
For NR_CPUS=64:
$ ./scripts/bloat-o-meter vmlinux_old vmlinux_new
add/remove: 0/0 grow/shrink: 3/7 up/down: 46/-124 (-78)
Function old new delta
irq_migrate_all_off_this_cpu 716 745 +29
try_to_unmap_one 3380 3391 +11
try_to_migrate_one 2451 2457 +6
uncore_event_cpu_offline 343 342 -1
calibrate_delay_is_known 236 235 -1
tsc_store_and_check_tsc_adjust 506 495 -11
arch_tlbbatch_flush 302 288 -14
tmigr_cpu_offline 406 382 -24
perf_event_exit_cpu_context 592 565 -27
flush_tlb_mm_range 1561 1515 -46
Total: Before=23390770, After=23390692, chg -0.00%
For NR_CPUS=4096:
$ ./scripts/bloat-o-meter vmlinux_old vmlinux_new
add/remove: 0/0 grow/shrink: 7/3 up/down: 136/-138 (-2)
Function old new delta
uncore_event_cpu_offline 291 333 +42
try_to_migrate_one 2378 2413 +35
flush_tlb_mm_range 1476 1503 +27
irq_migrate_all_off_this_cpu 741 754 +13
tmigr_cpu_offline 353 362 +9
calibrate_delay_is_known 183 192 +9
arch_tlbbatch_flush 296 297 +1
tsc_store_and_check_tsc_adjust 484 482 -2
perf_event_exit_cpu_context 546 528 -18
try_to_unmap_one 3560 3442 -118
Total: Before=23448698, After=23448696, chg -0.00%
[2]:
/*
 * Micro-benchmark for cpumask_any_but().
 *
 * Takes a private copy of cpu_online_mask, then calls cpumask_any_but()
 * 100000 times, each time excluding a randomly chosen CPU id below
 * nr_cpu_ids, and logs the total elapsed time in nanoseconds.
 *
 * Note that get_random_bytes() and the modulo sit inside the timed region,
 * so their cost is part of the reported total.
 *
 * Return: always 0.
 */
static int __init test_init(void)
{
	struct cpumask test_mask;
	ktime_t start_time, end_time;
	s64 elapsed_ns;
	unsigned long checksum = 0;
	unsigned int random_cpu;
	int i;

	cpumask_copy(&test_mask, cpu_online_mask);

	start_time = ktime_get();

	for (i = 0; i < 100000; i++) {
		get_random_bytes(&random_cpu, sizeof(random_cpu));
		random_cpu = random_cpu % nr_cpu_ids;
		/*
		 * Fold every return value into a checksum that is printed
		 * below, so the call under test has an observable result and
		 * cannot be discarded by the compiler as dead code.
		 */
		checksum += cpumask_any_but(&test_mask, random_cpu);
	}

	end_time = ktime_get();

	elapsed_ns = ktime_to_ns(ktime_sub(end_time, start_time));

	pr_info("Total time: %lld ns\n", elapsed_ns);
	pr_info("Checksum: %lu\n", checksum);

	return 0;
}
The test is run as a kernel module.
The test machine runs Ubuntu 24.04 on x86_64 with kernel version
v6.14.0; the CPU is an AMD Ryzen 7 5700X3D 8-Core Processor.
Best regards,
I Hsin Cheng
---
include/linux/cpumask.h | 22 ++++++++++++++++------
1 file changed, 16 insertions(+), 6 deletions(-)
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index f9a868384083..d91630a97d76 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -408,22 +408,22 @@ unsigned int cpumask_next_wrap(int n, const struct cpumask *src)
for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits)
/**
- * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one.
+ * cpumask_first_but - return the first cpu in a cpumask, but not this one.
* @mask: the cpumask to search
* @cpu: the cpu to ignore.
*
- * Often used to find any cpu but smp_processor_id() in a mask.
+ * Often used to find the first cpu but smp_processor_id() in a mask.
* Return: >= nr_cpu_ids if no cpus set.
*/
static __always_inline
-unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
+unsigned int cpumask_first_but(const struct cpumask *mask, unsigned int cpu)
{
unsigned int i;
cpumask_check(cpu);
- for_each_cpu(i, mask)
- if (i != cpu)
- break;
+ i = cpumask_first(mask);
+ if (i == cpu)
+ i = cpumask_next(i, mask);
return i;
}
@@ -864,6 +864,16 @@ void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp)
*/
#define cpumask_any(srcp) cpumask_first(srcp)
+/**
+ * cpumask_any_but - pick an arbitrary cpu from *srcp but not the given cpu
+ * @srcp: the input cpumask
+ * @cpu: the cpu to ignore
+ *
+ * Often used to find any cpu but smp_processor_id() in a mask.
+ * Return: >= nr_cpu_ids if no cpus set.
+ */
+#define cpumask_any_but(srcp, cpu) cpumask_first_but(srcp, cpu)
+
/**
* cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2
* @mask1: the first input cpumask
--
2.43.0
On Fri, Jun 13, 2025 at 11:34:46AM +0800, I Hsin Cheng wrote: > Switch the implementation of cpumask_any_but() to cpumask_first_but(). > Elimate the need to looping and make cpumask_any_but() a macro to wrap > around cpumask_first_but(), just like cpumask_any() does, to make it > more consistence. > > The change brings some benefit in terms of code size shrinking of > vmlinux, for NR_CPUS=64, it reduce 78 bytes in total, for > NR_CPUS=4096, it reduce 2 bytes in total. The details are shown in the > table [1]. > > Performance test is done using the test script [2]. Running the test for > 10000 times, the origin implementation of cpumask_any_but() use 19665287 > nanoseconds in total, the new version of it, which is a wrapper around > cpumask_first_but(), uses 19545574 nanoseconds. The difference is 119713 > nanoseconds. > > Co-developed-by: Yu-Chun Lin <eleanor15x@gmail.com> > Signed-off-by: Yu-Chun Lin <eleanor15x@gmail.com> > Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com> This looks like an invalid SoB chain. In our private discussion, I did agree that you're free to build upon my previous patch for further development. Since this patch is almost identical to mine, I assume you intended to include me and Yu-Chun as co-authors. If that's the case, you should also add a Co-developed-by: tag for me. It might also be helpful to add a Link: tag pointing to my original patch for better traceability. 
https://docs.kernel.org/process/submitting-patches.html#when-to-use-acked-by-cc-and-co-developed-by Regards, Kuan-Wei > Signed-off-by: I Hsin Cheng <richard120310@gmail.com> > --- > [1]: > > For NR_CPUS=64: > $ ./scripts/bloat-o-meter vmlinux_old vmlinux_new > add/remove: 0/0 grow/shrink: 3/7 up/down: 46/-124 (-78) > Function old new delta > irq_migrate_all_off_this_cpu 716 745 +29 > try_to_unmap_one 3380 3391 +11 > try_to_migrate_one 2451 2457 +6 > uncore_event_cpu_offline 343 342 -1 > calibrate_delay_is_known 236 235 -1 > tsc_store_and_check_tsc_adjust 506 495 -11 > arch_tlbbatch_flush 302 288 -14 > tmigr_cpu_offline 406 382 -24 > perf_event_exit_cpu_context 592 565 -27 > flush_tlb_mm_range 1561 1515 -46 > Total: Before=23390770, After=23390692, chg -0.00% > > For NR_CPUS=4096: > $ ./scripts/bloat-o-meter vmlinux_old vmlinux_new > add/remove: 0/0 grow/shrink: 7/3 up/down: 136/-138 (-2) > Function old new delta > uncore_event_cpu_offline 291 333 +42 > try_to_migrate_one 2378 2413 +35 > flush_tlb_mm_range 1476 1503 +27 > irq_migrate_all_off_this_cpu 741 754 +13 > tmigr_cpu_offline 353 362 +9 > calibrate_delay_is_known 183 192 +9 > arch_tlbbatch_flush 296 297 +1 > tsc_store_and_check_tsc_adjust 484 482 -2 > perf_event_exit_cpu_context 546 528 -18 > try_to_unmap_one 3560 3442 -118 > Total: Before=23448698, After=23448696, chg -0.00% > > [2]: > static int __init test_init(void) > { > struct cpumask test_mask; > ktime_t start_time, end_time; > s64 elapsed_ns; > unsigned int result; > unsigned int random_cpu; > int i; > > cpumask_copy(&test_mask, cpu_online_mask); > > start_time = ktime_get(); > > for (i = 0; i < 100000; i++) { > get_random_bytes(&random_cpu, sizeof(random_cpu)); > random_cpu = random_cpu % nr_cpu_ids; > result = cpumask_any_but(&test_mask, random_cpu); > } > > end_time = ktime_get(); > > elapsed_ns = ktime_to_ns(ktime_sub(end_time, start_time)); > > pr_info("Total time: %lld ns\n", elapsed_ns); > > return 0; > } > > The test is running in the form of 
kernel module. > The test machine is running ubuntu 24.04 on x86_64 machine with kernel > version of v6.14.0, CPU type is AMD Ryzen 7 5700X3D 8-Core Processor. > > Best regards, > I Hsin Cheng > --- > include/linux/cpumask.h | 22 ++++++++++++++++------ > 1 file changed, 16 insertions(+), 6 deletions(-) > > diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h > index f9a868384083..d91630a97d76 100644 > --- a/include/linux/cpumask.h > +++ b/include/linux/cpumask.h > @@ -408,22 +408,22 @@ unsigned int cpumask_next_wrap(int n, const struct cpumask *src) > for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits) > > /** > - * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one. > + * cpumask_first_but - return the first cpu in a cpumask, but not this one. > * @mask: the cpumask to search > * @cpu: the cpu to ignore. > * > - * Often used to find any cpu but smp_processor_id() in a mask. > + * Often used to find the first cpu but smp_processor_id() in a mask. > * Return: >= nr_cpu_ids if no cpus set. > */ > static __always_inline > -unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) > +unsigned int cpumask_first_but(const struct cpumask *mask, unsigned int cpu) > { > unsigned int i; > > cpumask_check(cpu); > - for_each_cpu(i, mask) > - if (i != cpu) > - break; > + i = cpumask_first(mask); > + if (i == cpu) > + i = cpumask_next(i, mask); > return i; > } > > @@ -864,6 +864,16 @@ void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) > */ > #define cpumask_any(srcp) cpumask_first(srcp) > > +/** > + * cpumask_any_but - pick an arbitrary cpu from *srcp but not the given cpu > + * @srcp: the input cpumask > + * @cpu: the cpu to ignore > + * > + * Often used to find any cpu but smp_processor_id() in a mask. > + * Return: >= nr_cpu_ids if no cpus set. 
> + */ > +#define cpumask_any_but(srcp, cpu) cpumask_first_but(srcp, cpu) > + > /** > * cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2 > * @mask1: the first input cpumask > -- > 2.43.0 >
© 2016 - 2025 Red Hat, Inc.