On Mon, Dec 08, 2025 at 09:26:52AM +0000, K Prateek Nayak wrote:
> There is a developing pattern of using cpumask_and() followed by
> for_each_cpu_wrap() in the scheduler.
>
> To avoid a temporary variable and needlessly iterating over the cpumask
> twice - once for the cpumask_and() operation and the second in the
> for_each_cpu_wrap() loop - introduce a new for_each_cpu_and_wrap()
> helper to iterate over the common bits set on two bitfields starting at
> a specific position without needing an intermediate cpumask variable.
>
> Cc: Yury Norov <yury.norov@gmail.com>
> Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
> ---
> include/linux/cpumask.h | 20 ++++++++++++++++++++
> include/linux/find.h | 37 +++++++++++++++++++++++++++++++++++++
> 2 files changed, 57 insertions(+)
>
> diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
> index ff8f41ab7ce6..b7a984c2a7d5 100644
> --- a/include/linux/cpumask.h
> +++ b/include/linux/cpumask.h
> @@ -406,6 +406,26 @@ unsigned int cpumask_random(const struct cpumask *src)
> #define for_each_cpu_and(cpu, mask1, mask2) \
> for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
>
> +/**
> + * for_each_cpu_wrap - iterate over every cpu in both masks, starting at a
The kernel-doc name above should be for_each_cpu_and_wrap, to match the
macro being added below.

With that fixed,
Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
> + * specified location.
> + * @cpu: the (optionally unsigned) integer iterator
> + * @mask1: the first cpumask pointer
> + * @mask2: the second cpumask pointer
> + * @start: the start location
> + *
> + * This saves a temporary CPU mask in many places. It is equivalent to:
> + * struct cpumask tmp;
> + * cpumask_and(&tmp, &mask1, &mask2);
> + * for_each_cpu_wrap(cpu, &tmp, start)
> + * ...
> + *
> + * After the loop, cpu is >= nr_cpu_ids.
> + */
> +#define for_each_cpu_and_wrap(cpu, mask1, mask2, start) \
> + for_each_and_bit_wrap(cpu, cpumask_bits(mask1), cpumask_bits(mask2), \
> + small_cpumask_bits, start)
> +
> /**
> * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
> * those present in another.
> diff --git a/include/linux/find.h b/include/linux/find.h
> index 9d720ad92bc1..fff9e2d55e4d 100644
> --- a/include/linux/find.h
> +++ b/include/linux/find.h
> @@ -487,6 +487,29 @@ unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size,
> return bit < start ? bit : size;
> }
>
> +/* Helper for for_each_and_bit_wrap(). */
> +static __always_inline
> +unsigned long __for_each_and_wrap(const unsigned long *bitmap1, const unsigned long *bitmap2,
> + unsigned long size, unsigned long start, unsigned long n)
> +{
> + unsigned long bit;
> +
> + /* If not wrapped around */
> + if (n > start) {
> + /* and have a bit, just return it. */
> + bit = find_next_and_bit(bitmap1, bitmap2, size, n);
> + if (bit < size)
> + return bit;
> +
> + /* Otherwise, wrap around and ... */
> + n = 0;
> + }
> +
> + /* Search the other part. */
> + bit = find_next_and_bit(bitmap1, bitmap2, start, n);
> + return bit < start ? bit : size;
> +}
> +
> /**
> * find_next_clump8 - find next 8-bit clump with set bits in a memory region
> * @clump: location to store copy of found clump
> @@ -682,6 +705,20 @@ unsigned long find_next_bit_le(const void *addr, unsigned
> (bit) < (size); \
> (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))
>
> +/**
> + * for_each_and_bit_wrap - iterate over all set bits in (*addr1 & *addr2)
> + * starting from @start, and wrapping around the end of bitmap.
> + * @bit: offset for current iteration
> + * @addr1: address of first bitmap
> + * @addr2: address of second bitmask to and with the first
> + * @size: bitmap size in number of bits
> + * @start: Starting bit for bitmap traversing, wrapping around the bitmap end
> + */
> +#define for_each_and_bit_wrap(bit, addr1, addr2, size, start) \
> + for ((bit) = find_next_and_bit_wrap((addr1), (addr2), (size), (start)); \
> + (bit) < (size); \
> + (bit) = __for_each_and_wrap((addr1), (addr2), (size), (start), (bit) + 1))
> +
> /**
> * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
> * @start: bit offset to start search and to store the current iteration offset
> --
> 2.43.0