[patch 07/19] cpumask: Introduce cpumask_or_weight()

Thomas Gleixner posted 19 patches 2 months ago
There is a newer version of this series
[patch 07/19] cpumask: Introduce cpumask_or_weight()
Posted by Thomas Gleixner 2 months ago
CID management OR's two cpumasks and then calculates the weight on the
result. That's inefficient as that has to walk the same stuff twice. As
this is done with runqueue lock held, there is a real benefit of speeding
this up.

Provide cpumask_or_weight() and the corresponding bitmap functions which
return the weight of the OR result right away.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h  |   15 +++++++++++++++
 include/linux/cpumask.h |   16 ++++++++++++++++
 lib/bitmap.c            |   17 +++++++++++++++++
 3 files changed, 48 insertions(+)

--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -45,6 +45,7 @@ struct device;
  *  bitmap_copy(dst, src, nbits)                *dst = *src
  *  bitmap_and(dst, src1, src2, nbits)          *dst = *src1 & *src2
  *  bitmap_or(dst, src1, src2, nbits)           *dst = *src1 | *src2
+ *  bitmap_or_weight(dst, src1, src2, nbits)    *dst = *src1 | *src2. Returns Hamming Weight of dst
  *  bitmap_xor(dst, src1, src2, nbits)          *dst = *src1 ^ *src2
  *  bitmap_andnot(dst, src1, src2, nbits)       *dst = *src1 & ~(*src2)
  *  bitmap_complement(dst, src, nbits)          *dst = ~(*src)
@@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, co
 		 const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int nbits);
+unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
+				const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
 		  const unsigned long *bitmap2, unsigned int nbits);
 bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
@@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const
 }
 
 static __always_inline
+unsigned int bitmap_or_weight(unsigned long *dst, const unsigned long *src1,
+			      const unsigned long *src2, unsigned int nbits)
+{
+	if (small_const_nbits(nbits)) {
+		*dst = *src1 | *src2;
+		return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits));
+	} else {
+		return __bitmap_or_weight(dst, src1, src2, nbits);
+	}
+}
+
+static __always_inline
 void bitmap_xor(unsigned long *dst, const unsigned long *src1,
 		const unsigned long *src2, unsigned int nbits)
 {
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -729,6 +729,22 @@ void cpumask_or(struct cpumask *dstp, co
 }
 
 /**
+ * cpumask_or_weight - *dstp = *src1p | *src2p and return the weight of the result
+ * @dstp: the cpumask result
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Return: The number of bits set in the resulting cpumask @dstp
+ */
+static __always_inline
+unsigned int cpumask_or_weight(struct cpumask *dstp, const struct cpumask *src1p,
+			       const struct cpumask *src2p)
+{
+	return bitmap_or_weight(cpumask_bits(dstp), cpumask_bits(src1p),
+				cpumask_bits(src2p), small_cpumask_bits);
+}
+
+/**
  * cpumask_xor - *dstp = *src1p ^ *src2p
  * @dstp: the cpumask result
  * @src1p: the first input
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -253,6 +253,23 @@ void __bitmap_or(unsigned long *dst, con
 }
 EXPORT_SYMBOL(__bitmap_or);
 
+unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
+				const unsigned long *bitmap2, unsigned int bits)
+{
+	unsigned int k, w = 0;
+
+	for (k = 0; k < bits / BITS_PER_LONG; k++) {
+		dst[k] = bitmap1[k] | bitmap2[k];
+		w += hweight_long(dst[k]);
+	}
+
+	if (bits % BITS_PER_LONG) {
+		dst[k] = bitmap1[k] | bitmap2[k];
+		w += hweight_long(dst[k] & BITMAP_LAST_WORD_MASK(bits));
+	}
+	return w;
+}
+
 void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
 				const unsigned long *bitmap2, unsigned int bits)
 {
Re: [patch 07/19] cpumask: Introduce cpumask_or_weight()
Posted by Yury Norov 2 months ago
Hi Tomas,

On Wed, Oct 15, 2025 at 07:29:36PM +0200, Thomas Gleixner wrote:
> CID management OR's two cpumasks and then calculates the weight on the
> result. That's inefficient as that has to walk the same stuff twice. As
> this is done with runqueue lock held, there is a real benefit of speeding
> this up.
> 
> Provide cpumask_or_weight() and the corresponding bitmap functions which
> return the weight of the OR result right away.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: Yury Norov <yury.norov@gmail.com>
> ---
>  include/linux/bitmap.h  |   15 +++++++++++++++
>  include/linux/cpumask.h |   16 ++++++++++++++++
>  lib/bitmap.c            |   17 +++++++++++++++++
>  3 files changed, 48 insertions(+)
> 
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -45,6 +45,7 @@ struct device;
>   *  bitmap_copy(dst, src, nbits)                *dst = *src
>   *  bitmap_and(dst, src1, src2, nbits)          *dst = *src1 & *src2
>   *  bitmap_or(dst, src1, src2, nbits)           *dst = *src1 | *src2
> + *  bitmap_or_weight(dst, src1, src2, nbits)    *dst = *src1 | *src2. Returns Hamming Weight of dst
>   *  bitmap_xor(dst, src1, src2, nbits)          *dst = *src1 ^ *src2
>   *  bitmap_andnot(dst, src1, src2, nbits)       *dst = *src1 & ~(*src2)
>   *  bitmap_complement(dst, src, nbits)          *dst = ~(*src)
> @@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, co
>  		 const unsigned long *bitmap2, unsigned int nbits);
>  void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
>  		 const unsigned long *bitmap2, unsigned int nbits);
> +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
> +				const unsigned long *bitmap2, unsigned int nbits);
>  void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
>  		  const unsigned long *bitmap2, unsigned int nbits);
>  bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
> @@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const
>  }
>  
>  static __always_inline
> +unsigned int bitmap_or_weight(unsigned long *dst, const unsigned long *src1,
> +			      const unsigned long *src2, unsigned int nbits)
> +{
> +	if (small_const_nbits(nbits)) {
> +		*dst = *src1 | *src2;
> +		return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits));
> +	} else {
> +		return __bitmap_or_weight(dst, src1, src2, nbits);
> +	}
> +}
> +
> +static __always_inline
>  void bitmap_xor(unsigned long *dst, const unsigned long *src1,
>  		const unsigned long *src2, unsigned int nbits)
>  {
> --- a/include/linux/cpumask.h
> +++ b/include/linux/cpumask.h
> @@ -729,6 +729,22 @@ void cpumask_or(struct cpumask *dstp, co
>  }
>  
>  /**
> + * cpumask_or_weight - *dstp = *src1p | *src2p and return the weight of the result
> + * @dstp: the cpumask result
> + * @src1p: the first input
> + * @src2p: the second input
> + *
> + * Return: The number of bits set in the resulting cpumask @dstp
> + */
> +static __always_inline
> +unsigned int cpumask_or_weight(struct cpumask *dstp, const struct cpumask *src1p,
> +			       const struct cpumask *src2p)
> +{
> +	return bitmap_or_weight(cpumask_bits(dstp), cpumask_bits(src1p),
> +				cpumask_bits(src2p), small_cpumask_bits);
> +}
> +
> +/**
>   * cpumask_xor - *dstp = *src1p ^ *src2p
>   * @dstp: the cpumask result
>   * @src1p: the first input
> --- a/lib/bitmap.c
> +++ b/lib/bitmap.c
> @@ -253,6 +253,23 @@ void __bitmap_or(unsigned long *dst, con
>  }
>  EXPORT_SYMBOL(__bitmap_or);
>  
> +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
> +				const unsigned long *bitmap2, unsigned int bits)
> +{
> +	unsigned int k, w = 0;
> +
> +	for (k = 0; k < bits / BITS_PER_LONG; k++) {
> +		dst[k] = bitmap1[k] | bitmap2[k];
> +		w += hweight_long(dst[k]);
> +	}
> +
> +	if (bits % BITS_PER_LONG) {
> +		dst[k] = bitmap1[k] | bitmap2[k];
> +		w += hweight_long(dst[k] & BITMAP_LAST_WORD_MASK(bits));
> +	}
> +	return w;
> +}

We've got bitmap_weight_and() and bitmap_weight_andnot() already. Can
you align naming with the existing scheme: bitmap_weight_or().

Also, for outline implementation, can you employ the BITMAP_WEIGHT()
macro?

Thanks,
Yury
Re: [patch 07/19] cpumask: Introduce cpumask_or_weight()
Posted by Thomas Gleixner 1 month, 4 weeks ago
Yury!

On Wed, Oct 15 2025 at 13:41, Yury Norov wrote:
> On Wed, Oct 15, 2025 at 07:29:36PM +0200, Thomas Gleixner wrote:
>> +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
>> +				const unsigned long *bitmap2, unsigned int bits)
>> +{
>> +	unsigned int k, w = 0;
>> +
>> +	for (k = 0; k < bits / BITS_PER_LONG; k++) {
>> +		dst[k] = bitmap1[k] | bitmap2[k];
>> +		w += hweight_long(dst[k]);
>> +	}
>> +
>> +	if (bits % BITS_PER_LONG) {
>> +		dst[k] = bitmap1[k] | bitmap2[k];
>> +		w += hweight_long(dst[k] & BITMAP_LAST_WORD_MASK(bits));
>> +	}
>> +	return w;
>> +}
>
> We've got bitmap_weight_and() and bitmap_weight_andnot() already. Can
> you align naming with the existing scheme: bitmap_weight_or().

That's not the same thing. bitmap_weight_and/not() calculate the weight
of the AND resp. ANDNOT of the two bitmaps w/o modifying them:

   for (...)
       w += hweight(map1[k] & map2[k]);

While the above does:

   for (...) {
       dst[k] = map1[k] | map2[k];
       w += hweight(dst[k]);
   }

The whole point of this as explained in the change log is to avoid
walking the resulting bitmap after doing the OR operation. The compiler
is clever enough to do the or operation in a register, write it to dst
and then do the hweight calculation with it.

> Also, for outline implementation, can you employ the BITMAP_WEIGHT()
> macro?

If you insist on this ugly:

  return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits);

Sure.

Thanks,

        tglx
Re: [patch 07/19] cpumask: Introduce cpumask_or_weight()
Posted by Yury Norov 2 months ago
On Wed, Oct 15, 2025 at 01:41:50PM -0400, Yury Norov wrote:
> Hi Tomas,
> 
> On Wed, Oct 15, 2025 at 07:29:36PM +0200, Thomas Gleixner wrote:
> > CID management OR's two cpumasks and then calculates the weight on the
> > result. That's inefficient as that has to walk the same stuff twice. As
> > this is done with runqueue lock held, there is a real benefit of speeding
> > this up.
> > 
> > Provide cpumask_or_weight() and the corresponding bitmap functions which
> > return the weight of the OR result right away.
> > 
> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Yury Norov <yury.norov@gmail.com>
> > ---
> >  include/linux/bitmap.h  |   15 +++++++++++++++
> >  include/linux/cpumask.h |   16 ++++++++++++++++
> >  lib/bitmap.c            |   17 +++++++++++++++++
> >  3 files changed, 48 insertions(+)
> > 
> > --- a/include/linux/bitmap.h
> > +++ b/include/linux/bitmap.h
> > @@ -45,6 +45,7 @@ struct device;
> >   *  bitmap_copy(dst, src, nbits)                *dst = *src
> >   *  bitmap_and(dst, src1, src2, nbits)          *dst = *src1 & *src2
> >   *  bitmap_or(dst, src1, src2, nbits)           *dst = *src1 | *src2
> > + *  bitmap_or_weight(dst, src1, src2, nbits)    *dst = *src1 | *src2. Returns Hamming Weight of dst
> >   *  bitmap_xor(dst, src1, src2, nbits)          *dst = *src1 ^ *src2
> >   *  bitmap_andnot(dst, src1, src2, nbits)       *dst = *src1 & ~(*src2)
> >   *  bitmap_complement(dst, src, nbits)          *dst = ~(*src)
> > @@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, co
> >  		 const unsigned long *bitmap2, unsigned int nbits);
> >  void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
> >  		 const unsigned long *bitmap2, unsigned int nbits);
> > +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
> > +				const unsigned long *bitmap2, unsigned int nbits);
> >  void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
> >  		  const unsigned long *bitmap2, unsigned int nbits);
> >  bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
> > @@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const
> >  }
> >  
> >  static __always_inline
> > +unsigned int bitmap_or_weight(unsigned long *dst, const unsigned long *src1,
> > +			      const unsigned long *src2, unsigned int nbits)
> > +{
> > +	if (small_const_nbits(nbits)) {
> > +		*dst = *src1 | *src2;
> > +		return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits));
> > +	} else {
> > +		return __bitmap_or_weight(dst, src1, src2, nbits);
> > +	}
> > +}
> > +
> > +static __always_inline
> >  void bitmap_xor(unsigned long *dst, const unsigned long *src1,
> >  		const unsigned long *src2, unsigned int nbits)
> >  {
> > --- a/include/linux/cpumask.h
> > +++ b/include/linux/cpumask.h
> > @@ -729,6 +729,22 @@ void cpumask_or(struct cpumask *dstp, co
> >  }
> >  
> >  /**
> > + * cpumask_or_weight - *dstp = *src1p | *src2p and return the weight of the result
> > + * @dstp: the cpumask result
> > + * @src1p: the first input
> > + * @src2p: the second input
> > + *
> > + * Return: The number of bits set in the resulting cpumask @dstp
> > + */
> > +static __always_inline
> > +unsigned int cpumask_or_weight(struct cpumask *dstp, const struct cpumask *src1p,
> > +			       const struct cpumask *src2p)
> > +{
> > +	return bitmap_or_weight(cpumask_bits(dstp), cpumask_bits(src1p),
> > +				cpumask_bits(src2p), small_cpumask_bits);
> > +}
> > +
> > +/**
> >   * cpumask_xor - *dstp = *src1p ^ *src2p
> >   * @dstp: the cpumask result
> >   * @src1p: the first input
> > --- a/lib/bitmap.c
> > +++ b/lib/bitmap.c
> > @@ -253,6 +253,23 @@ void __bitmap_or(unsigned long *dst, con
> >  }
> >  EXPORT_SYMBOL(__bitmap_or);
> >  
> > +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
> > +				const unsigned long *bitmap2, unsigned int bits)
> > +{
> > +	unsigned int k, w = 0;
> > +
> > +	for (k = 0; k < bits / BITS_PER_LONG; k++) {
> > +		dst[k] = bitmap1[k] | bitmap2[k];
> > +		w += hweight_long(dst[k]);
> > +	}
> > +
> > +	if (bits % BITS_PER_LONG) {
> > +		dst[k] = bitmap1[k] | bitmap2[k];
> > +		w += hweight_long(dst[k] & BITMAP_LAST_WORD_MASK(bits));
> > +	}
> > +	return w;
> > +}
> 
> We've got bitmap_weight_and() and bitmap_weight_andnot() already. Can
> you align naming with the existing scheme: bitmap_weight_or().
> 
> Also, for outline implementation, can you employ the BITMAP_WEIGHT()
> macro?

Ok, I see now. You want to do a regular cpumask_or(), but return the
hweight() of the result, instead of a boolean.

The cpumask_or_weight() may be really confused with cpumask_weight_or().
Can you try considering a different naming? (I am seemingly can't.)

Can you describe the performance impact you've mentioned in the commit
message in more details?

Anyways, for the approach:

Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Re: [patch 07/19] cpumask: Introduce cpumask_or_weight()
Posted by Thomas Gleixner 1 month, 4 weeks ago
Yury!

On Wed, Oct 15 2025 at 14:06, Yury Norov wrote:
> On Wed, Oct 15, 2025 at 01:41:50PM -0400, Yury Norov wrote:
> Ok, I see now. You want to do a regular cpumask_or(), but return the
> hweight() of the result, instead of a boolean.
>
> The cpumask_or_weight() may be really confused with cpumask_weight_or().
> Can you try considering a different naming? (I am seemingly can't.)

the only thing I came up with was cpumask_or_and_weight(), but that
sounded odd too. cpumask_or_and_calc_weight() perhaps.

> Can you describe the performance impact you've mentioned in the commit
> message in more details?

It's sparing the second loop with the related memory reads. It's about
10-20% faster for a 4k CPU mask (64 iterations) depending on the machine
I test on.

As this is invoked with runqueue lock held, there is definitely a desire
to spare as much cycles as possible.

Thanks,

        tglx