[PATCH 5/9] xen/bitops: Introduce generic_hweightl() and hweightl()

Posted by Andrew Cooper 3 months ago
There are 6 remaining callers in Xen:

  * The two hweight32() calls, _domain_struct_bits() and efi_find_gop_mode(),
    are __init only.
  * The two hweight_long() calls are both in bitmap_weight().
  * The two hweight64() calls are hv_vpset_nr_banks() and x86_emulate().

Only bitmap_weight() and possibly hv_vpset_nr_banks() can be considered
fast(ish) paths, and they all operate on GPR-width values.

Furthermore, the difference between a generic int form and a generic long form
is only an ADD and a SHIFT, and that only in !CONFIG_HAS_FAST_MULTIPLY builds.
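
As an illustration (a sketch mirroring the code below, not part of the patch),
the !CONFIG_HAS_FAST_MULTIPLY tails differ only in the final fold:

    /* int form: fold the per-byte counts of a 32-bit value */
    x += x >> 8;
    x += x >> 16;

    /* long form: one extra ADD and SHIFT when BITS_PER_LONG > 32 */
    x += x >> 32;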

Therefore, it is definitely not worth having both generic implementations.

Implement generic_hweightl() based on the current generic_hweight64(),
adjusted to be compatible with ARM32, along with standard SELF_TESTS.
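
As a worked example, tracing x = 0xff through those steps on a 64-bit build
(using the patch's MASK() notation):

    x -= (x >> 1) & MASK(0x55);                      /* 0xff -> 0xaa: each 2-bit field holds 2 */
    x  = (x & MASK(0x33)) + ((x >> 2) & MASK(0x33)); /* 0xaa -> 0x44: each 4-bit field holds 4 */
    x  = (x + (x >> 4)) & MASK(0x0f);                /* 0x44 -> 0x08: each byte holds its count */
    /* finally, (0x08 * MASK(0x01)) >> 56 sums the bytes, yielding 8 */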

Implement hweightl() with the usual constant-folding and arch opt-in support.
PPC is the only architecture that deviates from the generic form, and it simply
uses the builtin.

No functional change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
CC: Stefano Stabellini <sstabellini@kernel.org>
CC: Julien Grall <julien@xen.org>
CC: Volodymyr Babchuk <Volodymyr_Babchuk@epam.com>
CC: Bertrand Marquis <bertrand.marquis@arm.com>
CC: Michal Orzel <michal.orzel@amd.com>
CC: Oleksii Kurochko <oleksii.kurochko@gmail.com>
CC: Shawn Anastasio <sanastasio@raptorengineering.com>
---
 xen/arch/ppc/include/asm/bitops.h |  2 ++
 xen/common/bitops.c               | 14 ++++++++++
 xen/include/xen/bitops.h          | 18 ++++++++++++
 xen/lib/Makefile                  |  1 +
 xen/lib/generic-hweightl.c        | 46 +++++++++++++++++++++++++++++++
 5 files changed, 81 insertions(+)
 create mode 100644 xen/lib/generic-hweightl.c

diff --git a/xen/arch/ppc/include/asm/bitops.h b/xen/arch/ppc/include/asm/bitops.h
index a62c4f99c3bb..64512e949530 100644
--- a/xen/arch/ppc/include/asm/bitops.h
+++ b/xen/arch/ppc/include/asm/bitops.h
@@ -124,6 +124,8 @@ static inline int test_and_set_bit(unsigned int nr, volatile void *addr)
 #define arch_fls(x)  ((x) ? 32 - __builtin_clz(x) : 0)
 #define arch_flsl(x) ((x) ? BITS_PER_LONG - __builtin_clzl(x) : 0)
 
+#define arch_hweightl(x) __builtin_popcountl(x)
+
 /**
  * hweightN - returns the hamming weight of a N-bit word
  * @x: the word to weigh
diff --git a/xen/common/bitops.c b/xen/common/bitops.c
index 4545682aa8e0..d0c268b4994a 100644
--- a/xen/common/bitops.c
+++ b/xen/common/bitops.c
@@ -106,10 +106,24 @@ static void __init test_multiple_bits_set(void)
     CHECK(multiple_bits_set, 0xc000000000000000ULL, true);
 }
 
+static void __init test_hweight(void)
+{
+    /* unsigned int hweightl(unsigned long) */
+    CHECK(hweightl, 0, 0);
+    CHECK(hweightl, 1, 1);
+    CHECK(hweightl, 3, 2);
+    CHECK(hweightl, 7, 3);
+    CHECK(hweightl, 0xff, 8);
+
+    CHECK(hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
+    CHECK(hweightl, -1UL, BITS_PER_LONG);
+}
+
 static void __init __constructor test_bitops(void)
 {
     test_ffs();
     test_fls();
 
     test_multiple_bits_set();
+    test_hweight();
 }
diff --git a/xen/include/xen/bitops.h b/xen/include/xen/bitops.h
index 64d70a7a1cb5..3aac10b7f532 100644
--- a/xen/include/xen/bitops.h
+++ b/xen/include/xen/bitops.h
@@ -35,6 +35,12 @@ extern void __bitop_bad_size(void);
 unsigned int __pure generic_ffsl(unsigned long x);
 unsigned int __pure generic_flsl(unsigned long x);
 
+/*
+ * Hamming Weight, also called Population Count.  Returns the number of set
+ * bits in @x.
+ */
+unsigned int __pure generic_hweightl(unsigned long x);
+
 /**
  * generic__test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
@@ -284,6 +290,18 @@ static always_inline __pure unsigned int fls64(uint64_t x)
         (_v & (_v - 1)) != 0;                   \
     })
 
+static always_inline __pure unsigned int hweightl(unsigned long x)
+{
+    if ( __builtin_constant_p(x) )
+        return __builtin_popcountl(x);
+
+#ifdef arch_hweightl
+    return arch_hweightl(x);
+#else
+    return generic_hweightl(x);
+#endif
+}
+
 /* --------------------- Please tidy below here --------------------- */
 
 #ifndef find_next_bit
diff --git a/xen/lib/Makefile b/xen/lib/Makefile
index a48541596470..b6558e108bd9 100644
--- a/xen/lib/Makefile
+++ b/xen/lib/Makefile
@@ -6,6 +6,7 @@ lib-y += ctype.o
 lib-y += find-next-bit.o
 lib-y += generic-ffsl.o
 lib-y += generic-flsl.o
+lib-y += generic-hweightl.o
 lib-y += list-sort.o
 lib-y += memchr.o
 lib-y += memchr_inv.o
diff --git a/xen/lib/generic-hweightl.c b/xen/lib/generic-hweightl.c
new file mode 100644
index 000000000000..fa4bbec273ab
--- /dev/null
+++ b/xen/lib/generic-hweightl.c
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <xen/bitops.h>
+#include <xen/init.h>
+#include <xen/self-tests.h>
+
+/* Mask value @b broadcast to every byte in a long */
+#if BITS_PER_LONG == 32
+# define MASK(b) ((b) * 0x01010101UL)
+#elif BITS_PER_LONG == 64
+# define MASK(b) ((b) * 0x0101010101010101UL)
+#else
+# error Extend me please
+#endif
+
+unsigned int generic_hweightl(unsigned long x)
+{
+    x -= (x >> 1) & MASK(0x55);
+    x =  (x & MASK(0x33)) + ((x >> 2) & MASK(0x33));
+    x =  (x + (x >> 4)) & MASK(0x0f);
+
+    if ( IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY) )
+        return (x * MASK(0x01)) >> (BITS_PER_LONG - 8);
+
+    x += x >> 8;
+    x += x >> 16;
+#if BITS_PER_LONG > 32
+    x += x >> 32;
+#endif
+
+    return x & 0xff;
+}
+
+#ifdef CONFIG_SELF_TESTS
+static void __init __constructor test_generic_hweightl(void)
+{
+    RUNTIME_CHECK(generic_hweightl, 0, 0);
+    RUNTIME_CHECK(generic_hweightl, 1, 1);
+    RUNTIME_CHECK(generic_hweightl, 3, 2);
+    RUNTIME_CHECK(generic_hweightl, 7, 3);
+    RUNTIME_CHECK(generic_hweightl, 0xff, 8);
+
+    RUNTIME_CHECK(generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
+    RUNTIME_CHECK(generic_hweightl, -1UL, BITS_PER_LONG);
+}
+#endif /* CONFIG_SELF_TESTS */
-- 
2.39.2


Re: [PATCH 5/9] xen/bitops: Introduce generic_hweightl() and hweightl()
Posted by Jan Beulich 2 months, 4 weeks ago
On 23.08.2024 01:06, Andrew Cooper wrote:
> --- a/xen/include/xen/bitops.h
> +++ b/xen/include/xen/bitops.h
> @@ -35,6 +35,12 @@ extern void __bitop_bad_size(void);
>  unsigned int __pure generic_ffsl(unsigned long x);
>  unsigned int __pure generic_flsl(unsigned long x);
>  
> +/*
> + * Hamming Weight, also called Population Count.  Returns the number of set
> + * bits in @x.
> + */
> +unsigned int __pure generic_hweightl(unsigned long x);

Aren't this and ...

> @@ -284,6 +290,18 @@ static always_inline __pure unsigned int fls64(uint64_t x)
>          (_v & (_v - 1)) != 0;                   \
>      })
>  
> +static always_inline __pure unsigned int hweightl(unsigned long x)

... this even __attribute_const__?

> +{
> +    if ( __builtin_constant_p(x) )
> +        return __builtin_popcountl(x);

How certain are you that compilers (even old ones) will reliably fold
constant expressions here, and never emit a libgcc call instead? The
conditions look to be more tight than just __builtin_constant_p(); a
pretty absurd example:

unsigned ltest(void) {
    return __builtin_constant_p("") ? __builtin_popcountl((unsigned long)"") : ~0;
}

> --- /dev/null
> +++ b/xen/lib/generic-hweightl.c
> @@ -0,0 +1,46 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#include <xen/bitops.h>
> +#include <xen/init.h>
> +#include <xen/self-tests.h>
> +
> +/* Mask value @b broadcast to every byte in a long */
> +#if BITS_PER_LONG == 32
> +# define MASK(b) ((b) * 0x01010101UL)
> +#elif BITS_PER_LONG == 64
> +# define MASK(b) ((b) * 0x0101010101010101UL)
> +#else
> +# error Extend me please
> +#endif
> +
> +unsigned int generic_hweightl(unsigned long x)
> +{
> +    x -= (x >> 1) & MASK(0x55);
> +    x =  (x & MASK(0x33)) + ((x >> 2) & MASK(0x33));
> +    x =  (x + (x >> 4)) & MASK(0x0f);
> +
> +    if ( IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY) )
> +        return (x * MASK(0x01)) >> (BITS_PER_LONG - 8);

I realize it's nitpicking, yet especially this use isn't really "mask"-
like. Could I talk you into naming the macro e.g. BCST()?

> +    x += x >> 8;
> +    x += x >> 16;
> +#if BITS_PER_LONG > 32
> +    x += x >> 32;
> +#endif
> +
> +    return x & 0xff;
> +}

Perhaps #undef MASK here, or else ...

> +#ifdef CONFIG_SELF_TESTS
> +static void __init __constructor test_generic_hweightl(void)
> +{
> +    RUNTIME_CHECK(generic_hweightl, 0, 0);
> +    RUNTIME_CHECK(generic_hweightl, 1, 1);
> +    RUNTIME_CHECK(generic_hweightl, 3, 2);
> +    RUNTIME_CHECK(generic_hweightl, 7, 3);
> +    RUNTIME_CHECK(generic_hweightl, 0xff, 8);
> +
> +    RUNTIME_CHECK(generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
> +    RUNTIME_CHECK(generic_hweightl, -1UL, BITS_PER_LONG);
> +}

... actually use it some here, to have a few more cases?
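
For instance, something like (a sketch):

  RUNTIME_CHECK(generic_hweightl, MASK(0x55), BITS_PER_LONG / 2);
  RUNTIME_CHECK(generic_hweightl, MASK(0x01), BITS_PER_LONG / 8);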

Jan
Re: [PATCH 5/9] xen/bitops: Introduce generic_hweightl() and hweightl()
Posted by Andrew Cooper 2 months, 4 weeks ago
On 26/08/2024 12:40 pm, Jan Beulich wrote:
> On 23.08.2024 01:06, Andrew Cooper wrote:
>> --- a/xen/include/xen/bitops.h
>> +++ b/xen/include/xen/bitops.h
>> @@ -35,6 +35,12 @@ extern void __bitop_bad_size(void);
>>  unsigned int __pure generic_ffsl(unsigned long x);
>>  unsigned int __pure generic_flsl(unsigned long x);
>>  
>> +/*
>> + * Hamming Weight, also called Population Count.  Returns the number of set
>> + * bits in @x.
>> + */
>> +unsigned int __pure generic_hweightl(unsigned long x);
> Aren't this and ...
>
>> @@ -284,6 +290,18 @@ static always_inline __pure unsigned int fls64(uint64_t x)
>>          (_v & (_v - 1)) != 0;                   \
>>      })
>>  
>> +static always_inline __pure unsigned int hweightl(unsigned long x)
> ... this even __attribute_const__?

Hmm.  This is following fls(), but on further consideration, they should
be const too.

I'll do a prep patch fixing that, although I'm going to rename it to
__attr_const for brevity.

Much as I'd prefer __const, I expect that is going too far, making it
too close to regular const.
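
For reference, the distinction at stake - a minimal sketch with hypothetical
functions:

  extern unsigned int counter;
  __attribute__((pure))  unsigned int f(unsigned int x) { return x + counter; } /* may read memory */
  __attribute__((const)) unsigned int g(unsigned int x) { return x * 2; }       /* arguments only  */

hweightl() depends only on its argument, hence const being the better fit.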

>
>> +{
>> +    if ( __builtin_constant_p(x) )
>> +        return __builtin_popcountl(x);
> How certain are you that compilers (even old ones) will reliably fold
> constant expressions here, and never emit a libgcc call instead? The
> conditions look to be more tight than just __builtin_constant_p(); a
> pretty absurd example:
>
> unsigned ltest(void) {
>     return __builtin_constant_p("") ? __builtin_popcountl((unsigned long)"") : ~0;
> }

How do you express that in terms of a call to hweightl()?

Again, this is following the layout started with fls() in order to avoid
each arch opencoding different versions of constant folding.

https://godbolt.org/z/r544c49oY

When it's forced through the hweightl() interface, even GCC 4.1 decides
that it's non-constant and falls back to generic_hweightl().


I did spend a *lot* of time with the fls() series checking that all
compilers we supported did what we wanted in this case, so I don't
expect it to be a problem.  But, if a library call is emitted, it will
be very obvious (link failure), and we can re-evaluate.
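
A sketch of the two paths through hweightl(), for illustration:

  unsigned int f(unsigned long v)
  {
      return hweightl(v);      /* not a constant: arch_hweightl()/generic_hweightl() */
  }

  unsigned int g(void)
  {
      return hweightl(0xffUL); /* __builtin_constant_p() is true: folds to 8 */
  }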


>
>> --- /dev/null
>> +++ b/xen/lib/generic-hweightl.c
>> @@ -0,0 +1,46 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +
>> +#include <xen/bitops.h>
>> +#include <xen/init.h>
>> +#include <xen/self-tests.h>
>> +
>> +/* Mask value @b broadcast to every byte in a long */
>> +#if BITS_PER_LONG == 32
>> +# define MASK(b) ((b) * 0x01010101UL)
>> +#elif BITS_PER_LONG == 64
>> +# define MASK(b) ((b) * 0x0101010101010101UL)
>> +#else
>> +# error Extend me please
>> +#endif
>> +
>> +unsigned int generic_hweightl(unsigned long x)
>> +{
>> +    x -= (x >> 1) & MASK(0x55);
>> +    x =  (x & MASK(0x33)) + ((x >> 2) & MASK(0x33));
>> +    x =  (x + (x >> 4)) & MASK(0x0f);
>> +
>> +    if ( IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY) )
>> +        return (x * MASK(0x01)) >> (BITS_PER_LONG - 8);
> I realize it's nitpicking, yet especially this use isn't really "mask"-
> like. Could I talk you into naming the macro e.g. BCST()?

Ok - I wasn't overly happy with the name MASK(), and BCST() is better.
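
i.e. presumably something like, for v2:

  /* Value @b broadcast to every byte in a long */
  # define BCST(b) ((b) * 0x0101010101010101UL)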

>
>> +    x += x >> 8;
>> +    x += x >> 16;
>> +#if BITS_PER_LONG > 32
>> +    x += x >> 32;
>> +#endif
>> +
>> +    return x & 0xff;
>> +}
> Perhaps #undef MASK here, or else ...
>
>> +#ifdef CONFIG_SELF_TESTS
>> +static void __init __constructor test_generic_hweightl(void)
>> +{
>> +    RUNTIME_CHECK(generic_hweightl, 0, 0);
>> +    RUNTIME_CHECK(generic_hweightl, 1, 1);
>> +    RUNTIME_CHECK(generic_hweightl, 3, 2);
>> +    RUNTIME_CHECK(generic_hweightl, 7, 3);
>> +    RUNTIME_CHECK(generic_hweightl, 0xff, 8);
>> +
>> +    RUNTIME_CHECK(generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
>> +    RUNTIME_CHECK(generic_hweightl, -1UL, BITS_PER_LONG);
>> +}
> ... actually use it some here, to have a few more cases?

Hmm, why not.

~Andrew

Re: [PATCH 5/9] xen/bitops: Introduce generic_hweightl() and hweightl()
Posted by Jan Beulich 2 months, 4 weeks ago
On 27.08.2024 12:39, Andrew Cooper wrote:
> On 26/08/2024 12:40 pm, Jan Beulich wrote:
>> On 23.08.2024 01:06, Andrew Cooper wrote:
>>> --- a/xen/include/xen/bitops.h
>>> +++ b/xen/include/xen/bitops.h
>>> @@ -35,6 +35,12 @@ extern void __bitop_bad_size(void);
>>>  unsigned int __pure generic_ffsl(unsigned long x);
>>>  unsigned int __pure generic_flsl(unsigned long x);
>>>  
>>> +/*
>>> + * Hamming Weight, also called Population Count.  Returns the number of set
>>> + * bits in @x.
>>> + */
>>> +unsigned int __pure generic_hweightl(unsigned long x);
>> Aren't this and ...
>>
>>> @@ -284,6 +290,18 @@ static always_inline __pure unsigned int fls64(uint64_t x)
>>>          (_v & (_v - 1)) != 0;                   \
>>>      })
>>>  
>>> +static always_inline __pure unsigned int hweightl(unsigned long x)
>> ... this even __attribute_const__?
> 
> Hmm.  This is following fls(), but on further consideration, they should
> be const too.
> 
> I'll do a prep patch fixing that, although I'm going to rename it to
> __attr_const for brevity.
> 
> Much as I'd prefer __const, I expect that is going too far, making it
> too close to regular const.

I was actually going to suggest using that name, if we want to shorten
__attribute_const__. Yes, gcc treats __const (and __const__) as
keywords, but do we care (much)? All it requires is that we don't start
using __const as a (real) keyword.

Of course __const is a good example of why really we shouldn't use
double-underscore prefixed names anywhere. Any of them can be assigned
a meaning by the compiler, and here that's clearly the case. Therefore,
taking your planned rename, maybe better make it attr_const then? And
eventually switch stuff like __packed, __pure, and __weak to attr_* as
well? Or even introduce something like

#define attr(attr...) __attribute__((attr))

and use attr(const) here?
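
Hypothetically, the declaration would then read:

  #define attr(attr...) __attribute__((attr))

  unsigned int attr(const) generic_hweightl(unsigned long x);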

>>> +{
>>> +    if ( __builtin_constant_p(x) )
>>> +        return __builtin_popcountl(x);
>> How certain are you that compilers (even old ones) will reliably fold
>> constant expressions here, and never emit a libgcc call instead? The
>> conditions look to be more tight than just __builtin_constant_p(); a
>> pretty absurd example:
>>
>> unsigned ltest(void) {
>>     return __builtin_constant_p("") ? __builtin_popcountl((unsigned long)"") : ~0;
>> }
> 
> How do you express that in terms of a call to hweightl()?

hweightl((unsigned long)"");

Yet as said - it's absurd. It merely serves to make the point that what
__builtin_constant_p() returns true for doesn't necessarily constant-
fold in expressions.

> Again, this is following the layout started with fls() in order to avoid
> each arch opencoding different versions of constant folding.
> 
> https://godbolt.org/z/r544c49oY
> 
> When it's forced through the hweightl() interface, even GCC 4.1 decides
> that it's non-constant and falls back to generic_hweightl().
> 
> 
> I did spend a *lot* of time with the fls() series checking that all
> compilers we supported did what we wanted in this case, so I don't
> expect it to be a problem.

Right, and I guess I was pointlessly more concerned about popcount than
I was for ffs() / fls(). The criteria upon which gcc decides whether to
constant-fold the uses are exactly the same.

>  But, if a library call is emitted, it will
> be very obvious (link failure), and we can re-evaluate.

Indeed, we certainly would notice, albeit the diagnostic may be cryptic
to people.

Bottom line - keep it as is.

Jan

Re: [PATCH 5/9] xen/bitops: Introduce generic_hweightl() and hweightl()
Posted by Andrew Cooper 2 months, 4 weeks ago
On 27/08/2024 12:41 pm, Jan Beulich wrote:
> On 27.08.2024 12:39, Andrew Cooper wrote:
>> On 26/08/2024 12:40 pm, Jan Beulich wrote:
>>> On 23.08.2024 01:06, Andrew Cooper wrote:
>>>> --- a/xen/include/xen/bitops.h
>>>> +++ b/xen/include/xen/bitops.h
>>>> @@ -35,6 +35,12 @@ extern void __bitop_bad_size(void);
>>>>  unsigned int __pure generic_ffsl(unsigned long x);
>>>>  unsigned int __pure generic_flsl(unsigned long x);
>>>>  
>>>> +/*
>>>> + * Hamming Weight, also called Population Count.  Returns the number of set
>>>> + * bits in @x.
>>>> + */
>>>> +unsigned int __pure generic_hweightl(unsigned long x);
>>> Aren't this and ...
>>>
>>>> @@ -284,6 +290,18 @@ static always_inline __pure unsigned int fls64(uint64_t x)
>>>>          (_v & (_v - 1)) != 0;                   \
>>>>      })
>>>>  
>>>> +static always_inline __pure unsigned int hweightl(unsigned long x)
>>> ... this even __attribute_const__?
>> Hmm.  This is following fls(), but on further consideration, they should
>> be const too.
>>
>> I'll do a prep patch fixing that, although I'm going to rename it to
>> __attr_const for brevity.
>>
>> Much as I'd prefer __const, I expect that is going too far, making it
>> too close to regular const.
> I was actually going to suggest using that name, if we want to shorten
> __attribute_const__. Yes, gcc treats __const (and __const__) as
> keywords, but do we care (much)? All it requires is that we don't start
> using __const as a (real) keyword.

Well also we'll get into more MISRA fun for overriding keywords.

But yes - the fact that GCC treats __const to mean const is precisely
why we shouldn't give it an unrelated meaning.

>
> Of course __const is a good example of why really we shouldn't use
> double-underscore prefixed names anywhere. Any of them can be assigned
> a meaning by the compiler, and here that's clearly the case. Therefore,
> taking your planned rename, maybe better make it attr_const then? And
> eventually switch stuff like __packed, __pure, and __weak to attr_* as
> well? Or even introduce something like
>
> #define attr(attr...) __attribute__((attr))
>
> and use attr(const) here?

Hmm - that's an interesting approach, and for other attributes which we
can use unconditionally.  It will end up shorter than multiple separate
__-prefixed names.

As a tangent, I've got some work from playing with -fanalyzer which
sprinkles some attr malloc/alloc_{size,align}()/free around.  It does
improve code generation (albeit marginally), but the function declaration
size suffers.

It won't work for attributes which are conditionally nothing (e.g.
cf_check), or ones that contain multiple aspects (e.g. __constructor
containing cf_check).

In practice this means we're always going to end up with a mix, so maybe
attr_const is better for consistency.

>
>>>> +{
>>>> +    if ( __builtin_constant_p(x) )
>>>> +        return __builtin_popcountl(x);
>>> How certain are you that compilers (even old ones) will reliably fold
>>> constant expressions here, and never emit a libgcc call instead? The
>>> conditions look to be more tight than just __builtin_constant_p(); a
>>> pretty absurd example:
>>>
>>> unsigned ltest(void) {
>>>     return __builtin_constant_p("") ? __builtin_popcountl((unsigned long)"") : ~0;
>>> }
>> How do you express that in terms of a call to hweightl()?
> hweightl((unsigned long)"");
>
> Yet as said - it's absurd. It merely serves to make the point that what
> __builtin_constant_p() returns true for doesn't necessarily constant-
> fold in expressions.

Yes, but as shown in the godbolt link, this form changes GCC's mind
about the __builtin_const-ness of the expression.

>
>> Again, this is following the layout started with fls() in order to avoid
>> each arch opencoding different versions of constant folding.
>>
>> https://godbolt.org/z/r544c49oY
>>
>> When it's forced through the hweightl() interface, even GCC 4.1 decides
>> that it's non-constant and falls back to generic_hweightl().
>>
>>
>> I did spend a *lot* of time with the fls() series checking that all
>> compilers we supported did what we wanted in this case, so I don't
>> expect it to be a problem.
> Right, and I guess I was pointlessly more concerned about popcount than
> I was for ffs() / fls(). The criteria upon which gcc decides whether to
> constant-fold the uses are exactly the same.
>
>>   But, if a library call is emitted, it will
>> be very obvious (link failure), and we can re-evaluate.
> Indeed, we certainly would notice, albeit the diagnostic may be cryptic
> to people.
>
> Bottom line - keep it as is.

Thanks.

~Andrew