[PATCH v2 07/11] xen/bitops: Introduce generic_hweightl() and hweightl()

Andrew Cooper posted 11 patches 2 months, 3 weeks ago
Posted by Andrew Cooper 2 months, 3 weeks ago
There are 6 remaining callers in Xen:

  * The two hweight32() calls, in _domain_struct_bits() and efi_find_gop_mode(),
    are __init only.
  * The two hweight_long() calls are both in bitmap_weight().
  * The two hweight64() calls are in hv_vpset_nr_banks() and x86_emulate().

Only bitmap_weight() and possibly hv_vpset_nr_banks() can be considered fast
paths, and they're all of GPR-width form.

Furthermore, the difference between a generic int form and a generic long form
is only an ADD and a SHIFT, and only in !CONFIG_HAS_FAST_MULTIPLY builds.
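
For illustration only (not part of the patch), the !CONFIG_HAS_FAST_MULTIPLY
tails of a hypothetical int-width and long-width generic form would look like
this; the function names are invented for the sketch:

  #include <stdint.h>

  /* Final fold once each byte holds its own population count: int form. */
  unsigned int fold32(uint32_t x)
  {
      x += x >> 8;
      x += x >> 16;
      return x & 0xff;
  }

  /* Long form: the same, plus one extra ADD and SHIFT. */
  unsigned int fold64(uint64_t x)
  {
      x += x >> 8;
      x += x >> 16;
      x += x >> 32;
      return x & 0xff;
  }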

Therefore, it is definitely not worth having both generic implementations.

Implement generic_hweightl() based on the current generic_hweight64(),
adjusted to be compatible with ARM32, along with standard SELF_TESTS.

Implement hweightl() with the usual constant-folding and arch opt-in support.
PPC is the only architecture that deviates from the generic form, and it simply
uses the builtin.

No functional change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Stefano Stabellini <sstabellini@kernel.org>
CC: Julien Grall <julien@xen.org>
CC: Volodymyr Babchuk <Volodymyr_Babchuk@epam.com>
CC: Bertrand Marquis <bertrand.marquis@arm.com>
CC: Michal Orzel <michal.orzel@amd.com>
CC: Oleksii Kurochko <oleksii.kurochko@gmail.com>
CC: Shawn Anastasio <sanastasio@raptorengineering.com>

v2:
 * s/MASK/BCST/.  Extend testing
 * s/__pure/attr_const/.
---
 xen/arch/ppc/include/asm/bitops.h |  2 ++
 xen/common/bitops.c               | 14 +++++++++
 xen/include/xen/bitops.h          | 18 ++++++++++++
 xen/lib/Makefile                  |  1 +
 xen/lib/generic-hweightl.c        | 49 +++++++++++++++++++++++++++++++
 5 files changed, 84 insertions(+)
 create mode 100644 xen/lib/generic-hweightl.c

diff --git a/xen/arch/ppc/include/asm/bitops.h b/xen/arch/ppc/include/asm/bitops.h
index a62c4f99c3bb..64512e949530 100644
--- a/xen/arch/ppc/include/asm/bitops.h
+++ b/xen/arch/ppc/include/asm/bitops.h
@@ -124,6 +124,8 @@ static inline int test_and_set_bit(unsigned int nr, volatile void *addr)
 #define arch_fls(x)  ((x) ? 32 - __builtin_clz(x) : 0)
 #define arch_flsl(x) ((x) ? BITS_PER_LONG - __builtin_clzl(x) : 0)
 
+#define arch_hweightl(x) __builtin_popcountl(x)
+
 /**
  * hweightN - returns the hamming weight of a N-bit word
  * @x: the word to weigh
diff --git a/xen/common/bitops.c b/xen/common/bitops.c
index b504dd1308b8..5e5d20d225d7 100644
--- a/xen/common/bitops.c
+++ b/xen/common/bitops.c
@@ -133,6 +133,19 @@ static void __init test_multiple_bits_set(void)
     CHECK(multiple_bits_set, 0xc000000000000000ULL, true);
 }
 
+static void __init test_hweight(void)
+{
+    /* unsigned int hweightl(unsigned long) */
+    CHECK(hweightl, 0, 0);
+    CHECK(hweightl, 1, 1);
+    CHECK(hweightl, 3, 2);
+    CHECK(hweightl, 7, 3);
+    CHECK(hweightl, 0xff, 8);
+
+    CHECK(hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
+    CHECK(hweightl, -1UL, BITS_PER_LONG);
+}
+
 static void __init __constructor test_bitops(void)
 {
     test_ffs();
@@ -140,4 +153,5 @@ static void __init __constructor test_bitops(void)
     test_for_each_set_bit();
 
     test_multiple_bits_set();
+    test_hweight();
 }
diff --git a/xen/include/xen/bitops.h b/xen/include/xen/bitops.h
index 1c160b643ed6..96dfe0f2c71a 100644
--- a/xen/include/xen/bitops.h
+++ b/xen/include/xen/bitops.h
@@ -35,6 +35,12 @@ extern void __bitop_bad_size(void);
 unsigned int attr_const generic_ffsl(unsigned long x);
 unsigned int attr_const generic_flsl(unsigned long x);
 
+/*
+ * Hamming Weight, also called Population Count.  Returns the number of set
+ * bits in @x.
+ */
+unsigned int attr_const generic_hweightl(unsigned long x);
+
 /**
  * generic__test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
@@ -308,6 +314,18 @@ static always_inline attr_const unsigned int fls64(uint64_t x)
         (_v & (_v - 1)) != 0;                   \
     })
 
+static always_inline attr_const unsigned int hweightl(unsigned long x)
+{
+    if ( __builtin_constant_p(x) )
+        return __builtin_popcountl(x);
+
+#ifdef arch_hweightl
+    return arch_hweightl(x);
+#else
+    return generic_hweightl(x);
+#endif
+}
+
 /* --------------------- Please tidy below here --------------------- */
 
 #ifndef find_next_bit
diff --git a/xen/lib/Makefile b/xen/lib/Makefile
index a48541596470..b6558e108bd9 100644
--- a/xen/lib/Makefile
+++ b/xen/lib/Makefile
@@ -6,6 +6,7 @@ lib-y += ctype.o
 lib-y += find-next-bit.o
 lib-y += generic-ffsl.o
 lib-y += generic-flsl.o
+lib-y += generic-hweightl.o
 lib-y += list-sort.o
 lib-y += memchr.o
 lib-y += memchr_inv.o
diff --git a/xen/lib/generic-hweightl.c b/xen/lib/generic-hweightl.c
new file mode 100644
index 000000000000..c242d4c2d9ab
--- /dev/null
+++ b/xen/lib/generic-hweightl.c
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <xen/bitops.h>
+#include <xen/init.h>
+#include <xen/self-tests.h>
+
+/* Value @b broadcast to every byte in a long */
+#if BITS_PER_LONG == 32
+# define BCST(b) ((b) * 0x01010101UL)
+#elif BITS_PER_LONG == 64
+# define BCST(b) ((b) * 0x0101010101010101UL)
+#else
+# error Extend me please
+#endif
+
+unsigned int generic_hweightl(unsigned long x)
+{
+    x -= (x >> 1) & BCST(0x55);
+    x =  (x & BCST(0x33)) + ((x >> 2) & BCST(0x33));
+    x =  (x + (x >> 4)) & BCST(0x0f);
+
+    if ( IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY) )
+        return (x * BCST(0x01)) >> (BITS_PER_LONG - 8);
+
+    x += x >> 8;
+    x += x >> 16;
+#if BITS_PER_LONG > 32
+    x += x >> 32;
+#endif
+
+    return x & 0xff;
+}
+
+#ifdef CONFIG_SELF_TESTS
+static void __init __constructor test_generic_hweightl(void)
+{
+    RUNTIME_CHECK(generic_hweightl, 0, 0);
+    RUNTIME_CHECK(generic_hweightl, 1, 1);
+    RUNTIME_CHECK(generic_hweightl, 3, 2);
+    RUNTIME_CHECK(generic_hweightl, 7, 3);
+    RUNTIME_CHECK(generic_hweightl, 0xff, 8);
+
+    RUNTIME_CHECK(generic_hweightl, BCST(0x55), BITS_PER_LONG / 2);
+    RUNTIME_CHECK(generic_hweightl, BCST(0xaa), BITS_PER_LONG / 2);
+
+    RUNTIME_CHECK(generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
+    RUNTIME_CHECK(generic_hweightl, -1UL, BITS_PER_LONG);
+}
+#endif /* CONFIG_SELF_TESTS */
-- 
2.39.2
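
For reference, a standalone sketch of the byte-wise (SWAR) reduction used in
generic-hweightl.c above, hard-coded to a 64-bit long; the helper name and
test values are illustrative only:

  #include <assert.h>

  /* Broadcast a byte value to every byte of a 64-bit word. */
  #define BCST(b) ((b) * 0x0101010101010101UL)

  unsigned int swar_hweightl(unsigned long x)
  {
      x -= (x >> 1) & BCST(0x55);                      /* 2-bit counts */
      x  = (x & BCST(0x33)) + ((x >> 2) & BCST(0x33)); /* 4-bit counts */
      x  = (x + (x >> 4)) & BCST(0x0f);                /* byte counts  */

      /* Fast-multiply path: accumulate all bytes into the top byte. */
      return (x * BCST(0x01)) >> (64 - 8);
  }

  int main(void)
  {
      assert(swar_hweightl(0) == 0);
      assert(swar_hweightl(0xff) == 8);
      assert(swar_hweightl(0x8000000000000001UL) == 2);
      assert(swar_hweightl(-1UL) == 64);
      return 0;
  }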


Re: [PATCH v2 07/11] xen/bitops: Introduce generic_hweightl() and hweightl()
Posted by Jan Beulich 2 months, 3 weeks ago
On 29.08.2024 00:03, Andrew Cooper wrote:
> There are 6 remaining callers in Xen:
> 
>   * The two hweight32() calls, in _domain_struct_bits() and efi_find_gop_mode(),
>     are __init only.
>   * The two hweight_long() calls are both in bitmap_weight().
>   * The two hweight64() calls are in hv_vpset_nr_banks() and x86_emulate().
> 
> Only bitmap_weight() and possibly hv_vpset_nr_banks() can be considered fast
> paths, and they're all of GPR-width form.
> 
> Furthermore, the difference between a generic int form and a generic long form
> is only an ADD and a SHIFT, and only in !CONFIG_HAS_FAST_MULTIPLY builds.
> 
> Therefore, it is definitely not worth having both generic implementations.
> 
> Implement generic_hweightl() based on the current generic_hweight64(),
> adjusted to be compatible with ARM32, along with standard SELF_TESTS.
> 
> Implement hweightl() with the usual constant-folding and arch opt-in support.
> PPC is the only architecture that deviates from the generic form, and it simply
> uses the builtin.
> 
> No functional change.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>
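
For completeness, a usage sketch assuming the patch above is applied; the
callers and the mask value are invented for illustration.  With a runtime
argument, hweightl() dispatches to arch_hweightl() where an architecture opts
in (PPC in this series) or to generic_hweightl() otherwise; with a constant
argument and optimisation enabled, __builtin_constant_p() is true and the call
can fold to a compile-time constant via __builtin_popcountl():

  #include <xen/bitops.h>

  /* Runtime value: resolves to arch_hweightl() or generic_hweightl(). */
  unsigned int nr_banks(unsigned long valid_mask)
  {
      return hweightl(valid_mask);
  }

  /* Constant value: 0xf3 has 6 bits set, so this can fold to 6 at build time. */
  #define FEATURE_MASK 0xf3UL
  unsigned int nr_features(void)
  {
      return hweightl(FEATURE_MASK);
  }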