It has existed in x86 CPUs since 2008, so we're only 16 years late adding
support. With all the other scaffolding in place, implement arch_hweightl()
for x86.
The only complication is that the call to arch_generic_hweightl() is behind
the compiler's back. Address this by writing it in ASM and ensuring that it
preserves all registers.
Copy the code generation from generic_hweightl(). It's not a complicated
algorithm, and is easy to regenerate if needs be, but cover it with the same
unit tests as test_generic_hweightl() just for peace of mind.
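
For reference, the algorithm being mirrored is the usual SWAR reduction,
roughly as below (an illustrative sketch using the same constants as the ASM
in the patch, not the verbatim in-tree generic_hweightl(); assumes Xen's
BITS_PER_LONG, i.e. 64 on x86):

    /* Illustrative sketch only; hypothetical name. */
    static unsigned int hweightl_sketch(unsigned long x)
    {
        /* Fold adjacent bits: each 2-bit field holds its own popcount. */
        x -= (x >> 1) & 0x5555555555555555UL;
        /* Sum 2-bit fields into 4-bit fields. */
        x  = (x & 0x3333333333333333UL) + ((x >> 2) & 0x3333333333333333UL);
        /* Sum 4-bit fields into bytes. */
        x  = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fUL;
        /* Accumulate all byte sums into the top byte, then shift it down. */
        return (x * 0x0101010101010101UL) >> (BITS_PER_LONG - 8);
    }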
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
v3:
* Reinstate dropped CONFIG_SELF_TESTS
* Leave grep fodder for CODE_FILL until we can find a nicer way of doing this.
v2:
* Fix MISRA 8.2 (parameter name) and 8.5 (single declaration) regressions.
* Rename {arch->x86}-generic-hweightl.{S->c}
 * Adjust ASM formatting
---
xen/arch/x86/include/asm/bitops.h | 23 ++++++++++
xen/lib/Makefile | 1 +
xen/lib/x86-generic-hweightl.c | 71 +++++++++++++++++++++++++++++++
3 files changed, 95 insertions(+)
create mode 100644 xen/lib/x86-generic-hweightl.c
diff --git a/xen/arch/x86/include/asm/bitops.h b/xen/arch/x86/include/asm/bitops.h
index 642d8e58b288..39e37f1cbe55 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -6,6 +6,7 @@
*/
#include <asm/alternative.h>
+#include <asm/asm_defns.h>
#include <asm/cpufeatureset.h>
/*
@@ -475,4 +476,26 @@ static always_inline unsigned int arch_flsl(unsigned long x)
}
#define arch_flsl arch_flsl
+unsigned int arch_generic_hweightl(unsigned long x);
+
+static always_inline unsigned int arch_hweightl(unsigned long x)
+{
+ unsigned int r;
+
+ /*
+ * arch_generic_hweightl() is written in ASM in order to preserve all
+ * registers, as the compiler can't see the call.
+ *
+ * This limits the POPCNT instruction to using the same ABI as a function
+ * call (input in %rdi, output in %eax) but that's fine.
+ */
+ alternative_io("call arch_generic_hweightl",
+ "popcnt %[val], %q[res]", X86_FEATURE_POPCNT,
+ ASM_OUTPUT2([res] "=a" (r) ASM_CALL_CONSTRAINT),
+ [val] "D" (x));
+
+ return r;
+}
+#define arch_hweightl arch_hweightl
+
#endif /* _X86_BITOPS_H */
diff --git a/xen/lib/Makefile b/xen/lib/Makefile
index b6558e108bd9..54440f628aae 100644
--- a/xen/lib/Makefile
+++ b/xen/lib/Makefile
@@ -36,6 +36,7 @@ lib-y += strtol.o
lib-y += strtoll.o
lib-y += strtoul.o
lib-y += strtoull.o
+lib-$(CONFIG_X86) += x86-generic-hweightl.o
lib-$(CONFIG_X86) += xxhash32.o
lib-$(CONFIG_X86) += xxhash64.o
diff --git a/xen/lib/x86-generic-hweightl.c b/xen/lib/x86-generic-hweightl.c
new file mode 100644
index 000000000000..123a5b43928d
--- /dev/null
+++ b/xen/lib/x86-generic-hweightl.c
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <xen/bitops.h>
+#include <xen/init.h>
+#include <xen/self-tests.h>
+
+/*
+ * An implementation of generic_hweightl() used on hardware without the POPCNT
+ * instruction.
+ *
+ * This function is called from within an ALTERNATIVE in arch_hweightl().
+ * i.e. behind the back of the compiler. Therefore all registers are callee
+ * preserved.
+ *
+ * The ASM is what GCC-12 emits for generic_hweightl() in a release build of
+ * Xen, with spilling of %rdi/%rdx to preserve the caller's registers.
+ *
+ * Note: When we can use __attribute__((no_caller_saved_registers))
+ * unconditionally (GCC 7, Clang 5), we can implement this in plain C.
+ */
+asm (
+ ".type arch_generic_hweightl, STT_FUNC\n\t"
+ ".globl arch_generic_hweightl\n\t"
+ ".hidden arch_generic_hweightl\n\t"
+ ".balign " STR(CONFIG_FUNCTION_ALIGNMENT) ", 0x90\n" /* CODE_FILL */
+ "arch_generic_hweightl:\n\t"
+
+ "push %rdi\n\t"
+ "push %rdx\n\t"
+
+ "movabs $0x5555555555555555, %rdx\n\t"
+ "mov %rdi, %rax\n\t"
+ "shr $1, %rax\n\t"
+ "and %rdx, %rax\n\t"
+ "sub %rax, %rdi\n\t"
+ "movabs $0x3333333333333333, %rax\n\t"
+ "mov %rdi, %rdx\n\t"
+ "shr $2, %rdi\n\t"
+ "and %rax, %rdx\n\t"
+ "and %rax, %rdi\n\t"
+ "add %rdi, %rdx\n\t"
+ "mov %rdx, %rax\n\t"
+ "shr $4, %rax\n\t"
+ "add %rdx, %rax\n\t"
+ "movabs $0x0f0f0f0f0f0f0f0f, %rdx\n\t"
+ "and %rdx, %rax\n\t"
+ "movabs $0x0101010101010101, %rdx\n\t"
+ "imul %rdx, %rax\n\t"
+ "shr $" STR(BITS_PER_LONG) "- 8, %rax\n\t"
+
+ "pop %rdx\n\t"
+ "pop %rdi\n\t"
+
+ "ret\n\t"
+
+ ".size arch_generic_hweightl, . - arch_generic_hweightl\n\t"
+);
+
+#ifdef CONFIG_SELF_TESTS
+static void __init __constructor test_arch_generic_hweightl(void)
+{
+ RUNTIME_CHECK(arch_generic_hweightl, 0, 0);
+ RUNTIME_CHECK(arch_generic_hweightl, 1, 1);
+ RUNTIME_CHECK(arch_generic_hweightl, 3, 2);
+ RUNTIME_CHECK(arch_generic_hweightl, 7, 3);
+ RUNTIME_CHECK(arch_generic_hweightl, 0xff, 8);
+
+ RUNTIME_CHECK(arch_generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
+ RUNTIME_CHECK(arch_generic_hweightl, -1UL, BITS_PER_LONG);
+}
+#endif
--
2.39.2
2.39.2

On 05.09.2024 00:55, Andrew Cooper wrote:
> It has existed in x86 CPUs since 2008, so we're only 16 years late adding
> support. With all the other scaffolding in place, implement arch_hweightl()
> for x86.
> [...]
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Acked-by: Jan Beulich <jbeulich@suse.com>