[PATCH v2 11/11] x86/bitops: Use the POPCNT instruction when available

Andrew Cooper posted 11 patches 2 months, 3 weeks ago
There is a newer version of this series
[PATCH v2 11/11] x86/bitops: Use the POPCNT instruction when available
Posted by Andrew Cooper 2 months, 3 weeks ago
It has existed in x86 CPUs since 2008, so we're only 16 years late adding
support.  With all the other scaffolding in place, implement arch_hweightl()
for x86.

The only complication is that the call to arch_generic_hweightl() is behind
the compiler's back.  Address this by writing it in ASM and ensuring that it
preserves all registers.

Copy the code generation from generic_hweightl().  It's not a complicated
algorithm, and is easy to regenerate if needs be, but cover it with the same
unit tests as test_generic_hweightl() just for peace of mind.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>

v2:
 * Fix MISRA 8.2 (parameter name) and 8.5 (single declaration) regressions.
 * Rename {arch->x86}-generic-hweightl.{S->c}
 * Adjust ASM formatting

The __constructor trick to cause any reference to $foo() to pull in
test_$foo() only works when both are in the same TU.  i.e. what I did in
v1 (putting test_arch_generic_hweightl() in the regular generic-hweightl.c)
didn't work.

This in turn means that arch_generic_hweightl() needs writing in a global asm
block, and also that we can't use FUNC()/END().  While we could adjust it to
work for GCC/binutils, we can't have CPP macros in Clang-IAS strings.

I don't particularly like opencoding FUNC()/END(), but I can't think of
anything better.
---
 xen/arch/x86/include/asm/bitops.h | 23 +++++++++++
 xen/lib/Makefile                  |  1 +
 xen/lib/x86-generic-hweightl.c    | 69 +++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 xen/lib/x86-generic-hweightl.c

diff --git a/xen/arch/x86/include/asm/bitops.h b/xen/arch/x86/include/asm/bitops.h
index 642d8e58b288..39e37f1cbe55 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -6,6 +6,7 @@
  */
 
 #include <asm/alternative.h>
+#include <asm/asm_defns.h>
 #include <asm/cpufeatureset.h>
 
 /*
@@ -475,4 +476,26 @@ static always_inline unsigned int arch_flsl(unsigned long x)
 }
 #define arch_flsl arch_flsl
 
+unsigned int arch_generic_hweightl(unsigned long x);
+
+static always_inline unsigned int arch_hweightl(unsigned long x)
+{
+    unsigned int r;
+
+    /*
+     * arch_generic_hweightl() is written in ASM in order to preserve all
+     * registers, as the compiler can't see the call.
+     *
+     * This limits the POPCNT instruction to using the same ABI as a function
+     * call (input in %rdi, output in %eax) but that's fine.
+     */
+    alternative_io("call arch_generic_hweightl",
+                   "popcnt %[val], %q[res]", X86_FEATURE_POPCNT,
+                   ASM_OUTPUT2([res] "=a" (r) ASM_CALL_CONSTRAINT),
+                   [val] "D" (x));
+
+    return r;
+}
+#define arch_hweightl arch_hweightl
+
 #endif /* _X86_BITOPS_H */
diff --git a/xen/lib/Makefile b/xen/lib/Makefile
index b6558e108bd9..54440f628aae 100644
--- a/xen/lib/Makefile
+++ b/xen/lib/Makefile
@@ -36,6 +36,7 @@ lib-y += strtol.o
 lib-y += strtoll.o
 lib-y += strtoul.o
 lib-y += strtoull.o
+lib-$(CONFIG_X86) += x86-generic-hweightl.o
 lib-$(CONFIG_X86) += xxhash32.o
 lib-$(CONFIG_X86) += xxhash64.o
 
diff --git a/xen/lib/x86-generic-hweightl.c b/xen/lib/x86-generic-hweightl.c
new file mode 100644
index 000000000000..e0be25a01c1d
--- /dev/null
+++ b/xen/lib/x86-generic-hweightl.c
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <xen/bitops.h>
+#include <xen/init.h>
+#include <xen/self-tests.h>
+
+/*
+ * An implementation of generic_hweightl() used on hardware without the POPCNT
+ * instruction.
+ *
+ * This function is called from within an ALTERNATIVE in arch_hweightl().
+ * i.e. behind the back of the compiler.  Therefore all registers are callee
+ * preserved.
+ *
+ * The ASM is what GCC-12 emits for generic_hweightl() in a release build of
+ * Xen, with spilling of %rdi/%rdx to preserve the caller's registers.
+ *
+ * Note: When we can use __attribute__((no_caller_saved_registers))
+ *       unconditionally (GCC 7, Clang 5), we can implement this in plain C.
+ */
+asm (
+    ".type arch_generic_hweightl, STT_FUNC\n\t"
+    ".globl arch_generic_hweightl\n\t"
+    ".hidden arch_generic_hweightl\n\t"
+    ".balign " STR(CONFIG_FUNCTION_ALIGNMENT) ", 0x90\n\t"
+    "arch_generic_hweightl:\n\t"
+
+    "push   %rdi\n\t"
+    "push   %rdx\n\t"
+
+    "movabs $0x5555555555555555, %rdx\n\t"
+    "mov    %rdi, %rax\n\t"
+    "shr    $1, %rax\n\t"
+    "and    %rdx, %rax\n\t"
+    "sub    %rax, %rdi\n\t"
+    "movabs $0x3333333333333333, %rax\n\t"
+    "mov    %rdi, %rdx\n\t"
+    "shr    $2, %rdi\n\t"
+    "and    %rax, %rdx\n\t"
+    "and    %rax, %rdi\n\t"
+    "add    %rdi, %rdx\n\t"
+    "mov    %rdx, %rax\n\t"
+    "shr    $4, %rax\n\t"
+    "add    %rdx, %rax\n\t"
+    "movabs $0x0f0f0f0f0f0f0f0f, %rdx\n\t"
+    "and    %rdx, %rax\n\t"
+    "movabs $0x0101010101010101, %rdx\n\t"
+    "imul   %rdx, %rax\n\t"
+    "shr    $" STR(BITS_PER_LONG) "- 8, %rax\n\t"
+
+    "pop    %rdx\n\t"
+    "pop    %rdi\n\t"
+
+    "ret\n\t"
+
+    ".size arch_generic_hweightl, . - arch_generic_hweightl\n\t"
+);
+
+static void __init __constructor test_arch_generic_hweightl(void)
+{
+    RUNTIME_CHECK(arch_generic_hweightl, 0, 0);
+    RUNTIME_CHECK(arch_generic_hweightl, 1, 1);
+    RUNTIME_CHECK(arch_generic_hweightl, 3, 2);
+    RUNTIME_CHECK(arch_generic_hweightl, 7, 3);
+    RUNTIME_CHECK(arch_generic_hweightl, 0xff, 8);
+
+    RUNTIME_CHECK(arch_generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
+    RUNTIME_CHECK(arch_generic_hweightl, -1UL, BITS_PER_LONG);
+}
-- 
2.39.2


Re: [PATCH v2 11/11] x86/bitops: Use the POPCNT instruction when available
Posted by Jan Beulich 2 months, 3 weeks ago
On 29.08.2024 00:03, Andrew Cooper wrote:
> It has existed in x86 CPUs since 2008, so we're only 16 years late adding
> support.  With all the other scafolding in place, implement arch_hweightl()
> for x86.
> 
> The only complication is that the call to arch_generic_hweightl() is behind
> the compilers back.  Address this by writing it in ASM and ensure that it
> preserves all registers.
> 
> Copy the code generation from generic_hweightl().  It's not a complicated
> algorithm, and is easy to regenerate if needs be, but cover it with the same
> unit tests as test_generic_hweightl() just for piece of mind.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> 
> v2:
>  * Fix MISRA 8.2 (parameter name) and 8.5 (single declaration) regressions.
>  * Rename {arch->x86}-generic-hweightl.{S->c}
>  * Adjust ASM formating
> 
> The __constructor trick to cause any reference to $foo() to pull in
> test_$foo() only works when both are in the same TU.  i.e. what I did in
> v1 (putting test_arch_generic_hweightl() in the regular generic-hweightl.c)
> didn't work.

I'm afraid I don't understand this. What exactly didn't work, breaking in which
way? Presumably as much as you, I don't really like the global asm() in a C
file, when ideally the same could be written with less clutter in an assembly
one.

> This in turn means that arch_generic_hweightl() needs writing in a global asm
> block, and also that we can't use FUNC()/END().  While we could adjust it to
> work for GCC/binutils, we can't have CPP macros in Clang-IAS strings.

What does Clang different from gcc there? I was hoping that at least their pre-
processors would work in (sufficiently) similar ways.

> --- /dev/null
> +++ b/xen/lib/x86-generic-hweightl.c
> @@ -0,0 +1,69 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#include <xen/bitops.h>
> +#include <xen/init.h>
> +#include <xen/self-tests.h>
> +
> +/*
> + * An implementation of generic_hweightl() used on hardware without the POPCNT
> + * instruction.
> + *
> + * This function is called from within an ALTERNATIVE in arch_hweightl().
> + * i.e. behind the back of the compiler.  Therefore all registers are callee
> + * preserved.
> + *
> + * The ASM is what GCC-12 emits for generic_hweightl() in a release build of
> + * Xen, with spilling of %rdi/%rdx to preserve the callers registers.
> + *
> + * Note: When we can use __attribute__((no_caller_saved_registers))
> + *       unconditionally (GCC 7, Clang 5), we can implement this in plain C.
> + */
> +asm (
> +    ".type arch_generic_hweightl, STT_FUNC\n\t"
> +    ".globl arch_generic_hweightl\n\t"
> +    ".hidden arch_generic_hweightl\n\t"
> +    ".balign " STR(CONFIG_FUNCTION_ALIGNMENT) ", 0x90\n\t"

Maybe better avoid open-coding CODE_FILL, in case we want to change that
down the road?

Also could I talk you into dropping the \t there? Canonical assembly code
wants ...

> +    "arch_generic_hweightl:\n\t"

.. labels unindented.

Jan

Re: [PATCH v2 11/11] x86/bitops: Use the POPCNT instruction when available
Posted by Andrew Cooper 2 months, 2 weeks ago
On 29/08/2024 3:36 pm, Jan Beulich wrote:
> On 29.08.2024 00:03, Andrew Cooper wrote:
>> +asm (
>> +    ".type arch_generic_hweightl, STT_FUNC\n\t"
>> +    ".globl arch_generic_hweightl\n\t"
>> +    ".hidden arch_generic_hweightl\n\t"
>> +    ".balign " STR(CONFIG_FUNCTION_ALIGNMENT) ", 0x90\n\t"
> Maybe better avoid open-coding CODE_FILL, in case we want to change that
> down the road?

Sadly not.  They're both inside #ifdef __ASSEMBLY__, causing this to
turn into a "CODE_FILL" literal.

For now I'll leave some grep fodder, until we can think of a nicer way
of doing this.

> Also could I talk you into dropping the \t there?

Done.

~Andrew

Re: [PATCH v2 11/11] x86/bitops: Use the POPCNT instruction when available
Posted by Andrew Cooper 2 months, 3 weeks ago
On 29/08/2024 3:36 pm, Jan Beulich wrote:
> On 29.08.2024 00:03, Andrew Cooper wrote:
>> It has existed in x86 CPUs since 2008, so we're only 16 years late adding
>> support.  With all the other scafolding in place, implement arch_hweightl()
>> for x86.
>>
>> The only complication is that the call to arch_generic_hweightl() is behind
>> the compilers back.  Address this by writing it in ASM and ensure that it
>> preserves all registers.
>>
>> Copy the code generation from generic_hweightl().  It's not a complicated
>> algorithm, and is easy to regenerate if needs be, but cover it with the same
>> unit tests as test_generic_hweightl() just for piece of mind.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> ---
>> CC: Jan Beulich <JBeulich@suse.com>
>> CC: Roger Pau Monné <roger.pau@citrix.com>
>>
>> v2:
>>  * Fix MISRA 8.2 (parameter name) and 8.5 (single declaration) regressions.
>>  * Rename {arch->x86}-generic-hweightl.{S->c}
>>  * Adjust ASM formating
>>
>> The __constructor trick to cause any reference to $foo() to pull in
>> test_$foo() only works when both are in the same TU.  i.e. what I did in
>> v1 (putting test_arch_generic_hweightl() in the regular generic-hweightl.c)
>> didn't work.
> I'm afraid I don't understand this. What exactly didn't work, breaking in which
> way? Presumably as much as you, I don't really like the global asm() in a C
> file, when ideally the same could be written with less clutter in an assembly
> one.

We have lib-y because we wish to not include arch_generic_hweightl() if
it isn't referenced.

But, test_arch_generic_hweightl() unconditionally references
arch_generic_hweightl() (in CONFIG_SELF_TESTS builds).

So, the trick is to have both {test_,}arch_generic_hweightl() together
in the same object file, with test_* being entirely self contained as a
constructor.

That way, if and only if something else references
arch_generic_hweightl() do we pull in this object file, and pick up the
tests as side effect.


v1 of this patch had the test in a separate object file, causing
arch_generic_hweightl() to be referenced based on the inclusion criteria
for regular generic-hweightl.c

This patch causes the x86 build of Xen to no longer reference
generic_hweightl(), so it was excluded along with its own tests, and
test_arch_generic_hweightl().

Therefore, we had arch_generic_hweightl() in use, but the selftests not
included.


Both {test_,}arch_generic_hweightl() do need to be in the same TU for
this to work (nicely), and I'm very uninterested in writing
test_arch_generic_hweightl() in asm.


>
>> This in turn means that arch_generic_hweightl() needs writing in a global asm
>> block, and also that we can't use FUNC()/END().  While we could adjust it to
>> work for GCC/binutils, we can't have CPP macros in Clang-IAS strings.
> What does Clang different from gcc there? I was hoping that at least their pre-
> processors would work in (sufficiently) similar ways.

It's inside a string, not outside, so CPP on the C file does nothing.

Passing CPP over the intermediate .S would work, but we don't have an
intermediate .S with Clang-IAS.

I'm also not particularly interested in doubling up xen/linkage.h with
different quoting in !__ASSEMBLY__ case.


One other option which comes to mind is to have:

    .macro func name
        FUNC(\name)
    .endm

which (I think) could be used as:

    asm (
        "FUNC foo\n\t"
        ...
        "ret\n\t"
        "END foo"
    );

It reads almost the same, and avoids opencoding contents of FUNC, but
even this requires changing the __ASSEMBLY__-ness of linkage.h and I
can't see a nice way of making it happen.

Or we finally bump our minimum toolchain version...  GCC 7 is already
old enough to be out of the recent LTS's...

~Andrew

Re: [PATCH v2 11/11] x86/bitops: Use the POPCNT instruction when available
Posted by Jan Beulich 2 months, 3 weeks ago
On 30.08.2024 22:03, Andrew Cooper wrote:
> On 29/08/2024 3:36 pm, Jan Beulich wrote:
>> On 29.08.2024 00:03, Andrew Cooper wrote:
>>> It has existed in x86 CPUs since 2008, so we're only 16 years late adding
>>> support.  With all the other scafolding in place, implement arch_hweightl()
>>> for x86.
>>>
>>> The only complication is that the call to arch_generic_hweightl() is behind
>>> the compilers back.  Address this by writing it in ASM and ensure that it
>>> preserves all registers.
>>>
>>> Copy the code generation from generic_hweightl().  It's not a complicated
>>> algorithm, and is easy to regenerate if needs be, but cover it with the same
>>> unit tests as test_generic_hweightl() just for piece of mind.
>>>
>>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>> ---
>>> CC: Jan Beulich <JBeulich@suse.com>
>>> CC: Roger Pau Monné <roger.pau@citrix.com>
>>>
>>> v2:
>>>  * Fix MISRA 8.2 (parameter name) and 8.5 (single declaration) regressions.
>>>  * Rename {arch->x86}-generic-hweightl.{S->c}
>>>  * Adjust ASM formating
>>>
>>> The __constructor trick to cause any reference to $foo() to pull in
>>> test_$foo() only works when both are in the same TU.  i.e. what I did in
>>> v1 (putting test_arch_generic_hweightl() in the regular generic-hweightl.c)
>>> didn't work.
>> I'm afraid I don't understand this. What exactly didn't work, breaking in which
>> way? Presumably as much as you, I don't really like the global asm() in a C
>> file, when ideally the same could be written with less clutter in an assembly
>> one.
> 
> We have lib-y because we wish to not include arch_generic_hweightl() if
> it isn't referenced.
> 
> But, test_arch_generic_hweightl() unconditionally references
> arch_generic_hweightl() (in CONFIG_SELF_TESTS builds).

Which I assumed was intentional: Always have the tests, to make sure the code
doesn't go stale (or even break).

Looking at the v2 code again I notice that while now you're running the tests
only when the to-be-tested function is referenced from elsewhere, the testing
has become independent of CONFIG_SELF_TESTS. I don't think that was intended?

> Both {test_,}arch_generic_hweightl() do need to be in the same TU for
> this to work (nicely), and I'm very uninterested in writing
> test_arch_generic_hweightl() in asm.

Well - surely absolutely no question about this last aspect.

>>> This in turn means that arch_generic_hweightl() needs writing in a global asm
>>> block, and also that we can't use FUNC()/END().  While we could adjust it to
>>> work for GCC/binutils, we can't have CPP macros in Clang-IAS strings.
>> What does Clang different from gcc there? I was hoping that at least their pre-
>> processors would work in (sufficiently) similar ways.
> 
> It's inside a string, not outside, so CPP on the C file does nothing.
> 
> Passing CPP over the intermediate .S would work, but we don't have an
> intermediate .S with Clang-IAS.

There's never an intermediate .S; it's always .s (not subject to C pre-
processing). Therefore I continue to struggle with how CPP macros would
come into (inconsistent) play here, unless they were used in construction
of the string literal (and then work similarly for gcc and Clang).

But as you say, we want to avoid playing with the intermediate file in
any event.

Jan

Re: [PATCH v2 11/11] x86/bitops: Use the POPCNT instruction when available
Posted by Andrew Cooper 2 months, 2 weeks ago
On 02/09/2024 9:06 am, Jan Beulich wrote:
> On 30.08.2024 22:03, Andrew Cooper wrote:
>> On 29/08/2024 3:36 pm, Jan Beulich wrote:
>>> On 29.08.2024 00:03, Andrew Cooper wrote:
>>>> It has existed in x86 CPUs since 2008, so we're only 16 years late adding
>>>> support.  With all the other scafolding in place, implement arch_hweightl()
>>>> for x86.
>>>>
>>>> The only complication is that the call to arch_generic_hweightl() is behind
>>>> the compilers back.  Address this by writing it in ASM and ensure that it
>>>> preserves all registers.
>>>>
>>>> Copy the code generation from generic_hweightl().  It's not a complicated
>>>> algorithm, and is easy to regenerate if needs be, but cover it with the same
>>>> unit tests as test_generic_hweightl() just for piece of mind.
>>>>
>>>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>>> ---
>>>> CC: Jan Beulich <JBeulich@suse.com>
>>>> CC: Roger Pau Monné <roger.pau@citrix.com>
>>>>
>>>> v2:
>>>>  * Fix MISRA 8.2 (parameter name) and 8.5 (single declaration) regressions.
>>>>  * Rename {arch->x86}-generic-hweightl.{S->c}
>>>>  * Adjust ASM formating
>>>>
>>>> The __constructor trick to cause any reference to $foo() to pull in
>>>> test_$foo() only works when both are in the same TU.  i.e. what I did in
>>>> v1 (putting test_arch_generic_hweightl() in the regular generic-hweightl.c)
>>>> didn't work.
>>> I'm afraid I don't understand this. What exactly didn't work, breaking in which
>>> way? Presumably as much as you, I don't really like the global asm() in a C
>>> file, when ideally the same could be written with less clutter in an assembly
>>> one.
>> We have lib-y because we wish to not include arch_generic_hweightl() if
>> it isn't referenced.
>>
>> But, test_arch_generic_hweightl() unconditionally references
>> arch_generic_hweightl() (in CONFIG_SELF_TESTS builds).
> Which I assumed was intentional: Always have the tests, to make sure the code
> doesn't go stale (or even break).

Come to think of it, we already have two subtly different things.

common/bitops.c is SELF_TESTS only, and does both compile time
(constant-folding) and runtime (non-folded) tests on the top-level APIs.

This checks that the compiler agrees with the test parameters that I (or
hopefully in the future, others) have chosen, and that every arch_$foo()
override agrees with the compiler version.

It has already found bugs.  Both __builtin foo() vs fool() mismatches
and I dread to think how long a bugs like that could have gone
unnoticed.  I caught ppc's arch_hweightl() before posting v1, but noone
noticed ARM's arch_flsl() snafu when it was on the list (I found it when
retrofitting SELF_TESTS around the series).


Separately, anything that causes any {arch_,}generic_$foo()'s to be
included also pulls in the SELF_TEST for that specific function. 
They're runtime only tests (no constant folding in an intentionally
out-of-line function).

But, it was intentional to try and make CONFIG_SELF_TESTS not blindly
pull in the unused generic_$foo()'s.  They'll get tested if they are
referenced in a build, but the tests are only liable to break during
bitops development or a new/weird arch/environments.

Furthermore, I expect (/hope) that we'll get to the point where we're
being conservative with selftests.  It's part of why I'm trying to keep
the current ones pretty lean.


> Looking at the v2 code again I notice that while now you're running the tests
> only when the to-be-tested function is referenced from elsewhere, the testing
> has become independent of CONFIG_SELF_TESTS. I don't think that was intended?

Correct - that was an oversight.  The CONFIG_SELF_TESTS guard wants to
go back in.

~Andrew