One aspect of the old ffs()/etc infrastructure was the use of TZCNT/LZCNT
without a CPUID check.  Given no obvious justification, I dropped it during
the cleanup.
It turns out there is a good reason, on all but the most recent AMD CPUs.
Reinstate the use of "blind" TZCNT/LZCNT when safe to do so.  This happens to
be preferentially in loops where a repeated saving of 5-6 uops becomes far
more relevant than a one-off scenario.
Leave an explanation of why.
No functional change.
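
Illustrative aside, not part of the patch: the reason a blind TZCNT can be
tolerated without a CPUID check is that TZCNT is encoded as a REP-prefixed
BSF, and CPUs lacking BMI1 ignore the prefix and execute plain BSF, which
returns the same answer as TZCNT for every non-zero input.  The two differ
only for a zero input, which is exactly the case excluded by the x > 0 guard
in the code below.  A minimal standalone sketch (not Xen code; my_bsf() and
my_tzcnt() are hypothetical helpers) demonstrating the agreement:

#include <assert.h>

static unsigned int my_bsf(unsigned int x)
{
    unsigned int r;
    asm ( "bsf %[val], %[res]" : [res] "=r" (r) : [val] "rm" (x) );
    return r;
}

static unsigned int my_tzcnt(unsigned int x)
{
    unsigned int r;
    /* Same bytes as "tzcnt"; pre-BMI1 CPUs execute this as plain BSF. */
    asm ( "rep bsf %[val], %[res]" : [res] "=r" (r) : [val] "rm" (x) );
    return r;
}

int main(void)
{
    /* Every non-zero single-bit input yields the same result both ways. */
    for ( unsigned int x = 1; x; x <<= 1 )
        assert(my_bsf(x) == my_tzcnt(x));

    return 0;
}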
Fixes: 989e5f08d33e ("x86/bitops: Improve arch_ffs() in the general case")
Fixes: 5ed26fc0768d ("xen/bitops: Implement ffsl() in common logic")
Fixes: 54b10ef6c810 ("xen/bitops: Implement fls()/flsl() in common logic")
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/include/asm/bitops.h | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/xen/arch/x86/include/asm/bitops.h b/xen/arch/x86/include/asm/bitops.h
index 87eac7782f10..62e6ca26f5b0 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -399,9 +399,16 @@ static always_inline unsigned int arch_ffs(unsigned int x)
          *
          * and the optimiser really can work with the knowledge of x being
          * non-zero without knowing it's exact value, in which case we don't
-         * need to compensate for BSF's corner cases.  Otherwise...
+         * need to compensate for BSF's corner cases.
+         *
+         * That said, we intentionally use TZCNT on capable hardware when the
+         * behaviour for the 0 case doesn't matter.  On AMD CPUs prior to
+         * Zen4, TZCNT is 1-2 uops while BSF is 6-8 with a latency to match.
+         * Intel CPUs don't suffer this discrepancy.
+         *
+         * Otherwise...
          */
-        asm ( "bsf %[val], %[res]"
+        asm ( "rep bsf %[val], %[res]"
               : [res] "=r" (r)
               : [val] "rm" (x) );
     }
@@ -428,7 +435,7 @@ static always_inline unsigned int arch_ffsl(unsigned long x)
 
     /* See arch_ffs() for safety discussions. */
     if ( __builtin_constant_p(x > 0) && x > 0 )
-        asm ( "bsf %[val], %q[res]"
+        asm ( "rep bsf %[val], %q[res]"
               : [res] "=r" (r)
               : [val] "rm" (x) );
     else
@@ -446,7 +453,7 @@ static always_inline unsigned int arch_fls(unsigned int x)
 
     /* See arch_ffs() for safety discussions. */
     if ( __builtin_constant_p(x > 0) && x > 0 )
-        asm ( "bsr %[val], %[res]"
+        asm ( "rep bsr %[val], %[res]"
               : [res] "=r" (r)
               : [val] "rm" (x) );
     else
@@ -464,7 +471,7 @@ static always_inline unsigned int arch_flsl(unsigned long x)
 
     /* See arch_ffs() for safety discussions. */
     if ( __builtin_constant_p(x > 0) && x > 0 )
-        asm ( "bsr %[val], %q[res]"
+        asm ( "rep bsr %[val], %q[res]"
               : [res] "=r" (r)
               : [val] "rm" (x) );
     else
base-commit: d965e2ee07c56c341d8896852550914d87ea5374
prerequisite-patch-id: 8da89000c73c38aab6abfa6622217ea9eff07fbd
prerequisite-patch-id: 74830838bac94ed1e036a8173cf3210a314b35d8
prerequisite-patch-id: 0654835c28df8d40b6c97006d041c4d31447a9a6
prerequisite-patch-id: 2d47d646c6a6e0019918c57753d6c01a752c377f
prerequisite-patch-id: d8f8c4221a2d7039bae6f3d38e93fe90b2091d01
prerequisite-patch-id: e0397c86b545a1d65f2e6b2049c282b926c40c64
prerequisite-patch-id: ea21abe4540ee229f4f8725ce3f701d9ba4bd4a8
-- 
2.39.5
On 28.05.2025 00:29, Andrew Cooper wrote:
> One aspect of the old ffs()/etc infrastructure was the use of TZCNT/LZCNT
> without a CPUID check.  Given no obvious justification, I dropped it during
> the cleanup.
> 
> It turns out there is a good reason, on all but the most recent AMD CPUs.
> 
> Reinstate the use of "blind" TZCNT/LZCNT when safe to do so.  This happens to
> be preferentially in loops where a repeated saving of 5-6 uops becomes far
> more relevant than a one-off scenario.
> 
> Leave an explanation of why.
> 
> No functional change.
> 
> Fixes: 989e5f08d33e ("x86/bitops: Improve arch_ffs() in the general case")
> Fixes: 5ed26fc0768d ("xen/bitops: Implement ffsl() in common logic")
> Fixes: 54b10ef6c810 ("xen/bitops: Implement fls()/flsl() in common logic")
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
In principle
Reviewed-by: Jan Beulich <jbeulich@suse.com>
but ...
> --- a/xen/arch/x86/include/asm/bitops.h
> +++ b/xen/arch/x86/include/asm/bitops.h
> @@ -399,9 +399,16 @@ static always_inline unsigned int arch_ffs(unsigned int x)
>           *
>           * and the optimiser really can work with the knowledge of x being
>           * non-zero without knowing it's exact value, in which case we don't
> -         * need to compensate for BSF's corner cases.  Otherwise...
> +         * need to compensate for BSF's corner cases.
> +         *
> +         * That said, we intentionally use TZCNT on capable hardware when the
> +         * behaviour for the 0 case doesn't matter.  On AMD CPUs prior to
> +         * Zen4, TZCNT is 1-2 uops while BSF is 6-8 with a latency to match.
> +         * Intel CPUs don't suffer this discrepancy.
> +         *
> +         * Otherwise...
>           */
> -        asm ( "bsf %[val], %[res]"
> +        asm ( "rep bsf %[val], %[res]"
... why would we use REP (again) when gas 2.25 supports LZCNT/TZCNT?
Jan
>                : [res] "=r" (r)
>                : [val] "rm" (x) );
>      }
> @@ -428,7 +435,7 @@ static always_inline unsigned int arch_ffsl(unsigned long x)
>  
>      /* See arch_ffs() for safety discussions. */
>      if ( __builtin_constant_p(x > 0) && x > 0 )
> -        asm ( "bsf %[val], %q[res]"
> +        asm ( "rep bsf %[val], %q[res]"
>                : [res] "=r" (r)
>                : [val] "rm" (x) );
>      else
> @@ -446,7 +453,7 @@ static always_inline unsigned int arch_fls(unsigned int x)
>  
>      /* See arch_ffs() for safety discussions. */
>      if ( __builtin_constant_p(x > 0) && x > 0 )
> -        asm ( "bsr %[val], %[res]"
> +        asm ( "rep bsr %[val], %[res]"
>                : [res] "=r" (r)
>                : [val] "rm" (x) );
>      else
> @@ -464,7 +471,7 @@ static always_inline unsigned int arch_flsl(unsigned long x)
>  
>      /* See arch_ffs() for safety discussions. */
>      if ( __builtin_constant_p(x > 0) && x > 0 )
> -        asm ( "bsr %[val], %q[res]"
> +        asm ( "rep bsr %[val], %q[res]"
>                : [res] "=r" (r)
>                : [val] "rm" (x) );
>      else
> 
> base-commit: d965e2ee07c56c341d8896852550914d87ea5374
> prerequisite-patch-id: 8da89000c73c38aab6abfa6622217ea9eff07fbd
> prerequisite-patch-id: 74830838bac94ed1e036a8173cf3210a314b35d8
> prerequisite-patch-id: 0654835c28df8d40b6c97006d041c4d31447a9a6
> prerequisite-patch-id: 2d47d646c6a6e0019918c57753d6c01a752c377f
> prerequisite-patch-id: d8f8c4221a2d7039bae6f3d38e93fe90b2091d01
> prerequisite-patch-id: e0397c86b545a1d65f2e6b2049c282b926c40c64
> prerequisite-patch-id: ea21abe4540ee229f4f8725ce3f701d9ba4bd4a8
                
One aspect of the old ffs() infrastructure was the use of TZCNT without a
CPUID check.  Given no obvious justification, I dropped it during the cleanup.
It turns out there is a good reason, on all but the most recent AMD CPUs.
Reinstate the use of "blind" TZCNT when safe to do so.  This happens to be
preferentially in loops where a repeated saving of 5-6 uops becomes far more
relevant than a one-off scenario.  Leave an explanation of why.
It is not safe to perform this transformation for BSR and LZCNT, because they
count from opposite ends of the input operand.  Furthermore, this incorrect
transform was not caught by the selftests.
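
To make the asymmetry concrete: for a non-zero operand, BSF and TZCNT both
produce the trailing zero count, so swapping one for the other is safe, but
BSR produces the index of the highest set bit while LZCNT produces the
leading zero count, i.e. lzcnt(x) == 31 - bsr(x) for 32-bit x.  A standalone
sketch (not Xen code; bsf() and bsr() are hypothetical helpers using GNU
inline asm, with compiler builtins modelling the TZCNT/LZCNT results):

#include <assert.h>

static unsigned int bsf(unsigned int x)
{
    unsigned int r;
    asm ( "bsf %[val], %[res]" : [res] "=r" (r) : [val] "rm" (x) );
    return r;
}

static unsigned int bsr(unsigned int x)
{
    unsigned int r;
    asm ( "bsr %[val], %[res]" : [res] "=r" (r) : [val] "rm" (x) );
    return r;
}

int main(void)
{
    unsigned int x = 0x00080000;            /* Only bit 19 set. */
    unsigned int tzcnt = __builtin_ctz(x);  /* TZCNT semantics: 19 */
    unsigned int lzcnt = __builtin_clz(x);  /* LZCNT semantics: 12 */

    assert(bsf(x) == tzcnt);       /* 19 == 19: BSF -> TZCNT is safe.     */
    assert(bsr(x) != lzcnt);       /* 19 != 12: BSR -> LZCNT is not.      */
    assert(bsr(x) == 31 - lzcnt);  /* They count from opposite ends.      */

    return 0;
}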
Extend the selftests with test_for_each_set_bit_reverse() as that exercises
the "x known non-zero" optimsiation path in x86's helpers.  Adjust the
existing test_for_each_set_bit() to intentionally use an asymmetric input bit
pattern, otherwise "counting from the wrong end" bugs continue to slip by.
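
As an illustration of why the asymmetric pattern matters (standalone sketch,
not Xen code; wrong_end() and rebuild() are hypothetical helpers modelling a
bitop that counts from the wrong end of a 64-bit operand): a pattern which
is symmetric about the middle of the operand maps onto itself when every
index is mirrored, so the reconstruction check still passes, whereas setting
bit 4 breaks the symmetry and makes the check fail.

#include <assert.h>
#include <stdint.h>

static unsigned int wrong_end(unsigned int bit)
{
    return 63 - bit;
}

static uint64_t rebuild(uint64_t val, int buggy)
{
    uint64_t res = 0;

    for ( unsigned int i = 0; i < 64; ++i )
        if ( val & (1ULL << i) )
            res |= 1ULL << (buggy ? wrong_end(i) : i);

    return res;
}

int main(void)
{
    /* Symmetric pattern (bits 0, 31, 32, 63): mirroring each index gives
     * the same set, so the buggy rebuild still matches and the bug hides. */
    assert(rebuild(0x8000000180000001ULL, 1) == 0x8000000180000001ULL);

    /* Asymmetric pattern (bits 0, 4, 31, 32, 63): bit 4 mirrors to bit 59,
     * so the buggy rebuild no longer matches and the test catches it. */
    assert(rebuild(0x8000000180000011ULL, 1) != 0x8000000180000011ULL);

    return 0;
}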
No functional change.
Fixes: 989e5f08d33e ("x86/bitops: Improve arch_ffs() in the general case")
Fixes: 5ed26fc0768d ("xen/bitops: Implement ffsl() in common logic")
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
v2:
 * Use TZCNT mnemonic given the updated toolchain baseline (see the encoding sketch below)
 * Do not convert LZCNT.
 * Extend selftests.
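
On the first point: "rep bsf" and "tzcnt" assemble to identical bytes
(f3 0f bc /r); only assembler support for the mnemonic differs, hence the
binutils >= 2.25 baseline.  A standalone sketch (not Xen code; the function
names are hypothetical) which can be compiled and disassembled, e.g. with
"gcc -c" and "objdump -d", to confirm both functions contain the same
encoding:

unsigned int tzcnt_via_rep_bsf(unsigned int x)
{
    unsigned int r;
    asm ( "rep bsf %[val], %[res]" : [res] "=r" (r) : [val] "rm" (x) );
    return r;
}

unsigned int tzcnt_via_mnemonic(unsigned int x)
{
    unsigned int r;
    asm ( "tzcnt %[val], %[res]" : [res] "=r" (r) : [val] "rm" (x) );
    return r;
}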
https://gitlab.com/xen-project/hardware/xen-staging/-/pipelines/2004918013
I've kept Jan's R-by as this is a subset of what was previously reviewed, but
I'm posting it for clarity as there are a lot of selftest changes.
---
 xen/arch/x86/include/asm/bitops.h | 13 ++++--
 xen/common/bitops.c               | 78 ++++++++++++++++++++++++++++++-
 2 files changed, 86 insertions(+), 5 deletions(-)
diff --git a/xen/arch/x86/include/asm/bitops.h b/xen/arch/x86/include/asm/bitops.h
index 87eac7782f10..b17d67e8ca61 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -399,9 +399,16 @@ static always_inline unsigned int arch_ffs(unsigned int x)
          *
          * and the optimiser really can work with the knowledge of x being
          * non-zero without knowing it's exact value, in which case we don't
-         * need to compensate for BSF's corner cases.  Otherwise...
+         * need to compensate for BSF's corner cases.
+         *
+         * That said, we intentionally use TZCNT on capable hardware when the
+         * behaviour for the 0 case doesn't matter.  On AMD CPUs prior to
+         * Zen4, TZCNT is 1-2 uops while BSF is 6-8 with a latency to match.
+         * Intel CPUs don't suffer this discrepancy.
+         *
+         * Otherwise...
          */
-        asm ( "bsf %[val], %[res]"
+        asm ( "tzcnt %[val], %[res]"
               : [res] "=r" (r)
               : [val] "rm" (x) );
     }
@@ -428,7 +435,7 @@ static always_inline unsigned int arch_ffsl(unsigned long x)
 
     /* See arch_ffs() for safety discussions. */
     if ( __builtin_constant_p(x > 0) && x > 0 )
-        asm ( "bsf %[val], %q[res]"
+        asm ( "tzcnt %[val], %q[res]"
               : [res] "=r" (r)
               : [val] "rm" (x) );
     else
diff --git a/xen/common/bitops.c b/xen/common/bitops.c
index bd17ddef5700..14e6265037f1 100644
--- a/xen/common/bitops.c
+++ b/xen/common/bitops.c
@@ -97,14 +97,14 @@ static void __init test_for_each_set_bit(void)
     if ( ui != ui_res )
         panic("for_each_set_bit(uint) expected %#x, got %#x\n", ui, ui_res);
 
-    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 1);
+    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
     for_each_set_bit ( i, ul )
         ul_res |= 1UL << i;
 
     if ( ul != ul_res )
         panic("for_each_set_bit(ulong) expected %#lx, got %#lx\n", ul, ul_res);
 
-    ull = HIDE(0x8000000180000001ULL);
+    ull = HIDE(0x8000000180000011ULL);
     for_each_set_bit ( i, ull )
         ull_res |= 1ULL << i;
 
@@ -127,6 +127,79 @@ static void __init test_for_each_set_bit(void)
         panic("for_each_set_bit(break) expected 0x1008, got %#x\n", ui_res);
 }
 
+/*
+ * A type-generic fls() which picks the appropriate fls{,l,64}() based on it's
+ * argument.
+ */
+#define fls_g(x)                                        \
+    (sizeof(x) <= sizeof(int)      ? fls(x) :           \
+     sizeof(x) <= sizeof(long)     ? flsl(x) :          \
+     sizeof(x) <= sizeof(uint64_t) ? fls64(x) :         \
+     ({ BUILD_ERROR("fls_g() Bad input type"); 0; }))
+
+/*
+ * for_each_set_bit_reverse() - Iterate over all set bits in a scalar value,
+ * from MSB to LSB.
+ *
+ * @iter An iterator name.  Scoped is within the loop only.
+ * @val  A scalar value to iterate over.
+ *
+ * A copy of @val is taken internally.
+ */
+#define for_each_set_bit_reverse(iter, val)             \
+    for ( typeof(val) __v = (val); __v; __v = 0 )       \
+        for ( unsigned int (iter);                      \
+              __v && ((iter) = fls_g(__v) - 1, true);   \
+              __clear_bit(iter, &__v) )
+
+/*
+ * Xen doesn't have need of for_each_set_bit_reverse() at present, but the
+ * construct does exercise a case of arch_fls*() not covered anywhere else by
+ * these tests.
+ */
+static void __init test_for_each_set_bit_reverse(void)
+{
+    unsigned int  ui,  ui_res = 0, tmp;
+    unsigned long ul,  ul_res = 0;
+    uint64_t      ull, ull_res = 0;
+
+    ui = HIDE(0x80008001U);
+    for_each_set_bit_reverse ( i, ui )
+        ui_res |= 1U << i;
+
+    if ( ui != ui_res )
+        panic("for_each_set_bit_reverse(uint) expected %#x, got %#x\n", ui, ui_res);
+
+    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
+    for_each_set_bit_reverse ( i, ul )
+        ul_res |= 1UL << i;
+
+    if ( ul != ul_res )
+        panic("for_each_set_bit_reverse(ulong) expected %#lx, got %#lx\n", ul, ul_res);
+
+    ull = HIDE(0x8000000180000011ULL);
+    for_each_set_bit_reverse ( i, ull )
+        ull_res |= 1ULL << i;
+
+    if ( ull != ull_res )
+        panic("for_each_set_bit_reverse(uint64) expected %#"PRIx64", got %#"PRIx64"\n", ull, ull_res);
+
+    /* Check that we can break from the middle of the loop. */
+    ui = HIDE(0x80001008U);
+    tmp = 0;
+    ui_res = 0;
+    for_each_set_bit_reverse ( i, ui )
+    {
+        if ( tmp++ > 1 )
+            break;
+
+        ui_res |= 1U << i;
+    }
+
+    if ( ui_res != 0x80001000U )
+        panic("for_each_set_bit_reverse(break) expected 0x80001000, got %#x\n", ui_res);
+}
+
 static void __init test_multiple_bits_set(void)
 {
     /*
@@ -169,6 +242,7 @@ static void __init __constructor test_bitops(void)
     test_ffs();
     test_fls();
     test_for_each_set_bit();
+    test_for_each_set_bit_reverse();
 
     test_multiple_bits_set();
     test_hweight();
-- 
2.39.5
On 26.08.2025 19:41, Andrew Cooper wrote:
> --- a/xen/common/bitops.c
> +++ b/xen/common/bitops.c
> @@ -97,14 +97,14 @@ static void __init test_for_each_set_bit(void)
>      if ( ui != ui_res )
>          panic("for_each_set_bit(uint) expected %#x, got %#x\n", ui, ui_res);
>  
> -    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 1);
> +    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
>      for_each_set_bit ( i, ul )
>          ul_res |= 1UL << i;
>  
>      if ( ul != ul_res )
>          panic("for_each_set_bit(ulong) expected %#lx, got %#lx\n", ul, ul_res);
>  
> -    ull = HIDE(0x8000000180000001ULL);
> +    ull = HIDE(0x8000000180000011ULL);
>      for_each_set_bit ( i, ull )
>          ull_res |= 1ULL << i;
How do these changes make a difference? Apart from ffs() using TZCNT, ...
> @@ -127,6 +127,79 @@ static void __init test_for_each_set_bit(void)
>          panic("for_each_set_bit(break) expected 0x1008, got %#x\n", ui_res);
>  }
>  
> +/*
> + * A type-generic fls() which picks the appropriate fls{,l,64}() based on it's
> + * argument.
> + */
> +#define fls_g(x)                                        \
> +    (sizeof(x) <= sizeof(int)      ? fls(x) :           \
> +     sizeof(x) <= sizeof(long)     ? flsl(x) :          \
> +     sizeof(x) <= sizeof(uint64_t) ? fls64(x) :         \
> +     ({ BUILD_ERROR("fls_g() Bad input type"); 0; }))
> +
> +/*
> + * for_each_set_bit_reverse() - Iterate over all set bits in a scalar value,
> + * from MSB to LSB.
> + *
> + * @iter An iterator name.  Scoped is within the loop only.
> + * @val  A scalar value to iterate over.
> + *
> + * A copy of @val is taken internally.
> + */
> +#define for_each_set_bit_reverse(iter, val)             \
> +    for ( typeof(val) __v = (val); __v; __v = 0 )       \
> +        for ( unsigned int (iter);                      \
> +              __v && ((iter) = fls_g(__v) - 1, true);   \
> +              __clear_bit(iter, &__v) )
> +
> +/*
> + * Xen doesn't have need of for_each_set_bit_reverse() at present, but the
> + * construct does exercise a case of arch_fls*() not covered anywhere else by
> + * these tests.
> + */
> +static void __init test_for_each_set_bit_reverse(void)
> +{
> +    unsigned int  ui,  ui_res = 0, tmp;
> +    unsigned long ul,  ul_res = 0;
> +    uint64_t      ull, ull_res = 0;
> +
> +    ui = HIDE(0x80008001U);
> +    for_each_set_bit_reverse ( i, ui )
> +        ui_res |= 1U << i;
> +
> +    if ( ui != ui_res )
> +        panic("for_each_set_bit_reverse(uint) expected %#x, got %#x\n", ui, ui_res);
> +
> +    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
> +    for_each_set_bit_reverse ( i, ul )
> +        ul_res |= 1UL << i;
> +
> +    if ( ul != ul_res )
> +        panic("for_each_set_bit_reverse(ulong) expected %#lx, got %#lx\n", ul, ul_res);
> +
> +    ull = HIDE(0x8000000180000011ULL);
> +    for_each_set_bit_reverse ( i, ull )
> +        ull_res |= 1ULL << i;
... even here the need for the extra setting of bit 4 remains unclear to
me: The thing that was missing was the testing of the reverse for-each.
You mention the need for an asymmetric input in the description, but isn't
that covered already by the first test using 0x80008001U?
That said, I certainly don't mind the change, it just isn't quite clear to
me whether I'm missing something crucial.
Jan
                
On 27/08/2025 8:52 am, Jan Beulich wrote:
> On 26.08.2025 19:41, Andrew Cooper wrote:
>> --- a/xen/common/bitops.c
>> +++ b/xen/common/bitops.c
>> @@ -97,14 +97,14 @@ static void __init test_for_each_set_bit(void)
>>      if ( ui != ui_res )
>>          panic("for_each_set_bit(uint) expected %#x, got %#x\n", ui, ui_res);
>>  
>> -    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 1);
>> +    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
>>      for_each_set_bit ( i, ul )
>>          ul_res |= 1UL << i;
>>  
>>      if ( ul != ul_res )
>>          panic("for_each_set_bit(ulong) expected %#lx, got %#lx\n", ul, ul_res);
>>  
>> -    ull = HIDE(0x8000000180000001ULL);
>> +    ull = HIDE(0x8000000180000011ULL);
>>      for_each_set_bit ( i, ull )
>>          ull_res |= 1ULL << i;
> How do these changes make a difference? Apart from ffs() using TZCNT, ...
>
>> @@ -127,6 +127,79 @@ static void __init test_for_each_set_bit(void)
>>          panic("for_each_set_bit(break) expected 0x1008, got %#x\n", ui_res);
>>  }
>>  
>> +/*
>> + * A type-generic fls() which picks the appropriate fls{,l,64}() based on it's
>> + * argument.
>> + */
>> +#define fls_g(x)                                        \
>> +    (sizeof(x) <= sizeof(int)      ? fls(x) :           \
>> +     sizeof(x) <= sizeof(long)     ? flsl(x) :          \
>> +     sizeof(x) <= sizeof(uint64_t) ? fls64(x) :         \
>> +     ({ BUILD_ERROR("fls_g() Bad input type"); 0; }))
>> +
>> +/*
>> + * for_each_set_bit_reverse() - Iterate over all set bits in a scalar value,
>> + * from MSB to LSB.
>> + *
>> + * @iter An iterator name.  Scoped is within the loop only.
>> + * @val  A scalar value to iterate over.
>> + *
>> + * A copy of @val is taken internally.
>> + */
>> +#define for_each_set_bit_reverse(iter, val)             \
>> +    for ( typeof(val) __v = (val); __v; __v = 0 )       \
>> +        for ( unsigned int (iter);                      \
>> +              __v && ((iter) = fls_g(__v) - 1, true);   \
>> +              __clear_bit(iter, &__v) )
>> +
>> +/*
>> + * Xen doesn't have need of for_each_set_bit_reverse() at present, but the
>> + * construct does exercise a case of arch_fls*() not covered anywhere else by
>> + * these tests.
>> + */
>> +static void __init test_for_each_set_bit_reverse(void)
>> +{
>> +    unsigned int  ui,  ui_res = 0, tmp;
>> +    unsigned long ul,  ul_res = 0;
>> +    uint64_t      ull, ull_res = 0;
>> +
>> +    ui = HIDE(0x80008001U);
>> +    for_each_set_bit_reverse ( i, ui )
>> +        ui_res |= 1U << i;
>> +
>> +    if ( ui != ui_res )
>> +        panic("for_each_set_bit_reverse(uint) expected %#x, got %#x\n", ui, ui_res);
>> +
>> +    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
>> +    for_each_set_bit_reverse ( i, ul )
>> +        ul_res |= 1UL << i;
>> +
>> +    if ( ul != ul_res )
>> +        panic("for_each_set_bit_reverse(ulong) expected %#lx, got %#lx\n", ul, ul_res);
>> +
>> +    ull = HIDE(0x8000000180000011ULL);
>> +    for_each_set_bit_reverse ( i, ull )
>> +        ull_res |= 1ULL << i;
> ... even here the need for the extra setting of bit 4 remains unclear to
> me: The thing that was missing was the testing of the reverse for-each.
> You mention the need for an asymmetric input in the description, but isn't
> that covered already by the first test using 0x80008001U?
The first test covers {arch_,}f[fl]s() only.  It happens to be safe against
count-from-the-wrong-end bugs, but that was by chance.
The second test covers {arch_,}f[fl]sl() only.  They are unsafe WRT
symmetry, and disjoint (coverage wise) from the first test.
The third test, while the same as the second test on x86, isn't the same
on arm32.
Just because one test happens to be safe (symmetry wise) and passes,
doesn't make the other variants tested.
~Andrew
                
On 01.09.2025 16:21, Andrew Cooper wrote:
> On 27/08/2025 8:52 am, Jan Beulich wrote:
>> On 26.08.2025 19:41, Andrew Cooper wrote:
>>> --- a/xen/common/bitops.c
>>> +++ b/xen/common/bitops.c
>>> @@ -97,14 +97,14 @@ static void __init test_for_each_set_bit(void)
>>>      if ( ui != ui_res )
>>>          panic("for_each_set_bit(uint) expected %#x, got %#x\n", ui, ui_res);
>>>  
>>> -    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 1);
>>> +    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
>>>      for_each_set_bit ( i, ul )
>>>          ul_res |= 1UL << i;
>>>  
>>>      if ( ul != ul_res )
>>>          panic("for_each_set_bit(ulong) expected %#lx, got %#lx\n", ul, ul_res);
>>>  
>>> -    ull = HIDE(0x8000000180000001ULL);
>>> +    ull = HIDE(0x8000000180000011ULL);
>>>      for_each_set_bit ( i, ull )
>>>          ull_res |= 1ULL << i;
>> How do these changes make a difference? Apart from ffs() using TZCNT, ...
>>
>>> @@ -127,6 +127,79 @@ static void __init test_for_each_set_bit(void)
>>>          panic("for_each_set_bit(break) expected 0x1008, got %#x\n", ui_res);
>>>  }
>>>  
>>> +/*
>>> + * A type-generic fls() which picks the appropriate fls{,l,64}() based on it's
>>> + * argument.
>>> + */
>>> +#define fls_g(x)                                        \
>>> +    (sizeof(x) <= sizeof(int)      ? fls(x) :           \
>>> +     sizeof(x) <= sizeof(long)     ? flsl(x) :          \
>>> +     sizeof(x) <= sizeof(uint64_t) ? fls64(x) :         \
>>> +     ({ BUILD_ERROR("fls_g() Bad input type"); 0; }))
>>> +
>>> +/*
>>> + * for_each_set_bit_reverse() - Iterate over all set bits in a scalar value,
>>> + * from MSB to LSB.
>>> + *
>>> + * @iter An iterator name.  Scoped is within the loop only.
>>> + * @val  A scalar value to iterate over.
>>> + *
>>> + * A copy of @val is taken internally.
>>> + */
>>> +#define for_each_set_bit_reverse(iter, val)             \
>>> +    for ( typeof(val) __v = (val); __v; __v = 0 )       \
>>> +        for ( unsigned int (iter);                      \
>>> +              __v && ((iter) = fls_g(__v) - 1, true);   \
>>> +              __clear_bit(iter, &__v) )
>>> +
>>> +/*
>>> + * Xen doesn't have need of for_each_set_bit_reverse() at present, but the
>>> + * construct does exercise a case of arch_fls*() not covered anywhere else by
>>> + * these tests.
>>> + */
>>> +static void __init test_for_each_set_bit_reverse(void)
>>> +{
>>> +    unsigned int  ui,  ui_res = 0, tmp;
>>> +    unsigned long ul,  ul_res = 0;
>>> +    uint64_t      ull, ull_res = 0;
>>> +
>>> +    ui = HIDE(0x80008001U);
>>> +    for_each_set_bit_reverse ( i, ui )
>>> +        ui_res |= 1U << i;
>>> +
>>> +    if ( ui != ui_res )
>>> +        panic("for_each_set_bit_reverse(uint) expected %#x, got %#x\n", ui, ui_res);
>>> +
>>> +    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
>>> +    for_each_set_bit_reverse ( i, ul )
>>> +        ul_res |= 1UL << i;
>>> +
>>> +    if ( ul != ul_res )
>>> +        panic("for_each_set_bit_reverse(ulong) expected %#lx, got %#lx\n", ul, ul_res);
>>> +
>>> +    ull = HIDE(0x8000000180000011ULL);
>>> +    for_each_set_bit_reverse ( i, ull )
>>> +        ull_res |= 1ULL << i;
>> ... even here the need for the extra setting of bit 4 remains unclear to
>> me: The thing that was missing was the testing of the reverse for-each.
>> You mention the need for an asymmetric input in the description, but isn't
>> that covered already by the first test using 0x80008001U?
> 
> The first test covers {arch_,}f[fl]s() only.  It happens to be safe against
> count-from-the-wrong-end bugs, but that was by chance.
> 
> The second test covers {arch_,}f[fl]sl() only.  They are unsafe WRT
> symmetry, and disjoint (coverage wise) from the first test.
> 
> The third test, while the same as the second test on x86, isn't the same
> on arm32.
> 
> 
> Just because one test happens to be safe (symmetry wise) and passes,
> doesn't make the other variants tested.
Hmm, okay, it is of course in principle possible that one flavor is screwed
while the other isn't.
Acked-by: Jan Beulich <jbeulich@suse.com>
Jan
                
On 01/09/2025 3:26 pm, Jan Beulich wrote:
> On 01.09.2025 16:21, Andrew Cooper wrote:
>> On 27/08/2025 8:52 am, Jan Beulich wrote:
>>> On 26.08.2025 19:41, Andrew Cooper wrote:
>>>> --- a/xen/common/bitops.c
>>>> +++ b/xen/common/bitops.c
>>>> @@ -97,14 +97,14 @@ static void __init test_for_each_set_bit(void)
>>>>      if ( ui != ui_res )
>>>>          panic("for_each_set_bit(uint) expected %#x, got %#x\n", ui, ui_res);
>>>>  
>>>> -    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 1);
>>>> +    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
>>>>      for_each_set_bit ( i, ul )
>>>>          ul_res |= 1UL << i;
>>>>  
>>>>      if ( ul != ul_res )
>>>>          panic("for_each_set_bit(ulong) expected %#lx, got %#lx\n", ul, ul_res);
>>>>  
>>>> -    ull = HIDE(0x8000000180000001ULL);
>>>> +    ull = HIDE(0x8000000180000011ULL);
>>>>      for_each_set_bit ( i, ull )
>>>>          ull_res |= 1ULL << i;
>>> How do these changes make a difference? Apart from ffs() using TZCNT, ...
>>>
>>>> @@ -127,6 +127,79 @@ static void __init test_for_each_set_bit(void)
>>>>          panic("for_each_set_bit(break) expected 0x1008, got %#x\n", ui_res);
>>>>  }
>>>>  
>>>> +/*
>>>> + * A type-generic fls() which picks the appropriate fls{,l,64}() based on it's
>>>> + * argument.
>>>> + */
>>>> +#define fls_g(x)                                        \
>>>> +    (sizeof(x) <= sizeof(int)      ? fls(x) :           \
>>>> +     sizeof(x) <= sizeof(long)     ? flsl(x) :          \
>>>> +     sizeof(x) <= sizeof(uint64_t) ? fls64(x) :         \
>>>> +     ({ BUILD_ERROR("fls_g() Bad input type"); 0; }))
>>>> +
>>>> +/*
>>>> + * for_each_set_bit_reverse() - Iterate over all set bits in a scalar value,
>>>> + * from MSB to LSB.
>>>> + *
>>>> + * @iter An iterator name.  Scoped is within the loop only.
>>>> + * @val  A scalar value to iterate over.
>>>> + *
>>>> + * A copy of @val is taken internally.
>>>> + */
>>>> +#define for_each_set_bit_reverse(iter, val)             \
>>>> +    for ( typeof(val) __v = (val); __v; __v = 0 )       \
>>>> +        for ( unsigned int (iter);                      \
>>>> +              __v && ((iter) = fls_g(__v) - 1, true);   \
>>>> +              __clear_bit(iter, &__v) )
>>>> +
>>>> +/*
>>>> + * Xen doesn't have need of for_each_set_bit_reverse() at present, but the
>>>> + * construct does exercise a case of arch_fls*() not covered anywhere else by
>>>> + * these tests.
>>>> + */
>>>> +static void __init test_for_each_set_bit_reverse(void)
>>>> +{
>>>> +    unsigned int  ui,  ui_res = 0, tmp;
>>>> +    unsigned long ul,  ul_res = 0;
>>>> +    uint64_t      ull, ull_res = 0;
>>>> +
>>>> +    ui = HIDE(0x80008001U);
>>>> +    for_each_set_bit_reverse ( i, ui )
>>>> +        ui_res |= 1U << i;
>>>> +
>>>> +    if ( ui != ui_res )
>>>> +        panic("for_each_set_bit_reverse(uint) expected %#x, got %#x\n", ui, ui_res);
>>>> +
>>>> +    ul = HIDE(1UL << (BITS_PER_LONG - 1) | 0x11);
>>>> +    for_each_set_bit_reverse ( i, ul )
>>>> +        ul_res |= 1UL << i;
>>>> +
>>>> +    if ( ul != ul_res )
>>>> +        panic("for_each_set_bit_reverse(ulong) expected %#lx, got %#lx\n", ul, ul_res);
>>>> +
>>>> +    ull = HIDE(0x8000000180000011ULL);
>>>> +    for_each_set_bit_reverse ( i, ull )
>>>> +        ull_res |= 1ULL << i;
>>> ... even here the need for the extra setting of bit 4 remains unclear to
>>> me: The thing that was missing was the testing of the reverse for-each.
>>> You mention the need for an asymmetric input in the description, but isn't
>>> that covered already by the first test using 0x80008001U?
>> The first test covers {arch_,}f[fl]s() only.  It happens to be safe against
>> count-from-the-wrong-end bugs, but that was by chance.
>>
>> The second test covers {arch_,}f[fl]sl() only.  They are unsafe WRT
>> symmetry, and disjoint (coverage wise) from the first test.
>>
>> The third test, while the same as the second test on x86, isn't the same
>> on arm32.
>>
>>
>> Just because one test happens to be safe (symmetry wise) and passes,
>> doesn't make the other variants tested.
> Hmm, okay, it is of course in principle possible that one flavor is screwed
> while the other isn't.
>
> Acked-by: Jan Beulich <jbeulich@suse.com>
Thanks, but I now have both R-by and A-by you on this patch.  Which
would you prefer?
~Andrew
                
On 01.09.2025 16:31, Andrew Cooper wrote:
> On 01/09/2025 3:26 pm, Jan Beulich wrote:
>> Hmm, okay, it is of course in principle possible that one flavor is screwed
>> while the other isn't.
>>
>> Acked-by: Jan Beulich <jbeulich@suse.com>
>
> Thanks, but I now have both R-by and A-by you on this patch.  Which
> would you prefer?

Oh, sorry, I checked whether I sent an ack to v2, not paying attention to v2
already having a tag.  Please keep the R-b.

Jan