[PATCH 3/4] bitops: squeeze even more out of fns()
Posted by Yury Norov 1 year, 7 months ago
The function clears the first N-1 set bits to find the N'th one:

	while (word && n--)
		word &= word - 1;

In the worst case, it would take 63 iterations.

Instead of a linear walk through the set bits, we can do a binary
search using hweight(). This works even better on platforms with
hardware-assisted hweight() - pretty much every modern arch.
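
For illustration, here is a standalone userspace sketch of the same
halving idea, with __builtin_popcountll() standing in for the kernel's
hweight*() helpers. fns64_sketch() is a made-up name; the patch itself
implements the search as a chain of fnsXX() helpers instead:

	#include <stdint.h>

	/* Narrow the search window by halves: if the low half holds
	 * more than n set bits, the n'th set bit is there; otherwise
	 * skip those bits and continue in the high half.
	 */
	static unsigned int fns64_sketch(uint64_t word, unsigned int n)
	{
		unsigned int base = 0, half = 32;

		while (half) {
			uint64_t lo = word & ((1ULL << half) - 1);
			unsigned int w = __builtin_popcountll(lo);

			if (n < w) {		/* target is in the low half */
				word = lo;
			} else {		/* skip w set bits, go high */
				word >>= half;
				base += half;
				n -= w;
			}
			half >>= 1;
		}
		return (word && !n) ? base : 64;	/* 64: no such bit */
	}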

On my Ryzen 9 5900X, the test_fns() benchmark runs the binary fns()
twice as fast as the linear one.

fns8() returns 64 to make sure that, when no bit is found, the return
value is greater than the bit capacity of the arguments of all fnsXX()
functions up to fns64().
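
For example, with no bits set, the sentinel propagates up through the
half-word offsets and stays out of range at every width (a worked
trace, not code from the patch):

	fns16(0, 0) = 8  + fns8(0, 0)  = 8  + 64 = 72    (>= 16)
	fns32(0, 0) = 16 + fns16(0, 0) = 16 + 72 = 88    (>= 32)
	fns64(0, 0) = 32 + fns32(0, 0) = 32 + 88 = 120   (>= 64)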

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitops.h | 42 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 57ecef354f47..1c4773db56e0 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -247,17 +247,49 @@ static inline unsigned long __ffs64(u64 word)
 	return __ffs((unsigned long)word);
 }
 
+static inline unsigned long fns8(u8 word, unsigned int n)
+{
+	while (word && n--)
+		word &= word - 1;
+
+	return word ? __ffs(word) : 64;
+}
+
+static inline unsigned long fns16(u16 word, unsigned int n)
+{
+	unsigned int w = hweight8((u8)word);
+
+	return n < w ? fns8((u8)word, n) : 8 + fns8((u8)(word >> 8), n - w);
+}
+
+static inline unsigned long fns32(u32 word, unsigned int n)
+{
+	unsigned int w = hweight16((u16)word);
+
+	return n < w ? fns16((u16)word, n) : 16 + fns16((u16)(word >> 16), n - w);
+}
+
+static inline unsigned long fns64(u64 word, unsigned int n)
+{
+	unsigned int w = hweight32((u32)word);
+
+	return n < w ? fns32((u32)word, n) : 32 + fns32((u32)(word >> 32), n - w);
+}
+
 /**
  * fns - find N'th set bit in a word
  * @word: The word to search
- * @n: Bit to find
+ * @n: Bit to find, counting from 0
+ *
+ * Returns N'th set bit. If no such bit found, returns >= BITS_PER_LONG
  */
 static inline unsigned long fns(unsigned long word, unsigned int n)
 {
-	while (word && n--)
-		word &= word - 1;
-
-	return word ? __ffs(word) : BITS_PER_LONG;
+#if BITS_PER_LONG == 64
+	return fns64(word, n);
+#else
+	return fns32(word, n);
+#endif
 }
 
 /**
-- 
2.40.1
Re: [PATCH 3/4] bitops: squeeze even more out of fns()
Posted by Kuan-Wei Chiu 1 year, 7 months ago
+Cc Chin-Chun Chen & Ching-Chun (Jim) Huang

On Thu, May 02, 2024 at 04:32:03PM -0700, Yury Norov wrote:
> The function clears the first N-1 set bits to find the N'th one:
> 
> 	while (word && n--)
> 		word &= word - 1;
> 
> In the worst case, it would take 63 iterations.
> 
> Instead of a linear walk through the set bits, we can do a binary
> search using hweight(). This works even better on platforms with
> hardware-assisted hweight() - pretty much every modern arch.
> 
Chin-Chun once proposed a method similar to binary search combined with
Hamming weight and discussed it privately with me and Jim. However,
Chin-Chun found that binary search would actually impair performance
when n is small. Since we are unsure about the typical range of n in
our actual workload, we have not yet proposed any relevant patches.
Considering only the overall benchmark results, this patch looks good
to me.

Regards,
Kuan-Wei

> On my Ryzen 9 5900X, the test_fns() benchmark runs the binary fns()
> twice as fast as the linear one.
> 
> fns8() returns 64 to make sure that, when no bit is found, the return
> value is greater than the bit capacity of the arguments of all fnsXX()
> functions up to fns64().
> 
> Signed-off-by: Yury Norov <yury.norov@gmail.com>
> ---
>  include/linux/bitops.h | 42 +++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 37 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/bitops.h b/include/linux/bitops.h
> index 57ecef354f47..1c4773db56e0 100644
> --- a/include/linux/bitops.h
> +++ b/include/linux/bitops.h
> @@ -247,17 +247,49 @@ static inline unsigned long __ffs64(u64 word)
>  	return __ffs((unsigned long)word);
>  }
>  
> +static inline unsigned long fns8(u8 word, unsigned int n)
> +{
> +	while (word && n--)
> +		word &= word - 1;
> +
> +	return word ? __ffs(word) : 64;
> +}
> +
> +static inline unsigned long fns16(u16 word, unsigned int n)
> +{
> +	unsigned int w = hweight8((u8)word);
> +
> +	return n < w ? fns8((u8)word, n) : 8 + fns8((u8)(word >> 8), n - w);
> +}
> +
> +static inline unsigned long fns32(u32 word, unsigned int n)
> +{
> +	unsigned int w = hweight16((u16)word);
> +
> +	return n < w ? fns16((u16)word, n) : 16 + fns16((u16)(word >> 16), n - w);
> +}
> +
> +static inline unsigned long fns64(u64 word, unsigned int n)
> +{
> +	unsigned int w = hweight32((u32)word);
> +
> +	return n < w ? fns32((u32)word, n) : 32 + fns32((u32)(word >> 32), n - w);
> +}
> +
>  /**
>   * fns - find N'th set bit in a word
>   * @word: The word to search
> - * @n: Bit to find
> + * @n: Bit to find, counting from 0
> + *
> + * Returns N'th set bit. If no such bit found, returns >= BITS_PER_LONG
>   */
>  static inline unsigned long fns(unsigned long word, unsigned int n)
>  {
> -	while (word && n--)
> -		word &= word - 1;
> -
> -	return word ? __ffs(word) : BITS_PER_LONG;
> +#if BITS_PER_LONG == 64
> +	return fns64(word, n);
> +#else
> +	return fns32(word, n);
> +#endif
>  }
>  
>  /**
> -- 
> 2.40.1
>
Re: [PATCH 3/4] bitops: squeeze even more out of fns()
Posted by Yury Norov 1 year, 7 months ago
On Fri, May 03, 2024 at 10:19:10AM +0800, Kuan-Wei Chiu wrote:
> +Cc Chin-Chun Chen & Ching-Chun (Jim) Huang
> 
> On Thu, May 02, 2024 at 04:32:03PM -0700, Yury Norov wrote:
> > The function clears the first N-1 set bits to find the N'th one:
> > 
> > 	while (word && n--)
> > 		word &= word - 1;
> > 
> > In the worst case, it would take 63 iterations.
> > 
> > Instead of a linear walk through the set bits, we can do a binary
> > search using hweight(). This works even better on platforms with
> > hardware-assisted hweight() - pretty much every modern arch.
> > 
> Chin-Chun once proposed a method similar to binary search combined with
> Hamming weight and discussed it privately with me and Jim. However,
> Chin-Chun found that binary search would actually impair performance
> when n is small. Since we are unsure about the typical range of n in
> our actual workload, we have not yet proposed any relevant patches.
> Considering only the overall benchmark results, this patch looks good
> to me.

fns() is used only as a helper to find_nth_bit(). 

In the kernel, find_nth_bit() is used in
 - bitmap_bitremap(),
 - bitmap_remap(), and
 - cpumask_local_spread() via sched_numa_find_nth_cpu(),

with the bit to search calculated as n = n % cpumask_weight(). This
virtually implies uniformly distributed random n and word, just like
in test_fns().
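
The pattern in those legitimate callers looks roughly like this (a
kernel-style sketch assuming <linux/cpumask.h> and a non-empty mask;
nth_wrapped_cpu() is a made-up name, not an existing helper):

	#include <linux/cpumask.h>

	/* Pick the i-th CPU of @mask, wrapping around its weight, so
	 * both n and the searched word end up roughly uniformly
	 * distributed across calls.
	 */
	static unsigned int nth_wrapped_cpu(unsigned int i,
					    const struct cpumask *mask)
	{
		return cpumask_nth(i % cpumask_weight(mask), mask);
	}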

In rebalance_wq_table() in drivers/crypto/intel/iaa/iaa_crypto_main.c
it's used like:

	for (cpu = 0; cpu < nr_cpus_per_node; cpu++) {
		int node_cpu = cpumask_nth(cpu, node_cpus);
		...
	}

This is API abuse and should be rewritten with for_each_cpu().
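
A sketch of that rewrite, under the assumption that the loop body only
needs each CPU of the mask once (untested):

	unsigned int cpu;

	for_each_cpu(cpu, node_cpus) {
		/* former loop body, with cpu in place of node_cpu;
		 * one linear pass over the mask instead of a full
		 * cpumask_nth() rescan per iteration */
		...
	}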

In cpumask_any_housekeeping() at arch/x86/kernel/cpu/resctrl/internal.h
it's used like:

	hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask);
	if (hk_cpu == exclude_cpu)
		hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask);

	if (hk_cpu < nr_cpu_ids)
		cpu = hk_cpu;

And this is another example of API abuse. We need to introduce a new
helper, cpumask_andnot_any_but(), and use it like:

	hk_cpu = cpumask_andnot_any_but(exclude_cpu, mask, tick_nohz_full_mask);
	if (hk_cpu < nr_cpu_ids)
		cpu = hk_cpu;
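
Such a helper didn't exist at the time of writing; one possible sketch,
built on the existing for_each_cpu_andnot() iterator (a hypothetical
implementation, not the final API):

	/* Hypothetical: first CPU in (*srcp1 & ~*srcp2) other than @cpu */
	static inline unsigned int
	cpumask_andnot_any_but(int cpu, const struct cpumask *srcp1,
			       const struct cpumask *srcp2)
	{
		unsigned int i;

		for_each_cpu_andnot(i, srcp1, srcp2)
			if (i != (unsigned int)cpu)
				return i;
		return nr_cpu_ids;
	}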

So, where the use of find_nth_bit() is legitimate, the parameters are
distributed like in the test, and I would expect the real-life
performance impact to be similar to what the test shows.

Optimizing the helper for the non-legitimate cases isn't worth the
effort.

Thanks,
Yury
Re: [PATCH 3/4] bitops: squeeze even more out of fns()
Posted by Kuan-Wei Chiu 1 year, 7 months ago
On Fri, May 03, 2024 at 09:13:32AM -0700, Yury Norov wrote:
> On Fri, May 03, 2024 at 10:19:10AM +0800, Kuan-Wei Chiu wrote:
> > +Cc Chin-Chun Chen & Ching-Chun (Jim) Huang
> > 
> > On Thu, May 02, 2024 at 04:32:03PM -0700, Yury Norov wrote:
> > > The function clears the first N-1 set bits to find the N'th one:
> > > 
> > > 	while (word && n--)
> > > 		word &= word - 1;
> > > 
> > > In the worst case, it would take 63 iterations.
> > > 
> > > Instead of a linear walk through the set bits, we can do a binary
> > > search using hweight(). This works even better on platforms with
> > > hardware-assisted hweight() - pretty much every modern arch.
> > > 
> > Chin-Chun once proposed a method similar to binary search combined with
> > Hamming weight and discussed it privately with me and Jim. However,
> > Chin-Chun found that binary search would actually impair performance
> > when n is small. Since we are unsure about the typical range of n in
> > our actual workload, we have not yet proposed any relevant patches.
> > Considering only the overall benchmark results, this patch looks good
> > to me.
> 
> fns() is used only as a helper to find_nth_bit(). 
> 
> In the kernel, find_nth_bit() is used in
>  - bitmap_bitremap(),
>  - bitmap_remap(), and
>  - cpumask_local_spread() via sched_numa_find_nth_cpu(),
> 
> with the bit to search calculated as n = n % cpumask_weight(). This
> virtually implies uniformly distributed random n and word, just like
> in test_fns().
> 
> In rebalance_wq_table() in drivers/crypto/intel/iaa/iaa_crypto_main.c
> it's used like:
> 
> 	for (cpu = 0; cpu < nr_cpus_per_node; cpu++) {
> 		int node_cpu = cpumask_nth(cpu, node_cpus);
> 		...
> 	}
> 
> This is API abuse and should be rewritten with for_each_cpu().
> 
> In cpumask_any_housekeeping() at arch/x86/kernel/cpu/resctrl/internal.h
> it's used like:
> 
> 	hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask);
> 	if (hk_cpu == exclude_cpu)
> 		hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask);
> 
> 	if (hk_cpu < nr_cpu_ids)
> 		cpu = hk_cpu;
> 
> And this is another example of API abuse. We need to introduce a new
> helper, cpumask_andnot_any_but(), and use it like:
> 
> 	hk_cpu = cpumask_andnot_any_but(exclude_cpu, mask, tick_nohz_full_mask);
> 	if (hk_cpu < nr_cpu_ids)
> 		cpu = hk_cpu;
> 
> So, where the use of find_nth_bit() is legitimate, the parameters are
> distributed like in the test, and I would expect the real-life
> performance impact to be similar to what the test shows.
> 
> Optimizing the helper for the non-legitimate cases isn't worth the
> effort.
>
Got it, thank you for your detailed explanation :)

Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>

Regards,
Kuan-Wei