Over the past couple years, the function _find_next_bit() was extended
with parameters that modify its behavior to implement and- zero- and le-
flavors. The parameters are passed at compile time, but current design
prevents a compiler from optimizing out the conditionals.
As find_next_bit() API grows, I expect that more parameters will be added.
Current design would require more conditional code in _find_next_bit(),
which would bloat the helper even more and make it barely readable.
This patch replaces _find_next_bit() with a macro FIND_NEXT_BIT, and adds
a set of wrappers, so that the compile-time optimizations become possible.
The common logic is moved to the new macro, and all flavors may be
generated by providing a FETCH macro parameter, like in this example:
#define FIND_NEXT_BIT(FETCH, MUNGE, size, start) ...
find_next_xornot_and_bit(addr1, addr2, addr3, size, start)
{
return FIND_NEXT_BIT(addr1[idx] ^ ~addr2[idx] & addr3[idx],
/* nop */, size, start);
}
The FETCH may be of any complexity, as long as it only refers to the bitmap(s)
and an iterator idx.
MUNGE is here to support _le code generation for BE builds. May be
empty.
I ran find_bit_benchmark 16 times on top of 6.0-rc2 and 16 times on top
of 6.0-rc2 + this series. The results for kvm/x86_64 are:
v6.0-rc2 Optimized Difference Z-score
Random dense bitmap ns ns ns %
find_next_bit: 787735 670546 117189 14.9 3.97
find_next_zero_bit: 777492 664208 113284 14.6 10.51
find_last_bit: 830925 687573 143352 17.3 2.35
find_first_bit: 3874366 3306635 567731 14.7 1.84
find_first_and_bit: 40677125 37739887 2937238 7.2 1.36
find_next_and_bit: 347865 304456 43409 12.5 1.35
Random sparse bitmap
find_next_bit: 19816 14021 5795 29.2 6.10
find_next_zero_bit: 1318901 1223794 95107 7.2 1.41
find_last_bit: 14573 13514 1059 7.3 6.92
find_first_bit: 1313321 1249024 64297 4.9 1.53
find_first_and_bit: 8921 8098 823 9.2 4.56
find_next_and_bit: 9796 7176 2620 26.7 5.39
Where the statistics are significant (z-score > 3), the improvement
is ~15%.
According to the bloat-o-meter, the Image size is 10-11K less:
x86_64/defconfig:
add/remove: 32/14 grow/shrink: 61/782 up/down: 6344/-16521 (-10177)
arm64/defconfig:
add/remove: 3/2 grow/shrink: 50/714 up/down: 608/-11556 (-10948)
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
include/linux/find.h | 23 ++++++---
lib/find_bit.c | 119 +++++++++++++++++++++++++------------------
2 files changed, 85 insertions(+), 57 deletions(-)
diff --git a/include/linux/find.h b/include/linux/find.h
index 2464bff5de04..dead6f53a97b 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -8,9 +8,12 @@
#include <linux/bitops.h>
-extern unsigned long _find_next_bit(const unsigned long *addr1,
- const unsigned long *addr2, unsigned long nbits,
- unsigned long start, unsigned long invert, unsigned long le);
+unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits,
+ unsigned long start);
+unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long nbits, unsigned long start);
+unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
+ unsigned long start);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size);
@@ -19,6 +22,10 @@ extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long siz
#ifdef __BIG_ENDIAN
unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size);
+unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned
+ long size, unsigned long offset);
+unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
+ long size, unsigned long offset);
#endif
#ifndef find_next_bit
@@ -45,7 +52,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
return val ? __ffs(val) : size;
}
- return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+ return _find_next_bit(addr, size, offset);
}
#endif
@@ -75,7 +82,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1,
return val ? __ffs(val) : size;
}
- return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+ return _find_next_and_bit(addr1, addr2, size, offset);
}
#endif
@@ -103,7 +110,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
return val == ~0UL ? size : ffz(val);
}
- return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+ return _find_next_zero_bit(addr, size, offset);
}
#endif
@@ -251,7 +258,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned
return val == ~0UL ? size : ffz(val);
}
- return _find_next_bit(addr, NULL, size, offset, ~0UL, 1);
+ return _find_next_zero_bit_le(addr, size, offset);
}
#endif
@@ -284,7 +291,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned
return val ? __ffs(val) : size;
}
- return _find_next_bit(addr, NULL, size, offset, 0UL, 1);
+ return _find_next_bit_le(addr, size, offset);
}
#endif
diff --git a/lib/find_bit.c b/lib/find_bit.c
index f4d9b9684811..8707b4ef3e5e 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -40,57 +40,33 @@
sz; \
})
-#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
- !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \
- !defined(find_next_and_bit)
/*
- * This is a common helper function for find_next_bit, find_next_zero_bit, and
- * find_next_and_bit. The differences are:
- * - The "invert" argument, which is XORed with each fetched word before
- * searching it for one bits.
- * - The optional "addr2", which is anded with "addr1" if present.
+ * Common helper for find_next_bit() function family
+ * @FETCH: The expression that fetches and pre-processes each word of bitmap(s)
+ * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
+ * @size: The bitmap size in bits
+ * @start: The bitnumber to start searching at
*/
-unsigned long _find_next_bit(const unsigned long *addr1,
- const unsigned long *addr2, unsigned long nbits,
- unsigned long start, unsigned long invert, unsigned long le)
-{
- unsigned long tmp, mask;
-
- if (unlikely(start >= nbits))
- return nbits;
-
- tmp = addr1[start / BITS_PER_LONG];
- if (addr2)
- tmp &= addr2[start / BITS_PER_LONG];
- tmp ^= invert;
-
- /* Handle 1st word. */
- mask = BITMAP_FIRST_WORD_MASK(start);
- if (le)
- mask = swab(mask);
-
- tmp &= mask;
-
- start = round_down(start, BITS_PER_LONG);
-
- while (!tmp) {
- start += BITS_PER_LONG;
- if (start >= nbits)
- return nbits;
-
- tmp = addr1[start / BITS_PER_LONG];
- if (addr2)
- tmp &= addr2[start / BITS_PER_LONG];
- tmp ^= invert;
- }
-
- if (le)
- tmp = swab(tmp);
-
- return min(start + __ffs(tmp), nbits);
-}
-EXPORT_SYMBOL(_find_next_bit);
-#endif
+#define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \
+({ \
+ unsigned long mask, idx, tmp, sz = (size), __start = (start); \
+ \
+ if (unlikely(__start >= sz)) \
+ goto out; \
+ \
+ mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start)); \
+ idx = __start / BITS_PER_LONG; \
+ \
+ for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) { \
+ if ((idx + 1) * BITS_PER_LONG >= sz) \
+ goto out; \
+ idx++; \
+ } \
+ \
+ sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz); \
+out: \
+ sz; \
+})
#ifndef find_first_bit
/*
@@ -127,6 +103,32 @@ unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size
EXPORT_SYMBOL(_find_first_zero_bit);
#endif
+#ifndef find_next_bit
+unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start)
+{
+ return FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_bit);
+#endif
+
+#ifndef find_next_and_bit
+unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
+ unsigned long nbits, unsigned long start)
+{
+ return FIND_NEXT_BIT(addr1[idx] & addr2[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_and_bit);
+#endif
+
+#ifndef find_next_zero_bit
+unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
+ unsigned long start)
+{
+ return FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start);
+}
+EXPORT_SYMBOL(_find_next_zero_bit);
+#endif
+
#ifndef find_last_bit
unsigned long _find_last_bit(const unsigned long *addr, unsigned long size)
{
@@ -173,4 +175,23 @@ unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long s
EXPORT_SYMBOL(_find_first_zero_bit_le);
#endif
+#ifndef find_next_zero_bit_le
+unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned
+ long size, unsigned long offset)
+{
+ return FIND_NEXT_BIT(~addr[idx], swab, size, offset);
+}
+EXPORT_SYMBOL(_find_next_zero_bit_le);
+#endif
+
+#ifndef find_next_bit_le
+unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
+ long size, unsigned long offset)
+{
+ return FIND_NEXT_BIT(addr[idx], swab, size, offset);
+}
+EXPORT_SYMBOL(_find_next_bit_le);
+
+#endif
+
#endif /* __BIG_ENDIAN */
--
2.34.1
On Wed, Sep 14, 2022 at 07:07:29PM -0700, Yury Norov wrote:
> Over the past couple years, the function _find_next_bit() was extended
> with parameters that modify its behavior to implement and- zero- and le-
> flavors. The parameters are passed at compile time, but current design
> prevents a compiler from optimizing out the conditionals.
>
> As find_next_bit() API grows, I expect that more parameters will be added.
> Current design would require more conditional code in _find_next_bit(),
> which would bloat the helper even more and make it barely readable.
>
> This patch replaces _find_next_bit() with a macro FIND_NEXT_BIT, and adds
> a set of wrappers, so that the compile-time optimizations become possible.
>
> The common logic is moved to the new macro, and all flavors may be
> generated by providing a FETCH macro parameter, like in this example:
>
> #define FIND_NEXT_BIT(FETCH, MUNGE, size, start) ...
>
> find_next_xornot_and_bit(addr1, addr2, addr3, size, start)
> {
> return FIND_NEXT_BIT(addr1[idx] ^ ~addr2[idx] & addr3[idx],
> /* nop */, size, start);
> }
>
> The FETCH may be of any complexity, as soon as it only refers the bitmap(s)
> and an iterator idx.
>
> MUNGE is here to support _le code generation for BE builds. May be
> empty.
>
> I ran find_bit_benchmark 16 times on top of 6.0-rc2 and 16 times on top
> of 6.0-rc2 + this series. The results for kvm/x86_64 are:
>
> v6.0-rc2 Optimized Difference Z-score
> Random dense bitmap ns ns ns %
> find_next_bit: 787735 670546 117189 14.9 3.97
> find_next_zero_bit: 777492 664208 113284 14.6 10.51
> find_last_bit: 830925 687573 143352 17.3 2.35
> find_first_bit: 3874366 3306635 567731 14.7 1.84
> find_first_and_bit: 40677125 37739887 2937238 7.2 1.36
> find_next_and_bit: 347865 304456 43409 12.5 1.35
>
> Random sparse bitmap
> find_next_bit: 19816 14021 5795 29.2 6.10
> find_next_zero_bit: 1318901 1223794 95107 7.2 1.41
> find_last_bit: 14573 13514 1059 7.3 6.92
> find_first_bit: 1313321 1249024 64297 4.9 1.53
> find_first_and_bit: 8921 8098 823 9.2 4.56
> find_next_and_bit: 9796 7176 2620 26.7 5.39
>
> Where the statistics is significant (z-score > 3), the improvement
> is ~15%.
>
> According to the bloat-o-meter, the Image size is 10-11K less:
>
> x86_64/defconfig:
> add/remove: 32/14 grow/shrink: 61/782 up/down: 6344/-16521 (-10177)
>
> arm64/defconfig:
> add/remove: 3/2 grow/shrink: 50/714 up/down: 608/-11556 (-10948)
...
> /*
Seems like you wanted this to be a kernel doc, but it isn't right now.
> - * This is a common helper function for find_next_bit, find_next_zero_bit, and
> - * find_next_and_bit. The differences are:
> - * - The "invert" argument, which is XORed with each fetched word before
> - * searching it for one bits.
> - * - The optional "addr2", which is anded with "addr1" if present.
> + * Common helper for find_next_bit() function family
In such case this should start with a name of the macro
* FIND_NEXT_BIT - ...
> + * @FETCH: The expression that fetches and pre-processes each word of bitmap(s)
> + * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
> + * @size: The bitmap size in bits
> + * @start: The bitnumber to start searching at
> */
...
> +#define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \
> +({ \
> + unsigned long mask, idx, tmp, sz = (size), __start = (start); \
> + \
> + if (unlikely(__start >= sz)) \
> + goto out; \
> + \
> + mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start)); \
> + idx = __start / BITS_PER_LONG; \
> + \
> + for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) { \
> + if ((idx + 1) * BITS_PER_LONG >= sz) \
> + goto out; \
> + idx++; \
> + } \
> + \
> + sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz); \
> +out: \
I dunno if GCC expression limits the scope of goto labels, but on the safe side
you can add a prefix to it, so it becomes:
FIND_NEXT_BIT_out:
(or alike).
> + sz; \
> +})
...
> +unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned
> + long size, unsigned long offset)
Usually we don't split parameters between lines.
...
> +unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
> + long size, unsigned long offset)
Ditto.
--
With Best Regards,
Andy Shevchenko
On Mon, Sep 19, 2022 at 04:45:54PM +0300, Andy Shevchenko wrote:
> On Wed, Sep 14, 2022 at 07:07:29PM -0700, Yury Norov wrote:
> > Over the past couple years, the function _find_next_bit() was extended
> > with parameters that modify its behavior to implement and- zero- and le-
> > flavors. The parameters are passed at compile time, but current design
> > prevents a compiler from optimizing out the conditionals.
> >
> > As find_next_bit() API grows, I expect that more parameters will be added.
> > Current design would require more conditional code in _find_next_bit(),
> > which would bloat the helper even more and make it barely readable.
> >
> > This patch replaces _find_next_bit() with a macro FIND_NEXT_BIT, and adds
> > a set of wrappers, so that the compile-time optimizations become possible.
> >
> > The common logic is moved to the new macro, and all flavors may be
> > generated by providing a FETCH macro parameter, like in this example:
> >
> > #define FIND_NEXT_BIT(FETCH, MUNGE, size, start) ...
> >
> > find_next_xornot_and_bit(addr1, addr2, addr3, size, start)
> > {
> > return FIND_NEXT_BIT(addr1[idx] ^ ~addr2[idx] & addr3[idx],
> > /* nop */, size, start);
> > }
> >
> > The FETCH may be of any complexity, as soon as it only refers the bitmap(s)
> > and an iterator idx.
> >
> > MUNGE is here to support _le code generation for BE builds. May be
> > empty.
> >
> > I ran find_bit_benchmark 16 times on top of 6.0-rc2 and 16 times on top
> > of 6.0-rc2 + this series. The results for kvm/x86_64 are:
> >
> > v6.0-rc2 Optimized Difference Z-score
> > Random dense bitmap ns ns ns %
> > find_next_bit: 787735 670546 117189 14.9 3.97
> > find_next_zero_bit: 777492 664208 113284 14.6 10.51
> > find_last_bit: 830925 687573 143352 17.3 2.35
> > find_first_bit: 3874366 3306635 567731 14.7 1.84
> > find_first_and_bit: 40677125 37739887 2937238 7.2 1.36
> > find_next_and_bit: 347865 304456 43409 12.5 1.35
> >
> > Random sparse bitmap
> > find_next_bit: 19816 14021 5795 29.2 6.10
> > find_next_zero_bit: 1318901 1223794 95107 7.2 1.41
> > find_last_bit: 14573 13514 1059 7.3 6.92
> > find_first_bit: 1313321 1249024 64297 4.9 1.53
> > find_first_and_bit: 8921 8098 823 9.2 4.56
> > find_next_and_bit: 9796 7176 2620 26.7 5.39
> >
> > Where the statistics is significant (z-score > 3), the improvement
> > is ~15%.
> >
> > According to the bloat-o-meter, the Image size is 10-11K less:
> >
> > x86_64/defconfig:
> > add/remove: 32/14 grow/shrink: 61/782 up/down: 6344/-16521 (-10177)
> >
> > arm64/defconfig:
> > add/remove: 3/2 grow/shrink: 50/714 up/down: 608/-11556 (-10948)
>
> ...
>
> > /*
>
> Seems like you wanted this to be a kernel doc, but it isn't right now.
No, I didn't. I can remove '@' below, if that concerns you.
> > - * This is a common helper function for find_next_bit, find_next_zero_bit, and
> > - * find_next_and_bit. The differences are:
> > - * - The "invert" argument, which is XORed with each fetched word before
> > - * searching it for one bits.
> > - * - The optional "addr2", which is anded with "addr1" if present.
> > + * Common helper for find_next_bit() function family
>
> In such case this should start with a name of the macro
>
> * FIND_NEXT_BIT - ...
>
> > + * @FETCH: The expression that fetches and pre-processes each word of bitmap(s)
> > + * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
> > + * @size: The bitmap size in bits
> > + * @start: The bitnumber to start searching at
> > */
>
> ...
>
> > +#define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \
> > +({ \
> > + unsigned long mask, idx, tmp, sz = (size), __start = (start); \
> > + \
> > + if (unlikely(__start >= sz)) \
> > + goto out; \
> > + \
> > + mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start)); \
> > + idx = __start / BITS_PER_LONG; \
> > + \
> > + for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) { \
> > + if ((idx + 1) * BITS_PER_LONG >= sz) \
> > + goto out; \
> > + idx++; \
> > + } \
> > + \
> > + sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz); \
> > +out: \
>
> I dunno if GCC expression limits the scope of goto labels, but on the safe side
> you can add a prefix to it, so it becomes:
>
> FIND_NEXT_BIT_out:
>
> (or alike).
As Linus already said, the 'out' is function-scope. We can make it a
block-scope with __label__, but this would make an impression that we
are OK with stacking many FIND macros in a single function.
I spent some time trying to figure out a legitimate use case for it, but
nothing came to mind. There are many real cases when we need 2 or more
find functions at once, but all those cases would work with regular
wrappers around FIND_BIT(). Check this, for example:
https://lore.kernel.org/lkml/20220919210559.1509179-6-yury.norov@gmail.com/
I don't know how FIND_BIT() machinery will evolve with time. For now
it's a clean and neat local helper with a very straightforward usage.
Let's keep it simple for now? If someone decides to call FIND_BIT()
twice and fails, it would be a good hint that they're doing something
wrong.
> > + sz; \
> > +})
>
> ...
>
> > +unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned
> > + long size, unsigned long offset)
>
> Usually we don't split parameters between lines.
Ok
Thanks,
Yury
On Mon, Sep 19, 2022 at 6:46 AM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> > +#define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \
> > +({ \
[..]
> > +out: \
>
> I dunno if GCC expression limits the scope of goto labels
No. Labels are function-global by default. If you need block-scope for
them, you need to declare them explicitly in that block before use with
"__label__".
> but on the safe side you can add a prefix to it, so it becomes:
>
> FIND_NEXT_BIT_out:
That doesn't really help, since if you were to use the macro twice,
you'd still get a name clash.
That said, I'm not convinced any of this matters, since these macros
aren't supposed to be used anywhere else, and in fact, they aren't
even in any header file that would allow anybody else to use them.
So I think all the normal "make macros safe" rules are simply
irrelevant for these cases - despite the readable name, these macros
are local special cases for code generation and avoiding duplication,
not generic "use this macro to find a bit".
So it's one thing if a macro is in a header file to be used by random
code. It's a different thing entirely if it's a specialized local
macro for a local issue, that nobody else is ever going to even see.
So I don't think it would be wrong to use __label__ to block-scope it,
or to use a longer name, but I also don't think it's really required.
It's not exactly super-common, but we have various cases of macros
that generate full (or partial) function bodies in various places,
where the macro does various things that should *never* be done in a
"regular" macro that gets used by normal code.
You can see one class of this with something like
git grep '^static.*##.*(.*\\$' -- '*.c'
but to *really* go blind, see the "SYSCALL_DEFINE*()" macros in
<linux/syscalls.h>.
Those will mess with your mind, and go "whoever wrote those macros
needs to be institutionalized". They do some impressive things,
including creating local functions _and_ starting a new function
definition where the actual code then isn't part of the macro, but
actually just continues where the macro was used.
Which is all very natural and actually looks quite nice and readable
in the places that use it, with users looking like
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
int fd;
struct pid *p;
...
which is all pretty legible. But there's no question that that macro
violates every single "normal" macro rule.
The FIND_NEXT_BIT() macro in comparison is pretty tame.
Linus
On Mon, Sep 19, 2022 at 08:23:00AM -0700, Linus Torvalds wrote:
> On Mon, Sep 19, 2022 at 6:46 AM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
> >
> > > +#define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \
> > > +({ \
> [..]
> > > +out: \
> >
> > I dunno if GCC expression limits the scope of goto labels
>
> No. Labels are function-global by default. If you need block-scope for
> them, you need to declare them explicitly in tha block before use with
> "__label__".
>
> > but on the safe side you can add a prefix to it, so it becomes:
> >
> > FIND_NEXT_BIT_out:
>
> That doesn't really help, since if you were to use the macro twice,
> you'd still get a name clash.
>
> That said, I'm not convinced any of this matters, since these macros
> aren't supposed to be used anywhere else, and in fact, they aren't
> even in any header file that would allow anybody else to use them.
>
> So I think all the normal "make macros safe" rules are simply
> irrelevant for these cases - despite the readable name, these macros
> are local special cases for code generation and avoiding duplication,
> not generic "use this macro to find a bit".
>
> So it's one thing if a macro is in a header file to be used by random
> code. It's a different thing entirely if it's a specialized local
> macro for a local issue, that nobody else is ever going to even see.
>
> So I don't think it would be wrong to use __label__ to block-scope it,
> or to use a longer name, but I also don't think it's really required.
>
> It's not exactly super-common, but we have various cases of macros
> that generate full (or partial) function bodies in various places,
> where the macro does various things that should *never* be done in a
> "regular" macro that gets used by normal code.
>
> You can see one class of this with something like
>
> git grep '^static.*##.*(.*\\$' -- '*.c'
>
> but to *really* go blind, see the "SYSCALL_DEFINE*()" macros in
> <linux/syscalls.h>.
>
> Those will mess with your mind, and go "whoever wrote those macros
> needs to be institutionalized". They do some impressive things,
> including creating local functions _and_ starting a new function
> definition where the actual code then isn't part of the macro, but
> actually just continues where the macro was used.
>
> Which is all very natural and actually looks quite nice and readable
> in the places that use it, with users looking like
>
> SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> {
> int fd;
> struct pid *p;
> ...
>
> which is all pretty legible. But there's no question that that macro
> violates every single "normal" macro rule.
>
> The FIND_NEXT_BIT() macro in comparison is pretty tame.
Thanks for the elaboration. It makes a lot of sense and is TIL material
for me.
--
With Best Regards,
Andy Shevchenko
On 14/09/22 19:07, Yury Norov wrote:
> Over the past couple years, the function _find_next_bit() was extended
> with parameters that modify its behavior to implement and- zero- and le-
> flavors. The parameters are passed at compile time, but current design
> prevents a compiler from optimizing out the conditionals.
>
> As find_next_bit() API grows, I expect that more parameters will be added.
> Current design would require more conditional code in _find_next_bit(),
> which would bloat the helper even more and make it barely readable.
>
> This patch replaces _find_next_bit() with a macro FIND_NEXT_BIT, and adds
> a set of wrappers, so that the compile-time optimizations become possible.
>
> The common logic is moved to the new macro, and all flavors may be
> generated by providing a FETCH macro parameter, like in this example:
>
> #define FIND_NEXT_BIT(FETCH, MUNGE, size, start) ...
>
> find_next_xornot_and_bit(addr1, addr2, addr3, size, start)
> {
> return FIND_NEXT_BIT(addr1[idx] ^ ~addr2[idx] & addr3[idx],
> /* nop */, size, start);
> }
>
> The FETCH may be of any complexity, as soon as it only refers the bitmap(s)
> and an iterator idx.
>
> MUNGE is here to support _le code generation for BE builds. May be
> empty.
>
> I ran find_bit_benchmark 16 times on top of 6.0-rc2 and 16 times on top
> of 6.0-rc2 + this series. The results for kvm/x86_64 are:
>
> v6.0-rc2 Optimized Difference Z-score
> Random dense bitmap ns ns ns %
> find_next_bit: 787735 670546 117189 14.9 3.97
> find_next_zero_bit: 777492 664208 113284 14.6 10.51
> find_last_bit: 830925 687573 143352 17.3 2.35
> find_first_bit: 3874366 3306635 567731 14.7 1.84
> find_first_and_bit: 40677125 37739887 2937238 7.2 1.36
> find_next_and_bit: 347865 304456 43409 12.5 1.35
>
> Random sparse bitmap
> find_next_bit: 19816 14021 5795 29.2 6.10
> find_next_zero_bit: 1318901 1223794 95107 7.2 1.41
> find_last_bit: 14573 13514 1059 7.3 6.92
> find_first_bit: 1313321 1249024 64297 4.9 1.53
> find_first_and_bit: 8921 8098 823 9.2 4.56
> find_next_and_bit: 9796 7176 2620 26.7 5.39
>
> Where the statistics is significant (z-score > 3), the improvement
> is ~15%.
>
> According to the bloat-o-meter, the Image size is 10-11K less:
>
> x86_64/defconfig:
> add/remove: 32/14 grow/shrink: 61/782 up/down: 6344/-16521 (-10177)
>
> arm64/defconfig:
> add/remove: 3/2 grow/shrink: 50/714 up/down: 608/-11556 (-10948)
>
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
© 2016 - 2026 Red Hat, Inc.