[PATCH v3 1/5] m68k/bitops: force inlining of all bitops functions
Posted by Vincent Mailhol 2 years ago

The inline keyword does not guarantee that the compiler will actually
inline a function. When the goal is to force inlining,
__always_inline should be preferred instead.
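
For illustration, a minimal sketch of the difference (the macro
expansion below is roughly what include/linux/compiler_types.h
defines; the two helpers are hypothetical):

  /* What the kernel's __always_inline expands to: */
  #define __always_inline inline __attribute__((__always_inline__))

  /* The compiler remains free to emit this one out of line: */
  static inline int bit_hint(unsigned long x) { return x & 1; }

  /* This one must be inlined; GCC raises an error if it cannot: */
  static __always_inline int bit_force(unsigned long x) { return x & 1; }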

On an allyesconfig, with GCC 13.2.1, it saves roughly 5 KB.

  $ size --format=GNU vmlinux.before vmlinux.after
        text       data        bss      total filename
    60449738   70975612    2288988  133714338 vmlinux.before
    60446534   70972412    2289596  133708542 vmlinux.after

Reference: commit 8dd5032d9c54 ("x86/asm/bitops: Force inlining of
test_and_set_bit and friends")
Link: https://git.kernel.org/torvalds/c/8dd5032d9c54

Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
---
 arch/m68k/include/asm/bitops.h | 87 +++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 43 deletions(-)

diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
index 14c64a6f1217..ae0457d582b8 100644
--- a/arch/m68k/include/asm/bitops.h
+++ b/arch/m68k/include/asm/bitops.h
@@ -28,7 +28,7 @@
  *	So we use the best form possible on a given platform.
  */
 
-static inline void bset_reg_set_bit(int nr, volatile unsigned long *vaddr)
+static __always_inline void bset_reg_set_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 
@@ -38,7 +38,7 @@ static inline void bset_reg_set_bit(int nr, volatile unsigned long *vaddr)
 		: "memory");
 }
 
-static inline void bset_mem_set_bit(int nr, volatile unsigned long *vaddr)
+static __always_inline void bset_mem_set_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 
@@ -47,7 +47,7 @@ static inline void bset_mem_set_bit(int nr, volatile unsigned long *vaddr)
 		: "di" (nr & 7));
 }
 
-static inline void bfset_mem_set_bit(int nr, volatile unsigned long *vaddr)
+static __always_inline void bfset_mem_set_bit(int nr, volatile unsigned long *vaddr)
 {
 	__asm__ __volatile__ ("bfset %1{%0:#1}"
 		:
@@ -71,7 +71,7 @@ arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 	set_bit(nr, addr);
 }
 
-static inline void bclr_reg_clear_bit(int nr, volatile unsigned long *vaddr)
+static __always_inline void bclr_reg_clear_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 
@@ -81,7 +81,7 @@ static inline void bclr_reg_clear_bit(int nr, volatile unsigned long *vaddr)
 		: "memory");
 }
 
-static inline void bclr_mem_clear_bit(int nr, volatile unsigned long *vaddr)
+static __always_inline void bclr_mem_clear_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 
@@ -90,7 +90,7 @@ static inline void bclr_mem_clear_bit(int nr, volatile unsigned long *vaddr)
 		: "di" (nr & 7));
 }
 
-static inline void bfclr_mem_clear_bit(int nr, volatile unsigned long *vaddr)
+static __always_inline void bfclr_mem_clear_bit(int nr, volatile unsigned long *vaddr)
 {
 	__asm__ __volatile__ ("bfclr %1{%0:#1}"
 		:
@@ -114,7 +114,7 @@ arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 	clear_bit(nr, addr);
 }
 
-static inline void bchg_reg_change_bit(int nr, volatile unsigned long *vaddr)
+static __always_inline void bchg_reg_change_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 
@@ -124,7 +124,7 @@ static inline void bchg_reg_change_bit(int nr, volatile unsigned long *vaddr)
 		: "memory");
 }
 
-static inline void bchg_mem_change_bit(int nr, volatile unsigned long *vaddr)
+static __always_inline void bchg_mem_change_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 
@@ -133,7 +133,7 @@ static inline void bchg_mem_change_bit(int nr, volatile unsigned long *vaddr)
 		: "di" (nr & 7));
 }
 
-static inline void bfchg_mem_change_bit(int nr, volatile unsigned long *vaddr)
+static __always_inline void bfchg_mem_change_bit(int nr, volatile unsigned long *vaddr)
 {
 	__asm__ __volatile__ ("bfchg %1{%0:#1}"
 		:
@@ -160,8 +160,8 @@ arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 #define arch_test_bit generic_test_bit
 #define arch_test_bit_acquire generic_test_bit_acquire
 
-static inline int bset_reg_test_and_set_bit(int nr,
-					    volatile unsigned long *vaddr)
+static __always_inline int
+bset_reg_test_and_set_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 	char retval;
@@ -173,8 +173,8 @@ static inline int bset_reg_test_and_set_bit(int nr,
 	return retval;
 }
 
-static inline int bset_mem_test_and_set_bit(int nr,
-					    volatile unsigned long *vaddr)
+static __always_inline int
+bset_mem_test_and_set_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 	char retval;
@@ -185,8 +185,8 @@ static inline int bset_mem_test_and_set_bit(int nr,
 	return retval;
 }
 
-static inline int bfset_mem_test_and_set_bit(int nr,
-					     volatile unsigned long *vaddr)
+static __always_inline int
+bfset_mem_test_and_set_bit(int nr, volatile unsigned long *vaddr)
 {
 	char retval;
 
@@ -213,8 +213,8 @@ arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 	return test_and_set_bit(nr, addr);
 }
 
-static inline int bclr_reg_test_and_clear_bit(int nr,
-					      volatile unsigned long *vaddr)
+static __always_inline int
+bclr_reg_test_and_clear_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 	char retval;
@@ -226,8 +226,8 @@ static inline int bclr_reg_test_and_clear_bit(int nr,
 	return retval;
 }
 
-static inline int bclr_mem_test_and_clear_bit(int nr,
-					      volatile unsigned long *vaddr)
+static __always_inline int
+bclr_mem_test_and_clear_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 	char retval;
@@ -238,8 +238,8 @@ static inline int bclr_mem_test_and_clear_bit(int nr,
 	return retval;
 }
 
-static inline int bfclr_mem_test_and_clear_bit(int nr,
-					       volatile unsigned long *vaddr)
+static __always_inline int
+bfclr_mem_test_and_clear_bit(int nr, volatile unsigned long *vaddr)
 {
 	char retval;
 
@@ -266,8 +266,8 @@ arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 	return test_and_clear_bit(nr, addr);
 }
 
-static inline int bchg_reg_test_and_change_bit(int nr,
-					       volatile unsigned long *vaddr)
+static __always_inline int
+bchg_reg_test_and_change_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 	char retval;
@@ -279,8 +279,8 @@ static inline int bchg_reg_test_and_change_bit(int nr,
 	return retval;
 }
 
-static inline int bchg_mem_test_and_change_bit(int nr,
-					       volatile unsigned long *vaddr)
+static __always_inline int
+bchg_mem_test_and_change_bit(int nr, volatile unsigned long *vaddr)
 {
 	char *p = (char *)vaddr + (nr ^ 31) / 8;
 	char retval;
@@ -291,8 +291,8 @@ static inline int bchg_mem_test_and_change_bit(int nr,
 	return retval;
 }
 
-static inline int bfchg_mem_test_and_change_bit(int nr,
-						volatile unsigned long *vaddr)
+static __always_inline int
+bfchg_mem_test_and_change_bit(int nr, volatile unsigned long *vaddr)
 {
 	char retval;
 
@@ -319,8 +319,8 @@ arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 	return test_and_change_bit(nr, addr);
 }
 
-static inline bool xor_unlock_is_negative_byte(unsigned long mask,
-		volatile unsigned long *p)
+static __always_inline bool
+xor_unlock_is_negative_byte(unsigned long mask, volatile unsigned long *p)
 {
 #ifdef CONFIG_COLDFIRE
 	__asm__ __volatile__ ("eorl %1, %0"
@@ -350,8 +350,8 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask,
 #include <asm-generic/bitops/ffz.h>
 #else
 
-static inline int find_first_zero_bit(const unsigned long *vaddr,
-				      unsigned size)
+static __always_inline int
+find_first_zero_bit(const unsigned long *vaddr, unsigned size)
 {
 	const unsigned long *p = vaddr;
 	int res = 32;
@@ -376,8 +376,8 @@ static inline int find_first_zero_bit(const unsigned long *vaddr,
 }
 #define find_first_zero_bit find_first_zero_bit
 
-static inline int find_next_zero_bit(const unsigned long *vaddr, int size,
-				     int offset)
+static __always_inline int
+find_next_zero_bit(const unsigned long *vaddr, int size, int offset)
 {
 	const unsigned long *p = vaddr + (offset >> 5);
 	int bit = offset & 31UL, res;
@@ -406,7 +406,8 @@ static inline int find_next_zero_bit(const unsigned long *vaddr, int size,
 }
 #define find_next_zero_bit find_next_zero_bit
 
-static inline int find_first_bit(const unsigned long *vaddr, unsigned size)
+static __always_inline int
+find_first_bit(const unsigned long *vaddr, unsigned size)
 {
 	const unsigned long *p = vaddr;
 	int res = 32;
@@ -431,8 +432,8 @@ static inline int find_first_bit(const unsigned long *vaddr, unsigned size)
 }
 #define find_first_bit find_first_bit
 
-static inline int find_next_bit(const unsigned long *vaddr, int size,
-				int offset)
+static __always_inline int
+find_next_bit(const unsigned long *vaddr, int size, int offset)
 {
 	const unsigned long *p = vaddr + (offset >> 5);
 	int bit = offset & 31UL, res;
@@ -465,7 +466,7 @@ static inline int find_next_bit(const unsigned long *vaddr, int size,
  * ffz = Find First Zero in word. Undefined if no zero exists,
  * so code should check against ~0UL first..
  */
-static inline unsigned long ffz(unsigned long word)
+static __always_inline unsigned long ffz(unsigned long word)
 {
 	int res;
 
@@ -488,7 +489,7 @@ static inline unsigned long ffz(unsigned long word)
  */
 #if (defined(__mcfisaaplus__) || defined(__mcfisac__)) && \
 	!defined(CONFIG_M68000)
-static inline unsigned long __ffs(unsigned long x)
+static __always_inline unsigned long __ffs(unsigned long x)
 {
 	__asm__ __volatile__ ("bitrev %0; ff1 %0"
 		: "=d" (x)
@@ -496,7 +497,7 @@ static inline unsigned long __ffs(unsigned long x)
 	return x;
 }
 
-static inline int ffs(int x)
+static __always_inline int ffs(int x)
 {
 	if (!x)
 		return 0;
@@ -518,7 +519,7 @@ static inline int ffs(int x)
  *	the libc and compiler builtin ffs routines, therefore
  *	differs in spirit from the above ffz (man ffs).
  */
-static inline int ffs(int x)
+static __always_inline int ffs(int x)
 {
 	int cnt;
 
@@ -528,7 +529,7 @@ static inline int ffs(int x)
 	return 32 - cnt;
 }
 
-static inline unsigned long __ffs(unsigned long x)
+static __always_inline unsigned long __ffs(unsigned long x)
 {
 	return ffs(x) - 1;
 }
@@ -536,7 +537,7 @@ static inline unsigned long __ffs(unsigned long x)
 /*
  *	fls: find last bit set.
  */
-static inline int fls(unsigned int x)
+static __always_inline int fls(unsigned int x)
 {
 	int cnt;
 
@@ -546,7 +547,7 @@ static inline int fls(unsigned int x)
 	return 32 - cnt;
 }
 
-static inline unsigned long __fls(unsigned long x)
+static __always_inline unsigned long __fls(unsigned long x)
 {
 	return fls(x) - 1;
 }
-- 
2.25.1
Re: [PATCH v3 1/5] m68k/bitops: force inlining of all bitops functions
Posted by Geert Uytterhoeven 1 year, 11 months ago
Hi Vincent,

Thanks for your patch!

On Sun, Dec 17, 2023 at 8:13 AM Vincent Mailhol
<mailhol.vincent@wanadoo.fr> wrote:
> The inline keyword does not guarantee that the compiler will actually
> inline a function. When the goal is to force inlining,
> __always_inline should be preferred instead.
>
> On an allyesconfig, with GCC 13.2.1, it saves roughly 5 KB.
>
>   $ size --format=GNU vmlinux.before vmlinux.after
>         text       data        bss      total filename
>     60449738   70975612    2288988  133714338 vmlinux.before
>     60446534   70972412    2289596  133708542 vmlinux.after

With gcc 9.5.0-1ubuntu1~22.04, the figures are completely different
(i.e. a size increase):

allyesconfig:

      text       data        bss      total filename
  58878600   72415994    2283652  133578246 vmlinux.before
  58882250   72419706    2284004  133585960 vmlinux.after

atari_defconfig:

      text       data        bss      total filename
   4112060    1579862     151680    5843602 vmlinux-v6.7-rc8
   4117008    1579350     151680    5848038 vmlinux-v6.7-rc8-1-m68k-bitops-force-inlining

The next patch offsets that for allyesconfig, but not for atari_defconfig.

> Reference: commit 8dd5032d9c54 ("x86/asm/bitops: Force inlining of
> test_and_set_bit and friends")

Please don't split lines containing tags.
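
For example, the Reference tag above would be kept on a single line,
even though it exceeds the usual wrap limit:

  Reference: commit 8dd5032d9c54 ("x86/asm/bitops: Force inlining of test_and_set_bit and friends")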

> Link: https://git.kernel.org/torvalds/c/8dd5032d9c54
>
> Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>

Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
Re: [PATCH v3 1/5] m68k/bitops: force inlining of all bitops functions
Posted by Vincent MAILHOL 1 year, 11 months ago
On Tue. 2 Jan 2024 at 19:28, Geert Uytterhoeven <geert@linux-m68k.org> wrote:
>
> Hi Vincent,
>
> Thanks for your patch!

Thanks for the review and for running the benchmark.

> On Sun, Dec 17, 2023 at 8:13 AM Vincent Mailhol
> <mailhol.vincent@wanadoo.fr> wrote:
> > The inline keyword does not guarantee that the compiler will actually
> > inline a function. When the goal is to force inlining,
> > __always_inline should be preferred instead.
> >
> > On an allyesconfig, with GCC 13.2.1, it saves roughly 5 KB.
> >
> >   $ size --format=GNU vmlinux.before vmlinux.after
> >         text       data        bss      total filename
> >     60449738   70975612    2288988  133714338 vmlinux.before
> >     60446534   70972412    2289596  133708542 vmlinux.after
>
> With gcc 9.5.0-1ubuntu1~22.04, the figures are completely different
> (i.e. a size increase):

Those results are not normal: there should not be such a big
discrepancy between two versions of the same compiler. I double-checked
everything and found out that I made a mistake when computing the
figures. I am not sure what exactly happened, but at some point the
ASLR seed (or some other similar randomization feature) got reset, and
so the decrease I witnessed was just a "lucky roll".

After rerunning the benchmark (making sure to keep every seed), I got
results similar to yours:

        text       data        bss      total filename
    60449738   70975356    2288988  133714082 vmlinux_allyesconfig.before_this_series
    60446534   70979068    2289596  133715198 vmlinux_allyesconfig.after_first_patch
    60429746   70979132    2291676  133700554 vmlinux_allyesconfig.final_second_patch

Note that there is still some randomness in the data segment, as shown
in these other benchmarks I ran:

        text       data        bss      total filename
    60449738   70976124    2288988  133714850 vmlinux_allyesconfig.before_this_series
    60446534   70980092    2289596  133716222 vmlinux_allyesconfig.after_first_patch
    60429746   70979388    2291676  133700810 vmlinux_allyesconfig.after_second_patch

        text       data        bss      total filename
    60449738   70975612    2288988  133714338 vmlinux_allyesconfig.before_this_series
    60446534   70980348    2289596  133716478 vmlinux_allyesconfig.after_first_patch
    60429746   70979900    2291676  133701322 vmlinux_allyesconfig.after_second_patch

But the error margin is within 1K.

So, in short, I inlined some functions which I shouldn't have. I am
preparing a v4 in which I will only inline the bit-find functions
(namely: __ffs(), ffs(), ffz(), __fls(), fls() and fls64()). Here are
the new figures:

        text       data        bss      total filename
    60453552   70955485    2288620  133697657 vmlinux_allyesconfig.before_this_series
    60450304   70953085    2289260  133692649 vmlinux_allyesconfig.after_first_patch
    60433536   70952637    2291340  133677513 vmlinux_allyesconfig.after_second_patch

N.B. The new figures are from after a rebase, so do not try to compare
them with the previous benchmarks. I will send the v4 soon, after I
finish updating the patch descriptions and double-checking things.

Concerning the other functions in bitops.h, there may be other ones
worth a __always_inline. But I will narrow the scope of this series
to the bit-find functions only. If a good samaritan wants to
investigate the other functions, go ahead!
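
As a rough illustration of why forced inlining pays off for the
bit-find helpers, here is a hypothetical sketch (a GCC builtin stands
in for the m68k asm): once the helper is truly inlined, a constant
argument folds away at compile time:

  /* Hypothetical stand-in for an arch ffs() implementation: */
  static __always_inline int my_ffs(int x)
  {
          return x ? __builtin_ctz(x) + 1 : 0;
  }

  int eight(void)
  {
          return my_ffs(0x80);    /* the whole call folds to "return 8" */
  }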

Yours sincerely,
Vincent Mailhol




> allyesconfig:
>
>       text       data        bss      total filename
>   58878600   72415994    2283652  133578246 vmlinux.before
>   58882250   72419706    2284004  133585960 vmlinux.after
>
> atari_defconfig:
>
>       text       data        bss      total filename
>    4112060    1579862     151680    5843602 vmlinux-v6.7-rc8
>    4117008    1579350     151680    5848038 vmlinux-v6.7-rc8-1-m68k-bitops-force-inlining
>
> The next patch offsets that for allyesconfig, but not for atari_defconfig.
>
> > Reference: commit 8dd5032d9c54 ("x86/asm/bitops: Force inlining of
> > test_and_set_bit and friends")
>
> Please don't split lines containing tags.
>
> > Link: https://git.kernel.org/torvalds/c/8dd5032d9c54
> >
> > Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
>
> Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
>
> Gr{oetje,eeting}s,
>
>                         Geert
>
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
>
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like that.
>                                 -- Linus Torvalds