Perform constant-folding unconditionally, rather than having it implemented
inconsistently between architectures.
Confirm the expected behaviour with compile time and boot time tests.
For non-constant inputs, use arch_ffs() if provided but fall back to
generic_ffsl() if not. In particular, RISC-V doesn't have a builtin that
works in all configurations.
For x86, rename ffs() to arch_ffs() and adjust the prototype.
For PPC, __builtin_ctz() is 1/3 of the size of the transform to
generic_fls(). Drop the definition entirely. ARM too benefits in the general
case by using __builtin_ctz(), but less dramatically because it is using
optimised asm().
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
CC: Stefano Stabellini <sstabellini@kernel.org>
CC: Julien Grall <julien@xen.org>
CC: Volodymyr Babchuk <Volodymyr_Babchuk@epam.com>
CC: Bertrand Marquis <bertrand.marquis@arm.com>
CC: Michal Orzel <michal.orzel@amd.com>
CC: Oleksii Kurochko <oleksii.kurochko@gmail.com>
CC: Shawn Anastasio <sanastasio@raptorengineering.com>
CC: consulting@bugseng.com <consulting@bugseng.com>
CC: Simone Ballarin <simone.ballarin@bugseng.com>
CC: Federico Serafini <federico.serafini@bugseng.com>
CC: Nicola Vetrini <nicola.vetrini@bugseng.com>
v2:
* Fall back to generic, not builtin.
* Extend the testing with multi-bit values.
* Use always_inline for x86
* Defer x86 optimisation to a later change
---
xen/arch/arm/include/asm/bitops.h | 2 +-
xen/arch/ppc/include/asm/bitops.h | 2 +-
xen/arch/x86/include/asm/bitops.h | 3 ++-
xen/common/Makefile | 1 +
xen/common/bitops.c | 19 +++++++++++++++++++
xen/include/xen/bitops.h | 17 +++++++++++++++++
6 files changed, 41 insertions(+), 3 deletions(-)
create mode 100644 xen/common/bitops.c
diff --git a/xen/arch/arm/include/asm/bitops.h b/xen/arch/arm/include/asm/bitops.h
index ec1cf7b9b323..a88ec2612e16 100644
--- a/xen/arch/arm/include/asm/bitops.h
+++ b/xen/arch/arm/include/asm/bitops.h
@@ -157,7 +157,7 @@ static inline int fls(unsigned int x)
}
-#define ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })
+#define arch_ffs(x) ((x) ? 1 + __builtin_ctz(x) : 0)
#define ffsl(x) ({ unsigned long __t = (x); flsl(ISOLATE_LSB(__t)); })
/**
diff --git a/xen/arch/ppc/include/asm/bitops.h b/xen/arch/ppc/include/asm/bitops.h
index ab692d01717b..5c36a6cc0ce3 100644
--- a/xen/arch/ppc/include/asm/bitops.h
+++ b/xen/arch/ppc/include/asm/bitops.h
@@ -173,7 +173,7 @@ static inline int __test_and_clear_bit(int nr, volatile void *addr)
#define flsl(x) generic_flsl(x)
#define fls(x) generic_flsl(x)
-#define ffs(x) ({ unsigned int t_ = (x); fls(t_ & -t_); })
+#define arch_ffs(x) ((x) ? 1 + __builtin_ctz(x) : 0)
#define ffsl(x) ({ unsigned long t_ = (x); flsl(t_ & -t_); })
/**
diff --git a/xen/arch/x86/include/asm/bitops.h b/xen/arch/x86/include/asm/bitops.h
index 5a71afbc89d5..122767fc0d10 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -430,7 +430,7 @@ static inline int ffsl(unsigned long x)
return (int)r+1;
}
-static inline int ffs(unsigned int x)
+static always_inline unsigned int arch_ffs(unsigned int x)
{
int r;
@@ -440,6 +440,7 @@ static inline int ffs(unsigned int x)
"1:" : "=r" (r) : "rm" (x));
return r + 1;
}
+#define arch_ffs arch_ffs
/**
* fls - find last bit set
diff --git a/xen/common/Makefile b/xen/common/Makefile
index d512cad5243f..21a4fb4c7166 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -1,5 +1,6 @@
obj-$(CONFIG_ARGO) += argo.o
obj-y += bitmap.o
+obj-bin-$(CONFIG_DEBUG) += bitops.init.o
obj-$(CONFIG_GENERIC_BUG_FRAME) += bug.o
obj-$(CONFIG_HYPFS_CONFIG) += config_data.o
obj-$(CONFIG_CORE_PARKING) += core_parking.o
diff --git a/xen/common/bitops.c b/xen/common/bitops.c
new file mode 100644
index 000000000000..8c161b8ea7fa
--- /dev/null
+++ b/xen/common/bitops.c
@@ -0,0 +1,19 @@
+#include <xen/bitops.h>
+#include <xen/boot-check.h>
+#include <xen/init.h>
+
+static void __init test_ffs(void)
+{
+ /* unsigned int ffs(unsigned int) */
+ CHECK(ffs, 0, 0);
+ CHECK(ffs, 1, 1);
+ CHECK(ffs, 3, 1);
+ CHECK(ffs, 7, 1);
+ CHECK(ffs, 6, 2);
+ CHECK(ffs, 0x80000000U, 32);
+}
+
+static void __init __constructor test_bitops(void)
+{
+ test_ffs();
+}
diff --git a/xen/include/xen/bitops.h b/xen/include/xen/bitops.h
index cd405df96180..f7e90a2893a5 100644
--- a/xen/include/xen/bitops.h
+++ b/xen/include/xen/bitops.h
@@ -31,6 +31,23 @@ unsigned int __pure generic_flsl(unsigned long x);
#include <asm/bitops.h>
+/*
+ * Find First/Last Set bit (all forms).
+ *
+ * Bits are labelled from 1. Returns 0 if given 0.
+ */
+static always_inline __pure unsigned int ffs(unsigned int x)
+{
+ if ( __builtin_constant_p(x) )
+ return __builtin_ffs(x);
+
+#ifdef arch_ffs
+ return arch_ffs(x);
+#else
+ return generic_ffsl(x);
+#endif
+}
+
/* --------------------- Please tidy below here --------------------- */
#ifndef find_next_bit
--
2.30.2
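The CHECK() macro used in bitops.c above comes from xen/boot-check.h, which
is introduced elsewhere in this series and not shown here.  A minimal sketch
of what such a boot-time check could look like, assuming a panic-on-mismatch
design (names and details are illustrative only, not the real macro):

/*
 * Hypothetical sketch of CHECK(): evaluate fn(val) at boot and panic on
 * mismatch with the expected result.  Assumes Xen's panic(); the real
 * macro lives in xen/boot-check.h.
 */
#define CHECK(fn, val, res)                                     \
    do {                                                        \
        unsigned long real_ = (fn)(val);                        \
                                                                \
        if ( real_ != (unsigned long)(res) )                    \
            panic("%s(%s): expected %s, got %lu\n",             \
                  #fn, #val, #res, real_);                      \
    } while ( 0 )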
On Fri, 24 May 2024, Andrew Cooper wrote:
> Perform constant-folding unconditionally, rather than having it
> implemented inconsistently between architectures.
> [...]
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

This patch made me realize that we should add __builtin_ctz,
__builtin_constant_p and always_inline to
docs/misra/C-language-toolchain.rst, as they don't seem to be currently
documented and they are not part of the C standard.

Patch welcome :-)

Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
On 2024-05-31 03:14, Stefano Stabellini wrote:
> This patch made me realize that we should add __builtin_ctz,
> __builtin_constant_p and always_inline to
> docs/misra/C-language-toolchain.rst, as they don't seem to be currently
> documented and they are not part of the C standard.
>
> Patch welcome :-)

I can send a patch for the builtins.  I think that for attributes it was
decided to document the use of the __attribute__ token, rather than
listing all the attributes used by Xen.

--
Nicola Vetrini, BSc
Software Engineer, BUGSENG srl (https://bugseng.com)
On 31/05/2024 7:56 am, Nicola Vetrini wrote:
> I can send a patch for the builtins.

That's very kind of you.

In total by the end of this series, we've got __builtin_constant_p()
(definitely used elsewhere already), and __builtin_{ffs,ctz,clz}{,l}()
(3x primitives, 2x input types).

If we're going for a list of the primitive operations, let's add
__builtin_popcnt{,l}() too right away, because if it weren't for the
4.19 code freeze, I'd have cleaned up the hweight() helpers too.

~Andrew
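The hweight() cleanup alluded to above would presumably reduce to the
builtins in the same way.  A minimal sketch, assuming plain wrappers (the
hweight32/hweight64 names follow Xen convention, but the bodies here are
illustrative only, not the actual Xen implementation):

#include <stdint.h>

/* Population count via the builtins; __builtin_popcount{,ll} exist in
 * both GCC and Clang and are well-defined for all inputs, including 0. */
static inline unsigned int hweight32(uint32_t x)
{
    return (unsigned int)__builtin_popcount(x);
}

static inline unsigned int hweight64(uint64_t x)
{
    return (unsigned int)__builtin_popcountll(x);
}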
On 31/05/2024 9:34 am, Andrew Cooper wrote:
> If we're going for a list of the primitive operations, let's add
> __builtin_popcnt{,l}() too right away, because if it weren't for the
> 4.19 code freeze, I'd have cleaned up the hweight() helpers too.

Oh, and it's worth noting that __builtin_{ctz,clz}{,l}() have explicit
UB if given an input of 0.  (Sadly, even on architectures where the
underlying instruction emitted is safe with a 0 input. [0])

This is why every patch in the series using them checks for nonzero
input.

UBSAN (with an adequate compiler) will instrument this, and Xen has
__ubsan_handle_invalid_builtin() to diagnose these.

~Andrew

[0] It turns out that Clang has a 2-argument form of the builtin with
the second being the "value forwarded" in case the first is 0.  I've not
investigated whether GCC has the same.
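To make that guard concrete: every use in the series takes the shape
below, a minimal sketch mirroring the arch_ffs() macros in the patch (the
function name is illustrative only):

/*
 * __builtin_ctz(0) is undefined behaviour, so the zero case is handled
 * explicitly before the builtin is reached, exactly as the arch_ffs()
 * macros in the patch do.
 */
static inline unsigned int ffs_sketch(unsigned int x)
{
    return x ? 1 + (unsigned int)__builtin_ctz(x) : 0;
}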
On 2024-05-31 10:48, Andrew Cooper wrote:
> Oh, and it's worth noting that __builtin_{ctz,clz}{,l}() have explicit
> UB if given an input of 0.  (Sadly, even on architectures where the
> underlying instruction emitted is safe with a 0 input. [0])
> [...]

Hmm, maybe then it's best if builtins are listed in a separate section
in that file, for ease of browsing.  Xen also uses (conditionally)
__builtin_mem*, __builtin_str* and others, so if all nonstandard
intrinsics should be listed (as opposed to the ones in some way relevant
for MISRA violations, which was the original scope of the document),
then a subset of the attached list would be needed.

There are a handful only used in ppc, and since the document only covers
x86 and arm, those should be ignored for the time being.

Anyway, I'll send an RFC next week to decide the best route.

--
Nicola Vetrini, BSc
Software Engineer, BUGSENG srl (https://bugseng.com)

$ git grep -E -o "__builtin[a-z_]+" -- xen | cut -d':' -f2 | sort -u
__builtin_bswap
__builtin_clz
__builtin_clzll
__builtin_constant_p
__builtin_ctz
__builtin_ctzll
__builtin_expect
__builtin_frame_address
__builtin_has_attribute
__builtin_memchr
__builtin_memcmp
__builtin_memcpy
__builtin_memmove
__builtin_memset
__builtin_offsetof
__builtin_popcount
__builtin_popcountll
__builtin_prefetch
__builtin_return_address
__builtin_strcasecmp
__builtin_strchr
__builtin_strcmp
__builtin_strlen
__builtin_strncasecmp
__builtin_strncmp
__builtin_strrchr
__builtin_strstr
__builtin_trap
__builtin_types_compatible_p
__builtin_unreachable
__builtin_va_arg
__builtin_va_copy
__builtin_va_end
__builtin_va_list
__builtin_va_start
On 24.05.2024 22:03, Andrew Cooper wrote:
> Perform constant-folding unconditionally, rather than having it
> implemented inconsistently between architectures.
> [...]
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>