* Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Sun, 27 Apr 2025 at 12:17, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> >
> > ffs/fls are commonly found inside loops where x is the loop condition
> > too. Therefore, using statically_true() to provide a form without the
> > zero compatibility turns out to be a win.
>
> We already have the version without the zero capability - it's just
> called "__ffs()" and "__fls()", and performance-critical code uses
> those.
>
> So fls/ffs are the "standard" library functions that have to handle
> zero, and add that stupid "+1" because that interface was designed by
> some Pascal person who doesn't understand that we start counting from
> 0.
>
> Standards bodies: "companies aren't sending their best people".
>
> But it's silly that we then spend effort on magic cmov in inline asm
> on those things when it's literally the "don't use this version unless
> you don't actually care about performance" case.
>
> I don't think it would be wrong to just make the x86-32 code just do
> the check against zero ahead of time - in C.
>
> And yes, that will generate some extra code - you'll test for zero
> before, and then the caller might also test for a zero result that
> then results in another test for zero that can't actually happen (but
> the compiler doesn't know that). But I suspect that on the whole, it
> is likely to generate better code anyway just because the compiler
> sees that first test and can DTRT.
>
> UNTESTED patch applied in case somebody wants to play with this. It
> removes 10 lines of silly code, and along with them that 'cmov' use.
>
> Anybody?
Makes sense - it seems to boot here, but I only did some very light
testing.
There's a minor text size increase on x86-32 defconfig, GCC 14.2.0:
text data bss dec hex filename
16577728 7598826 1744896 25921450 18b87aa vmlinux.before
16577908 7598838 1744896 25921642 18b886a vmlinux.after
bloat-o-meter output:
add/remove: 2/1 grow/shrink: 201/189 up/down: 5681/-3486 (2195)
Patch with changelog and your SOB added attached. Does it look good to
you?
Thanks,
Ingo
================>
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 28 Apr 2025 08:38:35 +0200
Subject: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Don't do the complicated and probably questionable BS*L+CMOVZL
asm() optimization in variable_ffs() and fls(): performance-critical
code is already using __ffs() and __fls() that use sane interfaces
close to the machine instruction ABI. Check ahead for zero in C.
There's a minor text size increase on x86-32 defconfig:
text data bss dec hex filename
16577728 7598826 1744896 25921450 18b87aa vmlinux.before
16577908 7598838 1744896 25921642 18b886a vmlinux.after
bloat-o-meter output:
add/remove: 2/1 grow/shrink: 201/189 up/down: 5681/-3486 (2195)
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/x86/include/asm/bitops.h | 22 ++++++----------------
1 file changed, 6 insertions(+), 16 deletions(-)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 100413aff640..6061c87f14ac 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -321,15 +321,10 @@ static __always_inline int variable_ffs(int x)
asm("bsfl %1,%0"
: "=r" (r)
: ASM_INPUT_RM (x), "0" (-1));
-#elif defined(CONFIG_X86_CMOV)
- asm("bsfl %1,%0\n\t"
- "cmovzl %2,%0"
- : "=&r" (r) : "rm" (x), "r" (-1));
#else
- asm("bsfl %1,%0\n\t"
- "jnz 1f\n\t"
- "movl $-1,%0\n"
- "1:" : "=r" (r) : "rm" (x));
+ if (!x)
+ return 0;
+ asm("bsfl %1,%0" : "=r" (r) : "rm" (x));
#endif
return r + 1;
}
@@ -378,15 +373,10 @@ static __always_inline int fls(unsigned int x)
asm("bsrl %1,%0"
: "=r" (r)
: ASM_INPUT_RM (x), "0" (-1));
-#elif defined(CONFIG_X86_CMOV)
- asm("bsrl %1,%0\n\t"
- "cmovzl %2,%0"
- : "=&r" (r) : "rm" (x), "rm" (-1));
#else
- asm("bsrl %1,%0\n\t"
- "jnz 1f\n\t"
- "movl $-1,%0\n"
- "1:" : "=r" (r) : "rm" (x));
+ if (!x)
+ return 0;
+ asm("bsrl %1,%0" : "=r" (r) : "rm" (x));
#endif
return r + 1;
}
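To see why the up-front C test tends to fold away, here is a minimal,
hypothetical caller (the function name is made up for illustration, not
taken from the patch):

	/*
	 * The compiler sees both zero tests - the caller's and the one
	 * now inside variable_ffs()/fls() - can prove the inner one
	 * dead, and emits a single branch.
	 */
	static unsigned int first_set_or_zero(unsigned int mask)
	{
		if (!mask)
			return 0;
		return ffs(mask);	/* mask != 0 is known here */
	}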
* Ingo Molnar <mingo@kernel.org> wrote:
> > UNTESTED patch applied in case somebody wants to play with this. It
> > removes 10 lines of silly code, and along with them that 'cmov' use.
> >
> > Anybody?
>
> Makes sense - it seems to boot here, but I only did some very light
> testing.
>
> There's a minor text size increase on x86-32 defconfig, GCC 14.2.0:
>
> text data bss dec hex filename
> 16577728 7598826 1744896 25921450 18b87aa vmlinux.before
> 16577908 7598838 1744896 25921642 18b886a vmlinux.after
>
> bloat-o-meter output:
>
> add/remove: 2/1 grow/shrink: 201/189 up/down: 5681/-3486 (2195)
And once we remove 486, I think we can do the optimization below to
just assume the output doesn't get clobbered by BS*L in the zero-case,
right?
In the text size space it's a substantial optimization on x86-32
defconfig:
text data bss dec hex filename
16,577,728 7598826 1744896 25921450 18b87aa vmlinux.vanilla # CMOV+BS*L
16,577,908 7598838 1744896 25921642 18b886a vmlinux.linus_patch # if()+BS*L
16,573,568 7602922 1744896 25921386 18b876a vmlinux.noclobber # BS*L
Thanks,
Ingo
---
arch/x86/include/asm/bitops.h | 20 ++------------------
1 file changed, 2 insertions(+), 18 deletions(-)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 6061c87f14ac..e3e94a806656 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -308,24 +308,16 @@ static __always_inline int variable_ffs(int x)
{
int r;
-#ifdef CONFIG_X86_64
/*
* AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
* dest reg is undefined if x==0, but their CPU architect says its
* value is written to set it to the same as before, except that the
* top 32 bits will be cleared.
- *
- * We cannot do this on 32 bits because at the very least some
- * 486 CPUs did not behave this way.
*/
asm("bsfl %1,%0"
: "=r" (r)
: ASM_INPUT_RM (x), "0" (-1));
-#else
- if (!x)
- return 0;
- asm("bsfl %1,%0" : "=r" (r) : "rm" (x));
-#endif
+
return r + 1;
}
@@ -360,24 +352,16 @@ static __always_inline int fls(unsigned int x)
if (__builtin_constant_p(x))
return x ? 32 - __builtin_clz(x) : 0;
-#ifdef CONFIG_X86_64
/*
* AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
* dest reg is undefined if x==0, but their CPU architect says its
* value is written to set it to the same as before, except that the
* top 32 bits will be cleared.
- *
- * We cannot do this on 32 bits because at the very least some
- * 486 CPUs did not behave this way.
*/
asm("bsrl %1,%0"
: "=r" (r)
: ASM_INPUT_RM (x), "0" (-1));
-#else
- if (!x)
- return 0;
- asm("bsrl %1,%0" : "=r" (r) : "rm" (x));
-#endif
+
return r + 1;
}
On Mon, 28 Apr 2025 at 00:05, Ingo Molnar <mingo@kernel.org> wrote:
>
> And once we remove 486, I think we can do the optimization below to
> just assume the output doesn't get clobbered by BS*L in the zero-case,
> right?
We probably can't, because who knows what "Pentium" CPUs are out there.
Or even if Pentium really does get it right. I doubt we have any
developers with an original Pentium around.
So just leave the "we don't know what the CPU result is for zero"
unless we get some kind of official confirmation.
Linus
On April 28, 2025 9:14:45 AM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Mon, 28 Apr 2025 at 00:05, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > And once we remove 486, I think we can do the optimization below to
> > just assume the output doesn't get clobbered by BS*L in the zero-case,
> > right?
>
> We probably can't, because who knows what "Pentium" CPUs are out there.
>
> Or even if Pentium really does get it right. I doubt we have any
> developers with an original Pentium around.
>
> So just leave the "we don't know what the CPU result is for zero"
> unless we get some kind of official confirmation.
>
>           Linus

If anyone knows for sure, it is probably Christian Ludloff. However,
there was a *huge* tightening of the formal ISA when the i686 was
introduced (family=6) and I really believe this was part of it.

I also really don't trust that family=5 really means conforms to
undocumented P5 behavior, e.g. for Quark.
On 28/04/2025 10:38 pm, H. Peter Anvin wrote:
> If anyone knows for sure, it is probably Christian Ludloff. However,
> there was a *huge* tightening of the formal ISA when the i686 was
> introduced (family=6) and I really believe this was part of it.
>
> I also really don't trust that family=5 really means conforms to
> undocumented P5 behavior, e.g. for Quark.

https://www.sandpile.org/x86/flags.htm

That's a lot of "can't even characterise the result" in the P5.

Looking at the P4 column, that is clearly what the latest SDM has
retroactively declared to be architectural.

~Andrew
On April 28, 2025 5:12:13 PM PDT, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> https://www.sandpile.org/x86/flags.htm
>
> That's a lot of "can't even characterise the result" in the P5.
>
> Looking at the P4 column, that is clearly what the latest SDM has
> retroactively declared to be architectural.

Yes, but it wasn't about flags here.

Now, question: can we just use __builtin_*() for these? I think gcc
should always generate inline code for these on x86.
On 29/04/2025 3:00 am, H. Peter Anvin wrote:
> Yes, but it wasn't about flags here.
>
> Now, question: can we just use __builtin_*() for these? I think gcc
> should always generate inline code for these on x86.

Yes, it does generate inline code: https://godbolt.org/z/M45oo5rqT

GCC does it branchlessly, but cannot optimise based on context.

Clang can optimise based on context, except the 0 case it seems.

Moving to -march=i686 causes both GCC and Clang to switch to CMOV and
create branchless code, but GCC still can't optimise out the CMOV based
on context.

~Andrew
On April 28, 2025 7:25:17 PM PDT, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> Yes, it does generate inline code: https://godbolt.org/z/M45oo5rqT
>
> GCC does it branchlessly, but cannot optimise based on context.
>
> Clang can optimise based on context, except the 0 case it seems.
>
> Moving to -march=i686 causes both GCC and Clang to switch to CMOV and
> create branchless code, but GCC still can't optimise out the CMOV based
> on context.

Maybe a gcc bug report would be better than trying to hack around this
in the kernel?
On 29/04/2025 4:13 am, H. Peter Anvin wrote:
> Maybe a gcc bug report would be better than trying to hack around this
> in the kernel?

I tried that. (The thread started as a question around
__builtin_constant_p() but did grow to cover __builtin_ffs().)

https://gcc.gnu.org/pipermail/gcc/2024-March/243465.html

~Andrew
On Tue, 29 Apr 2025 at 07:38, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> I tried that. (The thread started as a question around
> __builtin_constant_p() but did grow to cover __builtin_ffs().)
Maybe we could do something like
#define ffs(x) \
(statically_true((x) != 0) ? __ffs(x)+1 : __builtin_ffs(x))
which uses our "statically_true()" helper that is actually fairly good
at the whole "let the compiler tell us that it knows that value cannot
be zero"
I didn't check what code that generated, but I've seen gcc do well on
that statically_true() thing in the past.
Then we can just remove our current variable_ffs() thing entirely,
because we now depend on our (good) __ffs() and the builtin being
"good enough" for the bad case.
(And do the same thing for fls() too, of course)
Linus
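For reference, statically_true() is, modulo details, a one-liner along
these lines, and the loop below is the shape of code it is meant to
help (both are sketches, not verbatim kernel code):

	/* Sketch of include/linux/compiler.h's statically_true(). */
	#define statically_true(x) (__builtin_constant_p(x) && (x))

	/*
	 * Inside the loop body the compiler can prove bits != 0, so
	 * statically_true((bits) != 0) selects the __ffs(bits)+1 arm
	 * and the zero handling disappears.
	 */
	while (bits) {
		unsigned int bit = ffs(bits);

		bits &= bits - 1;	/* clear the lowest set bit */
		/* ... use bit ... */
	}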
On 29/04/2025 7:05 pm, Linus Torvalds wrote:
> On Tue, 29 Apr 2025 at 07:38, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> I tried that. (The thread started as a question around
>> __builtin_constant_p() but did grow to cover __builtin_ffs().)
> Maybe we could do something like
>
> #define ffs(x) \
> (statically_true((x) != 0) ? __ffs(x)+1 : __builtin_ffs(x))
>
> which uses our "statically_true()" helper that is actually fairly good
> at the whole "let the compiler tell us that it knows that value cannot
> be zero"
>
> I didn't check what code that generated, but I've seen gcc do well on
> that statically_true() thing in the past.
>
> Then we can just remove our current variable_ffs() thing entirely,
> because we now depend on our (good) __ffs() and the builtin being
> "good enough" for the bad case.
That would improve code generation for 32bit, but generally regress 64bit.
Preloading the destination register with -1 is better than the CMOV form
emitted by the builtin; BSF's habit of conditionally not writing the
destination register *is* a CMOV of sorts.
When I cleaned this up in Xen, there were several factors where I
thought improvements could be made.
Having both ffs() and __ffs(), where the latter is undefined in a common
case, is a trap waiting for an unwary programmer. I have no particular
love for ffs() being off-by-one from normal, but it is well defined for all
inputs.
Also, leaving the constant folding to the arch-optimised form means that
it often gets forgotten. Therefore, I rearranged everything to have
this be common:
static always_inline attr_const unsigned int ffs(unsigned int x)
{
    if ( __builtin_constant_p(x) )
        return __builtin_ffs(x);

#ifdef arch_ffs
    return arch_ffs(x);
#else
    return generic_ffsl(x);
#endif
}
with most architectures implementing arch_ffs as:
#define arch_ffs(x) ((x) ? 1 + __builtin_ctz(x) : 0)
and x86 as:
static always_inline unsigned int arch_ffs(unsigned int x)
{
    unsigned int r;

    if ( __builtin_constant_p(x > 0) && x > 0 )
    {
        /*
         * A common code pattern is:
         *
         *     while ( bits )
         *     {
         *         bit = ffs(bits);
         *         ...
         *
         * and the optimiser really can work with the knowledge of x being
         * non-zero without knowing its exact value, in which case we don't
         * need to compensate for BSF's corner cases.  Otherwise...
         */
        asm ( "bsf %[val], %[res]"
              : [res] "=r" (r)
              : [val] "rm" (x) );
    }
    else
    {
        /*
         * ... the AMD manual states that BSF won't modify the destination
         * register if x=0.  The Intel manual states that the result is
         * undefined, but the architects have said that the register is
         * written back with its old value (zero extended as normal).
         */
        asm ( "bsf %[val], %[res]"
              : [res] "=r" (r)
              : [val] "rm" (x), "[res]" (-1) );
    }

    return r + 1;
}
#define arch_ffs arch_ffs
and finally, providing compatibility for the other forms as:
#define __ffs(x) (ffs(x) - 1)
The end result is fewer APIs to implement in arch-specific code, and the
removal of undefined behaviour.
That said, I don't envy anyone wanting to try and untangle this in
Linux, even if consensus were to agree on it as an approach.
~Andrew
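generic_ffsl() isn't shown above; as a self-contained sketch (illustrative
only, not Xen's exact code), such a portable fallback usually looks like a
binary search for the lowest set bit:

#include <limits.h>

/* Returns the 1-based position of the lowest set bit, 0 if none. */
static inline unsigned int generic_ffsl(unsigned long x)
{
	unsigned int r = 1;

	if (!x)
		return 0;
#if ULONG_MAX > 0xffffffffUL
	if (!(x & 0xffffffffUL)) { x >>= 32; r += 32; }
#endif
	if (!(x & 0xffff)) { x >>= 16; r += 16; }
	if (!(x & 0xff))   { x >>= 8;  r += 8;  }
	if (!(x & 0xf))    { x >>= 4;  r += 4;  }
	if (!(x & 0x3))    { x >>= 2;  r += 2;  }
	if (!(x & 0x1))    { r += 1; }

	return r;
}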
On Tue, 29 Apr 2025 at 12:13, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> That would improve code generation for 32bit, but generally regress 64bit.
>
> Preloading the destination register with -1 is better than the CMOV form
> emitted by the builtin; BSF's habit of conditionally not writing the
> destination register *is* a CMOV of sorts.
Right you are. So we'd need to do this just for the x86-32 case. Oh
well. Ugly, but still prettier than what we have now, I guess.
Linus
On April 29, 2025 1:12:48 PM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote: >On Tue, 29 Apr 2025 at 12:13, Andrew Cooper <andrew.cooper3@citrix.com> wrote: >> >> That would improve code generation for 32bit, but generally regress 64bit. >> >> Preloading the destination register with -1 is better than the CMOV form >> emitted by the builtin; BSF's habit of conditionally not writing the >> destination register *is* a CMOV of sorts. > >Right you are. So we'd need to do this just for the x86-32 case. Oh >well. Ugly, but still prettier than what we have now, I guess. > > Linus Could you file a gcc bug? Gcc shouldn't generate worse code than inline asm ...
On Tue, 29 Apr 2025 at 14:24, H. Peter Anvin <hpa@zytor.com> wrote:
>
> Could you file a gcc bug? Gcc shouldn't generate worse code than inline asm ...
Honestly, I've given up on that idea.
That's basically what the bug I reported about clts was - gcc
generating worse code than inline asm.
But why would we use gcc builtins at all if we have inline asm that is
better than what some versions of gcc generate? The inline asm works
for all versions.
Anyway, attached is a test file that seems to generate basically
"optimal" code. It's not a kernel patch, but a standalone C file for
testing with a couple of stupid test-cases to make sure that it gets
the constant argument case right, and that it optimizes the "I know
it's not zero" case.
That is why it also uses that "#if __SIZEOF_LONG__ == 4" instead of
something like CONFIG_64BIT.
So it's a proof-of-concept thing, and it does seem to be fairly simple.
The "important" parts are just the
#define variable_ffs(x) \
	(statically_true((x)!=0) ? __ffs(x)+1 : do_variable_ffs(x))
#define constant_ffs(x) __builtin_ffs(x)
#define ffs(x) \
	(__builtin_constant_p(x) ? constant_ffs(x) : variable_ffs(x))
lines which end up picking the right choice. The rest is either the
"reimplement the kernel infrastructure for testing" or the trivial
do_variable_ffs() thing.
I just did
gcc -m32 -O2 -S -fomit-frame-pointer t.c
(with, and without that -m32) and looked at the result to see that it
looks sane. No *actual* testing.
Linus
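The t.c attachment isn't reproduced in this thread; from the follow-ups,
do_variable_ffs() presumably looked roughly like this (a reconstruction,
note the 'rep' prefix that Andrew flags in the next message):

	/* Preload -1 so the x==0 case needs no branch or cmov. */
	static inline int do_variable_ffs(unsigned int x)
	{
		int r = -1;

		asm("rep bsf %1,%0"	/* the 'rep' is the bug: see below */
		    : "+r" (r)
		    : "rm" (x));
		return r + 1;
	}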
On 29/04/2025 10:53 pm, Linus Torvalds wrote:
> I just did
>
>     gcc -m32 -O2 -S -fomit-frame-pointer t.c
>
> (with, and without that -m32) and looked at the result to see that it
> looks sane. No *actual* testing.

do_variable_ffs() doesn't quite work.

REP BSF is TZCNT, and unconditionally writes its output operand, and
defeats the attempt to preload with -1.

Drop the REP prefix, and it should work as intended.

~Andrew
On Tue, 29 Apr 2025 at 14:59, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> do_variable_ffs() doesn't quite work.
>
> REP BSF is TZCNT, and unconditionally writes its output operand, and
> defeats the attempt to preload with -1.
>
> Drop the REP prefix, and it should work as intended.
Bah. That's what I get for just doing it blindly without actually
looking at the kernel source. I just copied the __ffs() thing - and
there the 'rep' is not for the zero case - which we don't care about -
but because tzcnt performs better on newer CPUs.
So you're obviously right.
Linus
On 29/04/2025 11:04 pm, Linus Torvalds wrote:
> On Tue, 29 Apr 2025 at 14:59, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> > do_variable_ffs() doesn't quite work.
> >
> > REP BSF is TZCNT, and unconditionally writes its output operand, and
> > defeats the attempt to preload with -1.
> >
> > Drop the REP prefix, and it should work as intended.
> Bah. That's what I get for just doing it blindly without actually
> looking at the kernel source. I just copied the __ffs() thing - and
> there the 'rep' is not for the zero case - which we don't care about -
> but because tzcnt performs better on newer CPUs.

Oh, I didn't realise there was a perf difference too, but Agner Fog
agrees.

Apparently in Zen4, BSF and friends have become a single uop with a
sensible latency. Previously they were 6-8 uops with a latency to match.

Intel appear to have had them as a single uop since Sandy Bridge, so
quite a long time now.

~Andrew
On Tue, 29 Apr 2025 at 15:22, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> Oh, I didn't realise there was a perf difference too, but Agner Fog
> agrees.
The perf difference is exactly because of the issue where the non-rep
one acts as a cmov, and has basically two inputs (the bits to test in
the source, and the old value of the result register)
I guess it's not "fundamental", but tzcnt is basically a bit simpler
for hardware to implement, and the non-rep legacy bsf instruction
basically has a dependency on the previous value of the result
register.
So even when it's a single uop for both cases, that single uop can be
slower for the bsf because of the (typically false) dependency and
extra pressure on the rename registers.
Linus
On April 29, 2025 3:04:30 PM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> Bah. That's what I get for just doing it blindly without actually
> looking at the kernel source. I just copied the __ffs() thing - and
> there the 'rep' is not for the zero case - which we don't care about -
> but because tzcnt performs better on newer CPUs.
>
> So you're obviously right.
>
>           Linus

Yeah, the encoding of lzcnt was a real mistake, because the outputs are
different (so you still need instruction-specific postprocessing.)
On Mon, 28 Apr 2025 at 19:00, H. Peter Anvin <hpa@zytor.com> wrote:
>
> Now, question: can we just use __builtin_*() for these? I think gcc should always generate inline code for these on x86.
Yeah, I think we can just use __builtin_ffs() directly and get rid of
all the games.
Not all gcc builtins are great: I did a bugzilla about gcc ctzl some time ago:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106471
but for ffs() that does sound like it's the simplest thing to do.
Linus
* Ingo Molnar <mingo@kernel.org> wrote:
> And once we remove 486, I think we can do the optimization below to
> just assume the output doesn't get clobbered by BS*L in the
> zero-case, right?
>
> In the text size space it's a substantial optimization on x86-32
> defconfig:
>
> text data bss dec hex filename
> 16,577,728 7598826 1744896 25921450 18b87aa vmlinux.vanilla # CMOV+BS*L
> 16,577,908 7598838 1744896 25921642 18b886a vmlinux.linus_patch # if()+BS*L
> 16,573,568 7602922 1744896 25921386 18b876a vmlinux.noclobber # BS*L
And BTW, *that* is a price that all of non-486 x86-32 was paying for
486 support...
And, just out of intellectual curiosity, I also tried to measure the
code generation price of the +1 standards-quirk in the fls()/ffs()
interface as well:
text data bss dec hex filename
16,577,728 7598826 1744896 25921450 18b87aa vmlinux.vanilla # CMOV+BS*L
16,577,908 7598838 1744896 25921642 18b886a vmlinux.linus_patch # if()+BS*L
16,573,568 7602922 1744896 25921386 18b876a vmlinux.noclobber # BS*L
..........
16,573,552 7602922 1744896 25921370 18b875a vmlinux.broken # BROKEN: 0 baseline instead of 1
... and unless I messed up the patch, it seems to have a surprisingly
low impact - maybe because the compiler can amortize its cost by
adjusting all dependent code mostly at build time, so the +1 doesn't
end up being generated most of the time?
Thanks,
Ingo
===============================>
This broken patch is broken: it intentionally breaks the ffs()/fls()
interface in an attempt to measure the code generation effects of
interface details.
NOT-Signed-off-by: <anyone@anywhere.anytime>
---
arch/x86/include/asm/bitops.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index e3e94a806656..21707696bafe 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -318,7 +318,7 @@ static __always_inline int variable_ffs(int x)
: "=r" (r)
: ASM_INPUT_RM (x), "0" (-1));
- return r + 1;
+ return r;
}
/**
@@ -362,7 +362,7 @@ static __always_inline int fls(unsigned int x)
: "=r" (r)
: ASM_INPUT_RM (x), "0" (-1));
- return r + 1;
+ return r;
}
/**
On Mon, 28 Apr 2025 at 00:14, Ingo Molnar <mingo@kernel.org> wrote:
>
> And, just out of intellectual curiosity, I also tried to measure the
> code generation price of the +1 standards-quirk in the fls()/ffs()
> interface as well:
>
> ... and unless I messed up the patch, it seems to have a surprisingly
> low impact - maybe because the compiler can amortize its cost by
> adjusting all dependent code mostly at build time, so the +1 doesn't
> end up being generated most of the time?
No, I think one issue is that most users actually end up subtracting
one from the return value of 'ffs()', because the "bit #0 returns 1"
semantics of the standard ffs() function really is insane.
It's not just that it doesn't match sane hardware, it's also that it
doesn't match sane *users*. If bit #0 is set, people want '0', so they
typically subtract 1.
So when you stop adding one, you aren't actually removing code -
you're often adding it.
Just see how many hits you get from
git grep '\<ffs(.*).*-.*1'
which is obviously not a very precise pattern, but just look at the
output and see just *how* common that "subtract one" thing is.
I really don't understand how anybody *ever* thought that the whole
"return one bigger" was a good idea for ffs().
Sure, I understand that zero is special and needs a special return
value, but returning a negative value would have been pretty simple
(or just do what our bitops finding functions do, which is to return
past the end, which is often convenient but does tend to make the
error condition check a bit more complex).
Anyway, the fact that so many users subtract one means that your "look
at the size of the binary" model doesn't work. You're counting both
the wins (when that addition doesn't happen) and the losses (when the
"subtract one in the user" happens).
So the "+1" doesn't cause much code generation - as long as it's done
by the compiler that can also undo it - but it's just a horrid user
interface.
But maybe people really were poisoned by the Pascal mindset. Or maybe
it was invented by some ancient Roman who hadn't heard of the concept
of zero. Who knows?
Linus
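The two idioms side by side, for illustration only:

	unsigned int bit;

	bit = ffs(mask) - 1;	/* standard ffs(): undo the +1 by hand;
				   yields -1 (as int) when mask == 0 */
	bit = __ffs(mask);	/* kernel-style: already 0-based, but
				   mask must be nonzero */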
* Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Mon, 28 Apr 2025 at 00:14, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > And, just out of intellectual curiosity, I also tried to measure the
> > code generation price of the +1 standards-quirk in the fls()/ffs()
> > interface as well:
> >
> > ... and unless I messed up the patch, it seems to have a surprisingly
> > low impact - maybe because the compiler can amortize its cost by
> > adjusting all dependent code mostly at build time, so the +1 doesn't
> > end up being generated most of the time?
>
> No, I think one issue is that most users actually end up subtracting
> one from the return value of 'ffs()', because the "bit #0 returns 1"
> semantics of the standard ffs() function really is insane.
>
> It's not just that it doesn't match sane hardware, it's also that it
> doesn't match sane *users*. If bit #0 is set, people want '0', so they
> typically subtract 1.
>
> So when you stop adding one, you aren't actually removing code -
> you're often adding it.
>
> Just see how many hits you get from
>
> git grep '\<ffs(.*).*-.*1'
>
> which is obviously not a very precise pattern, but just look at the
> output and see just *how* common that "subtract one" thing is.
>
> I really don't understand how anybody *ever* thought that the whole
> "return one bigger" was a good idea for ffs().
Yeah. No argument from me that it's a badly thought out interface - I
was just surprised that it doesn't seem to impact performance as badly
as I expected. I have to add that a lot of work went into absorbing the
negative effects of the ffs()/fls() interfaces:
starship:~/tip> git grep -Ee '__ffs\(|__fls\(' | wc -l
1055
So it impacts code quality negatively, which is arguably the worse side
effect.
> But maybe people really were poisoned by the Pascal mindset. Or maybe
> it was invented by some ancient Roman who hadn't heard of the concept
> of zero. Who knows?
Hey, ancient Romans didn't even have the concept of *whitespaces* and
punctuation to begin with:
https://historyofinformation.com/images/Vergilius_Augusteus,_Georgica_121.jpg
Lazy stonemasons the lot of them.
Romans were the worst ever coders too I suspect. What have the Romans
ever done for us??
Ingo
On April 29, 2025 3:08:03 AM PDT, Ingo Molnar <mingo@kernel.org> wrote:
>
>* Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> On Mon, 28 Apr 2025 at 00:14, Ingo Molnar <mingo@kernel.org> wrote:
>> >
>> > And, just out of intellectual curiosity, I also tried to measure the
>> > code generation price of the +1 standards-quirk in the fls()/ffs()
>> > interface as well:
>> >
>> > ... and unless I messed up the patch, it seems to have a surprisingly
>> > low impact - maybe because the compiler can amortize its cost by
>> > adjusting all dependent code mostly at build time, so the +1 doesn't
>> > end up being generated most of the time?
>>
>> No, I think one issue is that most users actually end up subtracting
>> one from the return value of 'ffs()', because the "bit #0 returns 1"
>> semantics of the standard ffs() function really is insane.
>>
>> It's not just that it doesn't match sane hardware, it's also that it
>> doesn't match sane *users*. If bit #0 is set, people want '0', so they
>> typically subtract 1.
>>
>> So when you stop adding one, you aren't actually removing code -
>> you're often adding it.
>>
>> Just see how many hits you get from
>>
>> git grep '\<ffs(.*).*-.*1'
>>
>> which is obviously not a very precise pattern, but just look at the
>> output and see just *how* common that "subtract one" thing is.
>>
>> I really don't understand how anybody *ever* thought that the whole
>> "return one bigger" was a good idea for ffs().
>
>Yeah. No argument from me that it's a badly thought out interface - I
>was just surprised that it doesn't seem to impact performance as badly
>as I expected. I have to add that a lot of work went into absorbing the
>negative effects of the ffs()/fls() interfaces:
>
> starship:~/tip> git grep -Ee '__ffs\(|__fls\(' | wc -l
> 1055
>
>So it impacts code quality negatively, which is arguably the worse side
>effect.
>
>> But maybe people really were poisoned by the Pascal mindset. Or maybe
>> it was invented by some ancient Roman who hadn't heard of the concept
>> of zero. Who knows?
>
>Hey, ancient Romans didn't even have the concept of *whitespaces* and
>punctuation to begin with:
>
> https://historyofinformation.com/images/Vergilius_Augusteus,_Georgica_121.jpg
>
>Lazy stonemasons the lot of them.
>
>Romans were the worst ever coders too I suspect. What have the Romans
>ever done for us??
>
> Ingo
Well, they did build the roads... 🤣
Roman numerals obviously were not a positional system, but at least in accounting they used N for zero (nulla.)
ROMANI•ITE•DOMVM
On April 28, 2025 12:14:40 AM PDT, Ingo Molnar <mingo@kernel.org> wrote:
> And once we remove 486, I think we can do the optimization below to
> just assume the output doesn't get clobbered by BS*L in the
> zero-case, right?

My recollection was that you can't assume that even for 586; that it is
only safe for 686, but it has been a long time...
On Mon, Apr 28, 2025, at 09:14, Ingo Molnar wrote:
>
> ... and unless I messed up the patch, it seems to have a surprisingly
> low impact - maybe because the compiler can amortize its cost by
> adjusting all dependent code mostly at build time, so the +1 doesn't
> end up being generated most of the time?
Is there any reason we can't just use the compiler-builtins directly
like we do on other architectures, at least for 32-bit?
Looking at a couple of vmlinux objects confirms the findings from
fdb6649ab7c1 ("x86/asm/bitops: Use __builtin_ctzl() to evaluate
constant expressions"): using the built-in helpers is slightly better
than the current asm code for all 32-bit targets with both gcc and
clang. It's also better
for 64-bit targets with clang, but not with gcc, where the inline
asm often saves a cmov but in other cases the compiler finds an
even better instruction sequence.
Arnd
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index eebbc8889e70..bdeae9a497e5 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -246,184 +246,18 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
variable_test_bit(nr, addr);
}
-static __always_inline unsigned long variable__ffs(unsigned long word)
-{
- asm("tzcnt %1,%0"
- : "=r" (word)
- : ASM_INPUT_RM (word));
- return word;
-}
-
-/**
- * __ffs - find first set bit in word
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-#define __ffs(word) \
- (__builtin_constant_p(word) ? \
- (unsigned long)__builtin_ctzl(word) : \
- variable__ffs(word))
-
-static __always_inline unsigned long variable_ffz(unsigned long word)
-{
- return variable__ffs(~word);
-}
-
-/**
- * ffz - find first zero bit in word
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-#define ffz(word) \
- (__builtin_constant_p(word) ? \
- (unsigned long)__builtin_ctzl(~word) : \
- variable_ffz(word))
-
-/*
- * __fls: find last set bit in word
- * @word: The word to search
- *
- * Undefined if no set bit exists, so code should check against 0 first.
- */
-static __always_inline unsigned long __fls(unsigned long word)
-{
- if (__builtin_constant_p(word))
- return BITS_PER_LONG - 1 - __builtin_clzl(word);
-
- asm("bsr %1,%0"
- : "=r" (word)
- : ASM_INPUT_RM (word));
- return word;
-}
-
#undef ADDR
-#ifdef __KERNEL__
-static __always_inline int variable_ffs(int x)
-{
- int r;
-
-#ifdef CONFIG_X86_64
- /*
- * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
- * dest reg is undefined if x==0, but their CPU architect says its
- * value is written to set it to the same as before, except that the
- * top 32 bits will be cleared.
- *
- * We cannot do this on 32 bits because at the very least some
- * 486 CPUs did not behave this way.
- */
- asm("bsfl %1,%0"
- : "=r" (r)
- : ASM_INPUT_RM (x), "0" (-1));
-#elif defined(CONFIG_X86_CMOV)
- asm("bsfl %1,%0\n\t"
- "cmovzl %2,%0"
- : "=&r" (r) : "rm" (x), "r" (-1));
-#else
- asm("bsfl %1,%0\n\t"
- "jnz 1f\n\t"
- "movl $-1,%0\n"
- "1:" : "=r" (r) : "rm" (x));
-#endif
- return r + 1;
-}
-
-/**
- * ffs - find first set bit in word
- * @x: the word to search
- *
- * This is defined the same way as the libc and compiler builtin ffs
- * routines, therefore differs in spirit from the other bitops.
- *
- * ffs(value) returns 0 if value is 0 or the position of the first
- * set bit if value is nonzero. The first (least significant) bit
- * is at position 1.
- */
-#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))
-
-/**
- * fls - find last set bit in word
- * @x: the word to search
- *
- * This is defined in a similar way as the libc and compiler builtin
- * ffs, but returns the position of the most significant set bit.
- *
- * fls(value) returns 0 if value is 0 or the position of the last
- * set bit if value is nonzero. The last (most significant) bit is
- * at position 32.
- */
-static __always_inline int fls(unsigned int x)
-{
- int r;
-
- if (__builtin_constant_p(x))
- return x ? 32 - __builtin_clz(x) : 0;
-
-#ifdef CONFIG_X86_64
- /*
- * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
- * dest reg is undefined if x==0, but their CPU architect says its
- * value is written to set it to the same as before, except that the
- * top 32 bits will be cleared.
- *
- * We cannot do this on 32 bits because at the very least some
- * 486 CPUs did not behave this way.
- */
- asm("bsrl %1,%0"
- : "=r" (r)
- : ASM_INPUT_RM (x), "0" (-1));
-#elif defined(CONFIG_X86_CMOV)
- asm("bsrl %1,%0\n\t"
- "cmovzl %2,%0"
- : "=&r" (r) : "rm" (x), "rm" (-1));
-#else
- asm("bsrl %1,%0\n\t"
- "jnz 1f\n\t"
- "movl $-1,%0\n"
- "1:" : "=r" (r) : "rm" (x));
-#endif
- return r + 1;
-}
-
-/**
- * fls64 - find last set bit in a 64-bit word
- * @x: the word to search
- *
- * This is defined in a similar way as the libc and compiler builtin
- * ffsll, but returns the position of the most significant set bit.
- *
- * fls64(value) returns 0 if value is 0 or the position of the last
- * set bit if value is nonzero. The last (most significant) bit is
- * at position 64.
- */
-#ifdef CONFIG_X86_64
-static __always_inline int fls64(__u64 x)
-{
- int bitpos = -1;
-
- if (__builtin_constant_p(x))
- return x ? 64 - __builtin_clzll(x) : 0;
- /*
- * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
- * dest reg is undefined if x==0, but their CPU architect says its
- * value is written to set it to the same as before.
- */
- asm("bsrq %1,%q0"
- : "+r" (bitpos)
- : ASM_INPUT_RM (x));
- return bitpos + 1;
-}
-#else
+#include <asm-generic/bitops/__ffs.h>
+#include <asm-generic/bitops/ffz.h>
+#include <asm-generic/bitops/builtin-fls.h>
+#include <asm-generic/bitops/__fls.h>
#include <asm-generic/bitops/fls64.h>
-#endif
-#include <asm-generic/bitops/sched.h>
+#include <asm-generic/bitops/sched.h>
+#include <asm-generic/bitops/builtin-ffs.h>
#include <asm/arch_hweight.h>
-
#include <asm-generic/bitops/const_hweight.h>
#include <asm-generic/bitops/instrumented-atomic.h>
@@ -434,5 +268,4 @@ static __always_inline int fls64(__u64 x)
#include <asm-generic/bitops/ext2-atomic-setbit.h>
-#endif /* __KERNEL__ */
#endif /* _ASM_X86_BITOPS_H */
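For reference, the asm-generic wrappers the diff above switches to are
tiny; approximately (quoted from memory, so check the tree):

	/* include/asm-generic/bitops/builtin-ffs.h */
	#define ffs(x) __builtin_ffs(x)

	/* include/asm-generic/bitops/builtin-fls.h */
	static __always_inline int fls(unsigned int x)
	{
		return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
	}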