[PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C

Ingo Molnar posted 1 patch 7 months, 3 weeks ago
Failed in applying to current master (apply log)
There is a newer version of this series
arch/x86/include/asm/bitops.h | 22 ++++++----------------
1 file changed, 6 insertions(+), 16 deletions(-)
[PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Ingo Molnar 7 months, 3 weeks ago

* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sun, 27 Apr 2025 at 12:17, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> >
> > ffs/fls are commonly found inside loops where x is the loop condition
> > too.  Therefore, using statically_true() to provide a form without the
> > zero compatibility turns out to be a win.
> 
> We already have the version without the zero capability - it's just
> called "__ffs()" and "__fls()", and performance-critical code uses
> those.
> 
> So fls/ffs are the "standard" library functions that have to handle
> zero, and add that stupid "+1" because that interface was designed by
> some Pascal person who doesn't understand that we start counting from
> 0.
> 
> Standards bodies: "companies aren't sending their best people".
> 
> But it's silly that we then spend effort on magic cmov in inline asm
> on those things when it's literally the "don't use this version unless
> you don't actually care about performance" case.
> 
> I don't think it would be wrong to just make the x86-32 code just do
> the check against zero ahead of time - in C.
> 
> And yes, that will generate some extra code - you'll test for zero
> before, and then the caller might also test for a zero result that
> then results in another test for zero that can't actually happen (but
> the compiler doesn't know that). But I suspect that on the whole, it
> is likely to generate better code anyway just because the compiler
> sees that first test and can DTRT.
> 
> UNTESTED patch applied in case somebody wants to play with this. It
> removes 10 lines of silly code, and along with them that 'cmov' use.
> 
> Anybody?

Makes sense - it seems to boot here, but I only did some very light 
testing.

There's a minor text size increase on x86-32 defconfig, GCC 14.2.0:

      text       data        bss         dec        hex    filename
  16577728    7598826    1744896    25921450    18b87aa    vmlinux.before
  16577908    7598838    1744896    25921642    18b886a    vmlinux.after

bloatometer output:

  add/remove: 2/1 grow/shrink: 201/189 up/down: 5681/-3486 (2195)

Patch with changelog and your SOB added attached. Does it look good to 
you?

Thanks,

	Ingo

================>
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 28 Apr 2025 08:38:35 +0200
Subject: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C

Don't do the complicated and probably questionable BS*L+CMOVZL
asm() optimization in variable_ffs() and fls(): performance-critical
code is already using __ffs() and __fls() that use sane interfaces
close to the machine instruction ABI. Check ahead for zero in C.

There's a minor text size increase on x86-32 defconfig:

      text       data        bss         dec        hex    filename
  16577728    7598826    1744896    25921450    18b87aa    vmlinux.before
  16577908    7598838    1744896    25921642    18b886a    vmlinux.after

bloatometer output:

  add/remove: 2/1 grow/shrink: 201/189 up/down: 5681/-3486 (2195)

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/bitops.h | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 100413aff640..6061c87f14ac 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -321,15 +321,10 @@ static __always_inline int variable_ffs(int x)
 	asm("bsfl %1,%0"
 	    : "=r" (r)
 	    : ASM_INPUT_RM (x), "0" (-1));
-#elif defined(CONFIG_X86_CMOV)
-	asm("bsfl %1,%0\n\t"
-	    "cmovzl %2,%0"
-	    : "=&r" (r) : "rm" (x), "r" (-1));
 #else
-	asm("bsfl %1,%0\n\t"
-	    "jnz 1f\n\t"
-	    "movl $-1,%0\n"
-	    "1:" : "=r" (r) : "rm" (x));
+	if (!x)
+		return 0;
+	asm("bsfl %1,%0" : "=r" (r) : "rm" (x));
 #endif
 	return r + 1;
 }
@@ -378,15 +373,10 @@ static __always_inline int fls(unsigned int x)
 	asm("bsrl %1,%0"
 	    : "=r" (r)
 	    : ASM_INPUT_RM (x), "0" (-1));
-#elif defined(CONFIG_X86_CMOV)
-	asm("bsrl %1,%0\n\t"
-	    "cmovzl %2,%0"
-	    : "=&r" (r) : "rm" (x), "rm" (-1));
 #else
-	asm("bsrl %1,%0\n\t"
-	    "jnz 1f\n\t"
-	    "movl $-1,%0\n"
-	    "1:" : "=r" (r) : "rm" (x));
+	if (!x)
+		return 0;
+	asm("bsrl %1,%0" : "=r" (r) : "rm" (x));
 #endif
 	return r + 1;
 }
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Ingo Molnar 7 months, 3 weeks ago
* Ingo Molnar <mingo@kernel.org> wrote:

> > UNTESTED patch applied in case somebody wants to play with this. It
> > removes 10 lines of silly code, and along with them that 'cmov' use.
> > 
> > Anybody?
> 
> Makes sense - it seems to boot here, but I only did some very light 
> testing.
> 
> There's a minor text size increase on x86-32 defconfig, GCC 14.2.0:
> 
>       text       data        bss         dec        hex    filename
>   16577728    7598826    1744896    25921450    18b87aa    vmlinux.before
>   16577908    7598838    1744896    25921642    18b886a    vmlinux.after
> 
> bloatometer output:
> 
>   add/remove: 2/1 grow/shrink: 201/189 up/down: 5681/-3486 (2195)

And once we remove 486, I think we can do the optimization below to 
just assume the output doesn't get clobbered by BS*L in the zero-case, 
right?

In the text size space it's a substantial optimization on x86-32 
defconfig:

        text	   data	       bss	     dec	    hex	filename
  16,577,728    7598826    1744896      25921450        18b87aa vmlinux.vanilla      # CMOV+BS*L
  16,577,908	7598838	   1744896	25921642	18b886a	vmlinux.linus_patch  # if()+BS*L
  16,573,568	7602922	   1744896	25921386	18b876a	vmlinux.noclobber    # BS*L

Thanks,

	Ingo

---
 arch/x86/include/asm/bitops.h | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 6061c87f14ac..e3e94a806656 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -308,24 +308,16 @@ static __always_inline int variable_ffs(int x)
 {
 	int r;
 
-#ifdef CONFIG_X86_64
 	/*
 	 * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
 	 * dest reg is undefined if x==0, but their CPU architect says its
 	 * value is written to set it to the same as before, except that the
 	 * top 32 bits will be cleared.
-	 *
-	 * We cannot do this on 32 bits because at the very least some
-	 * 486 CPUs did not behave this way.
 	 */
 	asm("bsfl %1,%0"
 	    : "=r" (r)
 	    : ASM_INPUT_RM (x), "0" (-1));
-#else
-	if (!x)
-		return 0;
-	asm("bsfl %1,%0" : "=r" (r) : "rm" (x));
-#endif
+
 	return r + 1;
 }
 
@@ -360,24 +352,16 @@ static __always_inline int fls(unsigned int x)
 	if (__builtin_constant_p(x))
 		return x ? 32 - __builtin_clz(x) : 0;
 
-#ifdef CONFIG_X86_64
 	/*
 	 * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
 	 * dest reg is undefined if x==0, but their CPU architect says its
 	 * value is written to set it to the same as before, except that the
 	 * top 32 bits will be cleared.
-	 *
-	 * We cannot do this on 32 bits because at the very least some
-	 * 486 CPUs did not behave this way.
 	 */
 	asm("bsrl %1,%0"
 	    : "=r" (r)
 	    : ASM_INPUT_RM (x), "0" (-1));
-#else
-	if (!x)
-		return 0;
-	asm("bsrl %1,%0" : "=r" (r) : "rm" (x));
-#endif
+
 	return r + 1;
 }
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Linus Torvalds 7 months, 3 weeks ago
On Mon, 28 Apr 2025 at 00:05, Ingo Molnar <mingo@kernel.org> wrote:
>
> And once we remove 486, I think we can do the optimization below to
> just assume the output doesn't get clobbered by BS*L in the zero-case,
> right?

We probably can't, because who knows what "Pentium" CPU's are out there.

Or even if Pentium really does get it right. I doubt we have any
developers with an original Pentium around.

So just leave the "we don't know what the CPU result is for zero"
unless we get some kind of official confirmation.

          Linus
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by H. Peter Anvin 7 months, 3 weeks ago
On April 28, 2025 9:14:45 AM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>On Mon, 28 Apr 2025 at 00:05, Ingo Molnar <mingo@kernel.org> wrote:
>>
>> And once we remove 486, I think we can do the optimization below to
>> just assume the output doesn't get clobbered by BS*L in the zero-case,
>> right?
>
>We probably can't, because who knows what "Pentium" CPU's are out there.
>
>Or even if Pentium really does get it right. I doubt we have any
>developers with an original Pentium around.
>
>So just leave the "we don't know what the CPU result is for zero"
>unless we get some kind of official confirmation.
>
>          Linus

If anyone knows for sure, it is probably Christian Ludloff. However, there was a *huge* tightening of the formal ISA when the i686 was introduced (family=6) and I really believe this was part of it.

I also really don't trust that family=5 really means conforms to undocumented P5 behavior, e.g. for Quark.
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Andrew Cooper 7 months, 3 weeks ago
On 28/04/2025 10:38 pm, H. Peter Anvin wrote:
> On April 28, 2025 9:14:45 AM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>> On Mon, 28 Apr 2025 at 00:05, Ingo Molnar <mingo@kernel.org> wrote:
>>> And once we remove 486, I think we can do the optimization below to
>>> just assume the output doesn't get clobbered by BS*L in the zero-case,
>>> right?
>> We probably can't, because who knows what "Pentium" CPU's are out there.
>>
>> Or even if Pentium really does get it right. I doubt we have any
>> developers with an original Pentium around.
>>
>> So just leave the "we don't know what the CPU result is for zero"
>> unless we get some kind of official confirmation.
>>
>>          Linus
> If anyone knows for sure, it is probably Christian Ludloff. However, there was a *huge* tightening of the formal ISA when the i686 was introduced (family=6) and I really believe this was part of it.
>
> I also really don't trust that family=5 really means conforms to undocumented P5 behavior, e.g. for Quark.

https://www.sandpile.org/x86/flags.htm

That's a lot of "can't even characterise the result" in the P5.

Looking at P4 column, that is clearly what the latest SDM has
retroactively declared to be architectural.

~Andrew
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by H. Peter Anvin 7 months, 3 weeks ago
On April 28, 2025 5:12:13 PM PDT, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>On 28/04/2025 10:38 pm, H. Peter Anvin wrote:
>> On April 28, 2025 9:14:45 AM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>> On Mon, 28 Apr 2025 at 00:05, Ingo Molnar <mingo@kernel.org> wrote:
>>>> And once we remove 486, I think we can do the optimization below to
>>>> just assume the output doesn't get clobbered by BS*L in the zero-case,
>>>> right?
>>> We probably can't, because who knows what "Pentium" CPU's are out there.
>>>
>>> Or even if Pentium really does get it right. I doubt we have any
>>> developers with an original Pentium around.
>>>
>>> So just leave the "we don't know what the CPU result is for zero"
>>> unless we get some kind of official confirmation.
>>>
>>>          Linus
>> If anyone knows for sure, it is probably Christian Ludloff. However, there was a *huge* tightening of the formal ISA when the i686 was introduced (family=6) and I really believe this was part of it.
>>
>> I also really don't trust that family=5 really means conforms to undocumented P5 behavior, e.g. for Quark.
>
>https://www.sandpile.org/x86/flags.htm
>
>That's a lot of "can't even characterise the result" in the P5.
>
>Looking at P4 column, that is clearly what the latest SDM has
>retroactively declared to be architectural.
>
>~Andrew

Yes, but it wasn't about flags here. 

Now, question: can we just use __builtin_*() for these? I think gcc should always generate inline code for these on x86.
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Andrew Cooper 7 months, 3 weeks ago
On 29/04/2025 3:00 am, H. Peter Anvin wrote:
> On April 28, 2025 5:12:13 PM PDT, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 28/04/2025 10:38 pm, H. Peter Anvin wrote:
>>> On April 28, 2025 9:14:45 AM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>>> On Mon, 28 Apr 2025 at 00:05, Ingo Molnar <mingo@kernel.org> wrote:
>>>>> And once we remove 486, I think we can do the optimization below to
>>>>> just assume the output doesn't get clobbered by BS*L in the zero-case,
>>>>> right?
>>>> We probably can't, because who knows what "Pentium" CPU's are out there.
>>>>
>>>> Or even if Pentium really does get it right. I doubt we have any
>>>> developers with an original Pentium around.
>>>>
>>>> So just leave the "we don't know what the CPU result is for zero"
>>>> unless we get some kind of official confirmation.
>>>>
>>>>          Linus
>>> If anyone knows for sure, it is probably Christian Ludloff. However, there was a *huge* tightening of the formal ISA when the i686 was introduced (family=6) and I really believe this was part of it.
>>>
>>> I also really don't trust that family=5 really means conforms to undocumented P5 behavior, e.g. for Quark.
>> https://www.sandpile.org/x86/flags.htm
>>
>> That's a lot of "can't even characterise the result" in the P5.
>>
>> Looking at P4 column, that is clearly what the latest SDM has
>> retroactively declared to be architectural.
>>
>> ~Andrew
> Yes, but it wasn't about flags here. 
>
> Now, question: can we just use __builtin_*() for these? I think gcc should always generate inline code for these on x86.

Yes it does generate inline code.  https://godbolt.org/z/M45oo5rqT

GCC does it branchlessly, but cannot optimise based on context.

Clang can optimise based on context, except the 0 case it seems.

Moving to -march=i686 causes both GCC and Clang to switch to CMOV and
create branchless code, but is still GCC still can't optimise out the
CMOV based on context.

~Andrew
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by H. Peter Anvin 7 months, 3 weeks ago
On April 28, 2025 7:25:17 PM PDT, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>On 29/04/2025 3:00 am, H. Peter Anvin wrote:
>> On April 28, 2025 5:12:13 PM PDT, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>>> On 28/04/2025 10:38 pm, H. Peter Anvin wrote:
>>>> On April 28, 2025 9:14:45 AM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>>>> On Mon, 28 Apr 2025 at 00:05, Ingo Molnar <mingo@kernel.org> wrote:
>>>>>> And once we remove 486, I think we can do the optimization below to
>>>>>> just assume the output doesn't get clobbered by BS*L in the zero-case,
>>>>>> right?
>>>>> We probably can't, because who knows what "Pentium" CPU's are out there.
>>>>>
>>>>> Or even if Pentium really does get it right. I doubt we have any
>>>>> developers with an original Pentium around.
>>>>>
>>>>> So just leave the "we don't know what the CPU result is for zero"
>>>>> unless we get some kind of official confirmation.
>>>>>
>>>>>          Linus
>>>> If anyone knows for sure, it is probably Christian Ludloff. However, there was a *huge* tightening of the formal ISA when the i686 was introduced (family=6) and I really believe this was part of it.
>>>>
>>>> I also really don't trust that family=5 really means conforms to undocumented P5 behavior, e.g. for Quark.
>>> https://www.sandpile.org/x86/flags.htm
>>>
>>> That's a lot of "can't even characterise the result" in the P5.
>>>
>>> Looking at P4 column, that is clearly what the latest SDM has
>>> retroactively declared to be architectural.
>>>
>>> ~Andrew
>> Yes, but it wasn't about flags here. 
>>
>> Now, question: can we just use __builtin_*() for these? I think gcc should always generate inline code for these on x86.
>
>Yes it does generate inline code.  https://godbolt.org/z/M45oo5rqT
>
>GCC does it branchlessly, but cannot optimise based on context.
>
>Clang can optimise based on context, except the 0 case it seems.
>
>Moving to -march=i686 causes both GCC and Clang to switch to CMOV and
>create branchless code, but is still GCC still can't optimise out the
>CMOV based on context.
>
>~Andrew

Maybe a gcc bug report would be better than trying to hack around this in the kernel?
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Andrew Cooper 7 months, 3 weeks ago
On 29/04/2025 4:13 am, H. Peter Anvin wrote:
> On April 28, 2025 7:25:17 PM PDT, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 29/04/2025 3:00 am, H. Peter Anvin wrote:
>>> On April 28, 2025 5:12:13 PM PDT, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>>>> On 28/04/2025 10:38 pm, H. Peter Anvin wrote:
>>>>> On April 28, 2025 9:14:45 AM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>>>>> On Mon, 28 Apr 2025 at 00:05, Ingo Molnar <mingo@kernel.org> wrote:
>>>>>>> And once we remove 486, I think we can do the optimization below to
>>>>>>> just assume the output doesn't get clobbered by BS*L in the zero-case,
>>>>>>> right?
>>>>>> We probably can't, because who knows what "Pentium" CPU's are out there.
>>>>>>
>>>>>> Or even if Pentium really does get it right. I doubt we have any
>>>>>> developers with an original Pentium around.
>>>>>>
>>>>>> So just leave the "we don't know what the CPU result is for zero"
>>>>>> unless we get some kind of official confirmation.
>>>>>>
>>>>>>          Linus
>>>>> If anyone knows for sure, it is probably Christian Ludloff. However, there was a *huge* tightening of the formal ISA when the i686 was introduced (family=6) and I really believe this was part of it.
>>>>>
>>>>> I also really don't trust that family=5 really means conforms to undocumented P5 behavior, e.g. for Quark.
>>>> https://www.sandpile.org/x86/flags.htm
>>>>
>>>> That's a lot of "can't even characterise the result" in the P5.
>>>>
>>>> Looking at P4 column, that is clearly what the latest SDM has
>>>> retroactively declared to be architectural.
>>>>
>>>> ~Andrew
>>> Yes, but it wasn't about flags here. 
>>>
>>> Now, question: can we just use __builtin_*() for these? I think gcc should always generate inline code for these on x86.
>> Yes it does generate inline code.  https://godbolt.org/z/M45oo5rqT
>>
>> GCC does it branchlessly, but cannot optimise based on context.
>>
>> Clang can optimise based on context, except the 0 case it seems.
>>
>> Moving to -march=i686 causes both GCC and Clang to switch to CMOV and
>> create branchless code, but is still GCC still can't optimise out the
>> CMOV based on context.
>>
>> ~Andrew
> Maybe a gcc bug report would be better than trying to hack around this in the kernel?

I tried that.  (The thread started as a question around
__builtin_constant_p() but did grow to cover __builtin_ffs().)

https://gcc.gnu.org/pipermail/gcc/2024-March/243465.html

~Andrew
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Linus Torvalds 7 months, 3 weeks ago
On Tue, 29 Apr 2025 at 07:38, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> I tried that.  (The thread started as a question around
> __builtin_constant_p() but did grow to cover __builtin_ffs().)

Maybe we could do something like

   #define ffs(x) \
        (statically_true((x) != 0) ? __ffs(x)+1 : __builtin_ffs(x))

which uses our "statically_true()" helper that is actually fairly good
at the whole "let the compiler tell us that it knows that value cannot
be zero"

I didn't check what code that generated, but I've seen gcc do well on
that statically_true() thing in the past.

Then we can just remove our current variable_ffs() thing entirely,
because we now depend on our (good) __ffs() and the builtin being
"good enough" for the bad case.

(And do the same thing for fls() too, of course)

               Linus
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Andrew Cooper 7 months, 3 weeks ago
On 29/04/2025 7:05 pm, Linus Torvalds wrote:
> On Tue, 29 Apr 2025 at 07:38, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> I tried that.  (The thread started as a question around
>> __builtin_constant_p() but did grow to cover __builtin_ffs().)
> Maybe we could do something like
>
>    #define ffs(x) \
>         (statically_true((x) != 0) ? __ffs(x)+1 : __builtin_ffs(x))
>
> which uses our "statically_true()" helper that is actually fairly good
> at the whole "let the compiler tell us that it knows that value cannot
> be zero"
>
> I didn't check what code that generated, but I've seen gcc do well on
> that statically_true() thing in the past.
>
> Then we can just remove our current variable_ffs() thing entirely,
> because we now depend on our (good) __ffs() and the builtin being
> "good enough" for the bad case.

That would improve code generation for 32bit, but generally regress 64bit.

Preloading the destination register with -1 is better than the CMOV form
emitted by the builtin; BSF's habit of conditionally not writing the
destination register *is* a CMOV of sorts.


When I cleaned this up in Xen, there were several factors where I
thought improvements could be made.

Having both ffs() and __ffs(), where the latter is undefined in a common
case, is a trap waiting for an unwary programmer.  I have no particular
love for ffs() being off-by-one from normal, but is well defined for all
inputs.

Also, leaving the constant folding to the arch-optimised form means that
it often gets forgotten.  Therefore, I rearranged everything to have
this be common:

static always_inline attr_const unsigned int ffs(unsigned int x)
{
    if ( __builtin_constant_p(x) )
        return __builtin_ffs(x);

#ifdef arch_ffs
    return arch_ffs(x);
#else
    return generic_ffsl(x);
#endif
}

with most architectures implementing arch_ffs as:

#define arch_ffs(x) ((x) ? 1 + __builtin_ctz(x) : 0)

and x86 as:

static always_inline unsigned int arch_ffs(unsigned int x)
{
    unsigned int r;

    if ( __builtin_constant_p(x > 0) && x > 0 )
    {
        /*
         * A common code pattern is:
         *
         *     while ( bits )
         *     {
         *         bit = ffs(bits);
         *         ...
         *
         * and the optimiser really can work with the knowledge of x being
         * non-zero without knowing it's exact value, in which case we don't
         * need to compensate for BSF's corner cases.  Otherwise...
         */
        asm ( "bsf %[val], %[res]"
              : [res] "=r" (r)
              : [val] "rm" (x) );
    }
    else
    {
        /*
         * ... the AMD manual states that BSF won't modify the destination
         * register if x=0.  The Intel manual states that the result is
         * undefined, but the architects have said that the register is
         * written back with it's old value (zero extended as normal).
         */
        asm ( "bsf %[val], %[res]"
              : [res] "=r" (r)
              : [val] "rm" (x), "[res]" (-1) );
    }

    return r + 1;
}
#define arch_ffs arch_ffs

and finally, providing compatibility for the other forms as:

#define __ffs(x) (ffs(x) - 1)


The end result is fewer APIs to implement in arch-specific code, and the
removal of undefined behaviour.

That said, I don't envy anyone wanting to try and untangle this in
Linux, even if consensus were to agree on it as an approach.

~Andrew

Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Linus Torvalds 7 months, 3 weeks ago
On Tue, 29 Apr 2025 at 12:13, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> That would improve code generation for 32bit, but generally regress 64bit.
>
> Preloading the destination register with -1 is better than the CMOV form
> emitted by the builtin; BSF's habit of conditionally not writing the
> destination register *is* a CMOV of sorts.

Right you are. So we'd need to do this just for the x86-32 case. Oh
well. Ugly, but still prettier than what we have now, I guess.

         Linus
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by H. Peter Anvin 7 months, 3 weeks ago
On April 29, 2025 1:12:48 PM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>On Tue, 29 Apr 2025 at 12:13, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>>
>> That would improve code generation for 32bit, but generally regress 64bit.
>>
>> Preloading the destination register with -1 is better than the CMOV form
>> emitted by the builtin; BSF's habit of conditionally not writing the
>> destination register *is* a CMOV of sorts.
>
>Right you are. So we'd need to do this just for the x86-32 case. Oh
>well. Ugly, but still prettier than what we have now, I guess.
>
>         Linus

Could you file a gcc bug? Gcc shouldn't generate worse code than inline asm ...
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Linus Torvalds 7 months, 3 weeks ago
On Tue, 29 Apr 2025 at 14:24, H. Peter Anvin <hpa@zytor.com> wrote:
>
> Could you file a gcc bug? Gcc shouldn't generate worse code than inline asm ...

Honestly, I've given up on that idea.

That's basically what the bug report I reported about clts was - gcc
generating worse code than inline asm.

But why would we use gcc builtins at all if we have inline asm that is
better than what som eversions of gcc generates? The inline asm works
for all versions.

Anyway, attached is a test file that seems to generate basically
"optimal" code. It's not a kernel patch, but a standalone C file for
testing with a couple of stupid test-cases to make sure that it gets
the constant argument case right, and that it optimizes the "I know
it's not zero" case.

That is why it also uses that "#if __SIZEOF_LONG__ == 4" instead of
something like CONFIG_64BIT.

So it's a proof-of-concept thing, and it does seem to be fairly simple.

The "important" parts are just the

  #define variable_ffs(x) \
        (statically_true((x)!=0) ? __ffs(x)+1 : do_variable_ffs(x))
  #define constant_ffs(x) __builtin_ffs(x)
  #define ffs(x) \
        (__builtin_constant_p(x) ? constant_ffs(x) : variable_ffs(x))

lines which end up picking the right choice. The rest is either the
"reimplement the kernel infrastructure for testing" or the trivial
do_variable_ffs() thing.

I just did

    gcc -m32 -O2 -S -fomit-frame-pointer t.c

(with, and without that -m32) and looked at the result to see that it
looks sane. No *actual* testing.


                Linus
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Andrew Cooper 7 months, 3 weeks ago
On 29/04/2025 10:53 pm, Linus Torvalds wrote:
> I just did
>
>     gcc -m32 -O2 -S -fomit-frame-pointer t.c
>
> (with, and without that -m32) and looked at the result to see that it
> looks sane. No *actual* testing.

do_variable_ffs() doesn't quite work.

REP BSF is LZCNT, and unconditionally writes it's output operand, and
defeats the attempt to preload with -1.

Drop the REP prefix, and it should work as intended.

~Andrew
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Linus Torvalds 7 months, 3 weeks ago
On Tue, 29 Apr 2025 at 14:59, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> do_variable_ffs() doesn't quite work.
>
> REP BSF is LZCNT, and unconditionally writes it's output operand, and
> defeats the attempt to preload with -1.
>
> Drop the REP prefix, and it should work as intended.

Bah. That's what I get for just doing it blindly without actually
looking at the kernel source. I just copied the __ffs() thing - and
there the 'rep' is not for the zero case - which we don't care about -
but because lzcnt performs better on newer CPUs.

So you're obviously right.

            Linus
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Andrew Cooper 7 months, 3 weeks ago
On 29/04/2025 11:04 pm, Linus Torvalds wrote:
> On Tue, 29 Apr 2025 at 14:59, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> do_variable_ffs() doesn't quite work.
>>
>> REP BSF is LZCNT, and unconditionally writes it's output operand, and
>> defeats the attempt to preload with -1.
>>
>> Drop the REP prefix, and it should work as intended.
> Bah. That's what I get for just doing it blindly without actually
> looking at the kernel source. I just copied the __ffs() thing - and
> there the 'rep' is not for the zero case - which we don't care about -
> but because lzcnt performs better on newer CPUs.

Oh, I didn't realise there was also a perf difference too, but Agner Fog
agrees.

Apparently in Zen4, BSF and friends have become a single uop with a
sensible latency.  Previously they were 6-8 uops with a latency to match.

Intel appear to have have had them as a single uop since SandyBridge, so
quite a long time now.

~Andrew

Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Linus Torvalds 7 months, 3 weeks ago
On Tue, 29 Apr 2025 at 15:22, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>
> Oh, I didn't realise there was also a perf difference too, but Agner Fog
> agrees.

The perf difference is exactly because of the issue where the non-rep
one acts as a cmov, and has basically two inputs (the bits to test in
the source, and the old value of the result register)

I guess it's not "fundamental", but lzcnt is basically a bit simpler
for hardware to implement, and the non-rep legacy bsf instruction
basically has a dependency on the previous value of the result
register.

So even when it's a single uop for both cases, that single uop can be
slower for the bsf because of the (typically false) dependency and
extra pressure on the rename registers.

       Linus
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by H. Peter Anvin 7 months, 3 weeks ago
On April 29, 2025 3:04:30 PM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>On Tue, 29 Apr 2025 at 14:59, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>>
>> do_variable_ffs() doesn't quite work.
>>
>> REP BSF is LZCNT, and unconditionally writes it's output operand, and
>> defeats the attempt to preload with -1.
>>
>> Drop the REP prefix, and it should work as intended.
>
>Bah. That's what I get for just doing it blindly without actually
>looking at the kernel source. I just copied the __ffs() thing - and
>there the 'rep' is not for the zero case - which we don't care about -
>but because lzcnt performs better on newer CPUs.
>
>So you're obviously right.
>
>            Linus

Yeah, the encoding of lzcnt was a real mistake, because the outputs are different (so you still need instruction-specific postprocessing.)
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Linus Torvalds 7 months, 3 weeks ago
On Mon, 28 Apr 2025 at 19:00, H. Peter Anvin <hpa@zytor.com> wrote:
>
> Now, question: can we just use __builtin_*() for these? I think gcc should always generate inline code for these on x86.

Yeah, I think we can just use __builtin_ffs() directly and get rid of
all the games.

Not all gcc builtins are great: I did a bugzilla about gcc ctzl some time ago:

    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106471

but for ffs() that does sound like it's the simplest thing to do.

               Linus
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Ingo Molnar 7 months, 3 weeks ago
* Ingo Molnar <mingo@kernel.org> wrote:

> And once we remove 486, I think we can do the optimization below to 
> just assume the output doesn't get clobbered by BS*L in the 
> zero-case, right?
> 
> In the text size space it's a substantial optimization on x86-32 
> defconfig:
> 
>         text	   data	       bss	     dec	    hex	filename
>   16,577,728    7598826    1744896      25921450        18b87aa vmlinux.vanilla      # CMOV+BS*L
>   16,577,908	7598838	   1744896	25921642	18b886a	vmlinux.linus_patch  # if()+BS*L
>   16,573,568	7602922	   1744896	25921386	18b876a	vmlinux.noclobber    # BS*L

And BTW, *that* is a price that all of non-486 x86-32 was paying for 
486 support...

And, just out of intellectual curiosity, I also tried to measure the 
code generation price of the +1 standards-quirk in the fls()/ffs() 
interface as well:

         text	   data	       bss	     dec	    hex	filename
   16,577,728   7598826    1744896      25921450        18b87aa vmlinux.vanilla      # CMOV+BS*L
   16,577,908	7598838	   1744896	25921642	18b886a	vmlinux.linus_patch  # if()+BS*L
   16,573,568	7602922	   1744896	25921386	18b876a	vmlinux.noclobber    # BS*L
   ..........
   16,573,552	7602922	   1744896	25921370	18b875a	vmlinux.broken       # BROKEN: 0 baseline instead of 1

... and unless I messed up the patch, it seems to have a surprisingly 
low impact - maybe because the compiler can amortize its cost by 
adjusting all dependent code mostly at build time, so the +1 doesn't 
end up being generated most of the time?

Thanks,

	Ingo

===============================>

This broken patch is broken: it intentionally breaks the ffs()/fls() 
interface in an attempt to measure the code generation effects of 
interface details.

NOT-Signed-off-by: <anyone@anywhere.anytime>
---
 arch/x86/include/asm/bitops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index e3e94a806656..21707696bafe 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -318,7 +318,7 @@ static __always_inline int variable_ffs(int x)
 	    : "=r" (r)
 	    : ASM_INPUT_RM (x), "0" (-1));
 
-	return r + 1;
+	return r;
 }
 
 /**
@@ -362,7 +362,7 @@ static __always_inline int fls(unsigned int x)
 	    : "=r" (r)
 	    : ASM_INPUT_RM (x), "0" (-1));
 
-	return r + 1;
+	return r;
 }
 
 /**
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Linus Torvalds 7 months, 3 weeks ago
On Mon, 28 Apr 2025 at 00:14, Ingo Molnar <mingo@kernel.org> wrote:
>
> And, just out of intellectual curiosity, I also tried to measure the
> code generation price of the +1 standards-quirk in the fls()/ffs()
> interface as well:
>
> ... and unless I messed up the patch, it seems to have a surprisingly
> low impact - maybe because the compiler can amortize its cost by
> adjusting all dependent code mostly at build time, so the +1 doesn't
> end up being generated most of the time?

No, I think one issue is that most users actually end up subtracting
one from the return value of 'ffs()', because the "bit #0 returns 1"
semantics of the standard ffs() function really is insane.

It's not just that it doesn't match sane hardware, it's also that it
doesn't match sane *users*. If bit #0 is set, people want '0', so they
typically subtract 1.

So when you stop adding one, you aren't actually removing code -
you're often adding it.

Just see how many hits you get from

    git grep '\<ffs(.*).*-.*1'

which is obviously not a very precise pattern, but just look at the
output and see just *how* common that "subtract one" thing is.

I really don't understand how anybody *ever* thought that the whole
"return one bigger" was a good idea for ffs().

Sure, I understand that zero is special and needs a special return
value, but returning a negative value would have been pretty simple
(or just do what our bitops finding functions do, which is to return
past the end, which is often convenient but does tend to make the
error condition check a bit more complex).

Anyway, the fact that so many users subtract one means that your "look
at the size of the binary" model doesn't work. You're counting both
the wins (when that addition doesn't happen) and the losses (when the
"subtract one in the user" happens).

So the "+1" doesn't cause much code generation - as long as it's done
by the compiler that can also undo it - but it's just a horrid user
interface.

But maybe people really were poisoned by the Pascal mindset. Or maybe
it was invented by some ancient Roman who hadn't heard of the concept
of zero. Who knows?

               Linus
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Ingo Molnar 7 months, 3 weeks ago
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, 28 Apr 2025 at 00:14, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > And, just out of intellectual curiosity, I also tried to measure the
> > code generation price of the +1 standards-quirk in the fls()/ffs()
> > interface as well:
> >
> > ... and unless I messed up the patch, it seems to have a surprisingly
> > low impact - maybe because the compiler can amortize its cost by
> > adjusting all dependent code mostly at build time, so the +1 doesn't
> > end up being generated most of the time?
> 
> No, I think one issue is that most users actually end up subtracting
> one from the return value of 'ffs()', because the "bit #0 returns 1"
> semantics of the standard ffs() function really is insane.
> 
> It's not just that it doesn't match sane hardware, it's also that it
> doesn't match sane *users*. If bit #0 is set, people want '0', so they
> typically subtract 1.
> 
> So when you stop adding one, you aren't actually removing code -
> you're often adding it.
> 
> Just see how many hits you get from
> 
>     git grep '\<ffs(.*).*-.*1'
> 
> which is obviously not a very precise pattern, but just look at the
> output and see just *how* common that "subtract one" thing is.
> 
> I really don't understand how anybody *ever* thought that the whole
> "return one bigger" was a good idea for ffs().

Yeah. No argument from me that it's a badly thought out interface - I 
was just surprised that it doesn't seem to impact performance as badly 
as I expected. I have to add that a lot of work went into absorbing the 
negative effects of the ffs()/fls() interfaces:

  starship:~/tip> git grep -Ee '__ffs\(|__fls\(' | wc -l
  1055

So it impacts code quality negatively, which is arguably the worse side 
effect.

> But maybe people really were poisoned by the Pascal mindset. Or maybe 
> it was invented by some ancient Roman who hadn't heard of the concept 
> of zero. Who knows?

Hey, ancient Romans didn't even have the concept of *whitespaces* and 
punctuation to begin with:

    https://historyofinformation.com/images/Vergilius_Augusteus,_Georgica_121.jpg

Lazy stonemasons the lot of them.

Romans were the worst ever coders too I suspect. What have the Romans 
ever done for us??

	Ingo
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by H. Peter Anvin 7 months, 3 weeks ago
On April 29, 2025 3:08:03 AM PDT, Ingo Molnar <mingo@kernel.org> wrote:
>
>* Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> On Mon, 28 Apr 2025 at 00:14, Ingo Molnar <mingo@kernel.org> wrote:
>> >
>> > And, just out of intellectual curiosity, I also tried to measure the
>> > code generation price of the +1 standards-quirk in the fls()/ffs()
>> > interface as well:
>> >
>> > ... and unless I messed up the patch, it seems to have a surprisingly
>> > low impact - maybe because the compiler can amortize its cost by
>> > adjusting all dependent code mostly at build time, so the +1 doesn't
>> > end up being generated most of the time?
>> 
>> No, I think one issue is that most users actually end up subtracting
>> one from the return value of 'ffs()', because the "bit #0 returns 1"
>> semantics of the standard ffs() function really is insane.
>> 
>> It's not just that it doesn't match sane hardware, it's also that it
>> doesn't match sane *users*. If bit #0 is set, people want '0', so they
>> typically subtract 1.
>> 
>> So when you stop adding one, you aren't actually removing code -
>> you're often adding it.
>> 
>> Just see how many hits you get from
>> 
>>     git grep '\<ffs(.*).*-.*1'
>> 
>> which is obviously not a very precise pattern, but just look at the
>> output and see just *how* common that "subtract one" thing is.
>> 
>> I really don't understand how anybody *ever* thought that the whole
>> "return one bigger" was a good idea for ffs().
>
>Yeah. No argument from me that it's a badly thought out interface - I 
>was just surprised that it doesn't seem to impact performance as badly 
>as I expected. I have to add that a lot of work went into absorbing the 
>negative effects of the ffs()/fls() interfaces:
>
>  starship:~/tip> git grep -Ee '__ffs\(|__fls\(' | wc -l
>  1055
>
>So it impacts code quality negatively, which is arguably the worse side 
>effect.
>
>> But maybe people really were poisoned by the Pascal mindset. Or maybe 
>> it was invented by some ancient Roman who hadn't heard of the concept 
>> of zero. Who knows?
>
>Hey, ancient Romans didn't even have the concept of *whitespaces* and 
>punctuation to begin with:
>
>    https://historyofinformation.com/images/Vergilius_Augusteus,_Georgica_121.jpg
>
>Lazy stonemasons the lot of them.
>
>Romans were the worst ever coders too I suspect. What have the Romans 
>ever done for us??
>
>	Ingo

Well, they did build the roads... 🤣

Roman numerals obviously were not a positional system, but at least in accounting they used N for zero (nulla.)

ROMANI•ITE•DOMVM
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by H. Peter Anvin 7 months, 3 weeks ago
On April 28, 2025 12:14:40 AM PDT, Ingo Molnar <mingo@kernel.org> wrote:
>
>* Ingo Molnar <mingo@kernel.org> wrote:
>
>> And once we remove 486, I think we can do the optimization below to 
>> just assume the output doesn't get clobbered by BS*L in the 
>> zero-case, right?
>> 
>> In the text size space it's a substantial optimization on x86-32 
>> defconfig:
>> 
>>         text	   data	       bss	     dec	    hex	filename
>>   16,577,728    7598826    1744896      25921450        18b87aa vmlinux.vanilla      # CMOV+BS*L
>>   16,577,908	7598838	   1744896	25921642	18b886a	vmlinux.linus_patch  # if()+BS*L
>>   16,573,568	7602922	   1744896	25921386	18b876a	vmlinux.noclobber    # BS*L
>
>And BTW, *that* is a price that all of non-486 x86-32 was paying for 
>486 support...
>
>And, just out of intellectual curiosity, I also tried to measure the 
>code generation price of the +1 standards-quirk in the fls()/ffs() 
>interface as well:
>
>         text	   data	       bss	     dec	    hex	filename
>   16,577,728   7598826    1744896      25921450        18b87aa vmlinux.vanilla      # CMOV+BS*L
>   16,577,908	7598838	   1744896	25921642	18b886a	vmlinux.linus_patch  # if()+BS*L
>   16,573,568	7602922	   1744896	25921386	18b876a	vmlinux.noclobber    # BS*L
>   ..........
>   16,573,552	7602922	   1744896	25921370	18b875a	vmlinux.broken       # BROKEN: 0 baseline instead of 1
>
>... and unless I messed up the patch, it seems to have a surprisingly 
>low impact - maybe because the compiler can amortize its cost by 
>adjusting all dependent code mostly at build time, so the +1 doesn't 
>end up being generated most of the time?
>
>Thanks,
>
>	Ingo
>
>===============================>
>
>This broken patch is broken: it intentionally breaks the ffs()/fls() 
>interface in an attempt to measure the code generation effects of 
>interface details.
>
>NOT-Signed-off-by: <anyone@anywhere.anytime>
>---
> arch/x86/include/asm/bitops.h | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
>diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
>index e3e94a806656..21707696bafe 100644
>--- a/arch/x86/include/asm/bitops.h
>+++ b/arch/x86/include/asm/bitops.h
>@@ -318,7 +318,7 @@ static __always_inline int variable_ffs(int x)
> 	    : "=r" (r)
> 	    : ASM_INPUT_RM (x), "0" (-1));
> 
>-	return r + 1;
>+	return r;
> }
> 
> /**
>@@ -362,7 +362,7 @@ static __always_inline int fls(unsigned int x)
> 	    : "=r" (r)
> 	    : ASM_INPUT_RM (x), "0" (-1));
> 
>-	return r + 1;
>+	return r;
> }
> 
> /**

My recollection was that you can't assume that even for 586; that it is only safe for 686, but it has been a long time...
Re: [PATCH] bitops/32: Convert variable_ffs() and fls() zero-case handling to C
Posted by Arnd Bergmann 7 months, 3 weeks ago
On Mon, Apr 28, 2025, at 09:14, Ingo Molnar wrote:
>
> ... and unless I messed up the patch, it seems to have a surprisingly 
> low impact - maybe because the compiler can amortize its cost by 
> adjusting all dependent code mostly at build time, so the +1 doesn't 
> end up being generated most of the time?

Is there any reason we can't just use the compiler-builtins directly
like we do on other architectures, at least for 32-bit?

Looking at a couple of vmlinux objects confirms the  findings from
fdb6649ab7c1 ("x86/asm/bitops: Use __builtin_ctzl() to evaluate
constant expressions") from looking at the object file that using
the built-in helpers is slightly better than the current asm code
for all 32-bit targets with both gcc and clang. It's also better
for 64-bit targets with clang, but not with gcc, where the inline
asm often saves a cmov but in other cases the compiler finds an
even better instruction sequence.

     Arnd

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index eebbc8889e70..bdeae9a497e5 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -246,184 +246,18 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
 					  variable_test_bit(nr, addr);
 }
 
-static __always_inline unsigned long variable__ffs(unsigned long word)
-{
-	asm("tzcnt %1,%0"
-		: "=r" (word)
-		: ASM_INPUT_RM (word));
-	return word;
-}
-
-/**
- * __ffs - find first set bit in word
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-#define __ffs(word)				\
-	(__builtin_constant_p(word) ?		\
-	 (unsigned long)__builtin_ctzl(word) :	\
-	 variable__ffs(word))
-
-static __always_inline unsigned long variable_ffz(unsigned long word)
-{
-	return variable__ffs(~word);
-}
-
-/**
- * ffz - find first zero bit in word
- * @word: The word to search
- *
- * Undefined if no zero exists, so code should check against ~0UL first.
- */
-#define ffz(word)				\
-	(__builtin_constant_p(word) ?		\
-	 (unsigned long)__builtin_ctzl(~word) :	\
-	 variable_ffz(word))
-
-/*
- * __fls: find last set bit in word
- * @word: The word to search
- *
- * Undefined if no set bit exists, so code should check against 0 first.
- */
-static __always_inline unsigned long __fls(unsigned long word)
-{
-	if (__builtin_constant_p(word))
-		return BITS_PER_LONG - 1 - __builtin_clzl(word);
-
-	asm("bsr %1,%0"
-	    : "=r" (word)
-	    : ASM_INPUT_RM (word));
-	return word;
-}
-
 #undef ADDR
 
-#ifdef __KERNEL__
-static __always_inline int variable_ffs(int x)
-{
-	int r;
-
-#ifdef CONFIG_X86_64
-	/*
-	 * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
-	 * dest reg is undefined if x==0, but their CPU architect says its
-	 * value is written to set it to the same as before, except that the
-	 * top 32 bits will be cleared.
-	 *
-	 * We cannot do this on 32 bits because at the very least some
-	 * 486 CPUs did not behave this way.
-	 */
-	asm("bsfl %1,%0"
-	    : "=r" (r)
-	    : ASM_INPUT_RM (x), "0" (-1));
-#elif defined(CONFIG_X86_CMOV)
-	asm("bsfl %1,%0\n\t"
-	    "cmovzl %2,%0"
-	    : "=&r" (r) : "rm" (x), "r" (-1));
-#else
-	asm("bsfl %1,%0\n\t"
-	    "jnz 1f\n\t"
-	    "movl $-1,%0\n"
-	    "1:" : "=r" (r) : "rm" (x));
-#endif
-	return r + 1;
-}
-
-/**
- * ffs - find first set bit in word
- * @x: the word to search
- *
- * This is defined the same way as the libc and compiler builtin ffs
- * routines, therefore differs in spirit from the other bitops.
- *
- * ffs(value) returns 0 if value is 0 or the position of the first
- * set bit if value is nonzero. The first (least significant) bit
- * is at position 1.
- */
-#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))
-
-/**
- * fls - find last set bit in word
- * @x: the word to search
- *
- * This is defined in a similar way as the libc and compiler builtin
- * ffs, but returns the position of the most significant set bit.
- *
- * fls(value) returns 0 if value is 0 or the position of the last
- * set bit if value is nonzero. The last (most significant) bit is
- * at position 32.
- */
-static __always_inline int fls(unsigned int x)
-{
-	int r;
-
-	if (__builtin_constant_p(x))
-		return x ? 32 - __builtin_clz(x) : 0;
-
-#ifdef CONFIG_X86_64
-	/*
-	 * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
-	 * dest reg is undefined if x==0, but their CPU architect says its
-	 * value is written to set it to the same as before, except that the
-	 * top 32 bits will be cleared.
-	 *
-	 * We cannot do this on 32 bits because at the very least some
-	 * 486 CPUs did not behave this way.
-	 */
-	asm("bsrl %1,%0"
-	    : "=r" (r)
-	    : ASM_INPUT_RM (x), "0" (-1));
-#elif defined(CONFIG_X86_CMOV)
-	asm("bsrl %1,%0\n\t"
-	    "cmovzl %2,%0"
-	    : "=&r" (r) : "rm" (x), "rm" (-1));
-#else
-	asm("bsrl %1,%0\n\t"
-	    "jnz 1f\n\t"
-	    "movl $-1,%0\n"
-	    "1:" : "=r" (r) : "rm" (x));
-#endif
-	return r + 1;
-}
-
-/**
- * fls64 - find last set bit in a 64-bit word
- * @x: the word to search
- *
- * This is defined in a similar way as the libc and compiler builtin
- * ffsll, but returns the position of the most significant set bit.
- *
- * fls64(value) returns 0 if value is 0 or the position of the last
- * set bit if value is nonzero. The last (most significant) bit is
- * at position 64.
- */
-#ifdef CONFIG_X86_64
-static __always_inline int fls64(__u64 x)
-{
-	int bitpos = -1;
-
-	if (__builtin_constant_p(x))
-		return x ? 64 - __builtin_clzll(x) : 0;
-	/*
-	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
-	 * dest reg is undefined if x==0, but their CPU architect says its
-	 * value is written to set it to the same as before.
-	 */
-	asm("bsrq %1,%q0"
-	    : "+r" (bitpos)
-	    : ASM_INPUT_RM (x));
-	return bitpos + 1;
-}
-#else
+#include <asm-generic/bitops/__ffs.h>
+#include <asm-generic/bitops/ffz.h>
+#include <asm-generic/bitops/builtin-fls.h>
+#include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
-#endif
 
-#include <asm-generic/bitops/sched.h>
 
+#include <asm-generic/bitops/sched.h>
+#include <asm-generic/bitops/builtin-ffs.h>
 #include <asm/arch_hweight.h>
-
 #include <asm-generic/bitops/const_hweight.h>
 
 #include <asm-generic/bitops/instrumented-atomic.h>
@@ -434,5 +268,4 @@ static __always_inline int fls64(__u64 x)
 
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
 
-#endif /* __KERNEL__ */
 #endif /* _ASM_X86_BITOPS_H */