[PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension

[PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by cp0613@linux.alibaba.com 3 months, 2 weeks ago
From: Chen Pei <cp0613@linux.alibaba.com>

The RISC-V Zbb extension[1] defines bitwise rotation instructions,
which can be used to implement rotate related functions.

[1] https://github.com/riscv/riscv-bitmanip/

Signed-off-by: Chen Pei <cp0613@linux.alibaba.com>
---
 arch/riscv/include/asm/bitops.h | 172 ++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)

diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index d59310f74c2b..be247ef9e686 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -20,17 +20,20 @@
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/rotate.h>
 
 #else
 #define __HAVE_ARCH___FFS
 #define __HAVE_ARCH___FLS
 #define __HAVE_ARCH_FFS
 #define __HAVE_ARCH_FLS
+#define __HAVE_ARCH_ROTATE
 
 #include <asm-generic/bitops/__ffs.h>
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/rotate.h>
 
 #include <asm/alternative-macros.h>
 #include <asm/hwcap.h>
@@ -175,6 +178,175 @@ static __always_inline int variable_fls(unsigned int x)
 	 variable_fls(x_);					\
 })
 
+static __always_inline u64 variable_rol64(u64 word, unsigned int shift)
+{
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rol %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word) : "r" (word), "r" (shift) :);
+
+	return word;
+
+legacy:
+	return generic_rol64(word, shift);
+}
+
+static inline u64 variable_ror64(u64 word, unsigned int shift)
+{
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"ror %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word) : "r" (word), "r" (shift) :);
+
+	return word;
+
+legacy:
+	return generic_ror64(word, shift);
+}
+
+static inline u32 variable_rol32(u32 word, unsigned int shift)
+{
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rolw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word) : "r" (word), "r" (shift) :);
+
+	return word;
+
+legacy:
+	return generic_rol32(word, shift);
+}
+
+static inline u32 variable_ror32(u32 word, unsigned int shift)
+{
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rorw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word) : "r" (word), "r" (shift) :);
+
+	return word;
+
+legacy:
+	return generic_ror32(word, shift);
+}
+
+static inline u16 variable_rol16(u16 word, unsigned int shift)
+{
+	u32 word32 = ((u32)word << 16) | word;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rolw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word32) : "r" (word32), "r" (shift) :);
+
+	return (u16)word32;
+
+legacy:
+	return generic_rol16(word, shift);
+}
+
+static inline u16 variable_ror16(u16 word, unsigned int shift)
+{
+	u32 word32 = ((u32)word << 16) | word;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rorw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word32) : "r" (word32), "r" (shift) :);
+
+	return (u16)word32;
+
+legacy:
+	return generic_ror16(word, shift);
+}
+
+static inline u8 variable_rol8(u8 word, unsigned int shift)
+{
+	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rolw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word32) : "r" (word32), "r" (shift) :);
+
+	return (u8)word32;
+
+legacy:
+	return generic_rol8(word, shift);
+}
+
+static inline u8 variable_ror8(u8 word, unsigned int shift)
+{
+	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
+
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+				      RISCV_ISA_EXT_ZBB, 1)
+			  : : : : legacy);
+
+	asm volatile(
+		".option push\n"
+		".option arch,+zbb\n"
+		"rorw %0, %1, %2\n"
+		".option pop\n"
+		: "=r" (word32) : "r" (word32), "r" (shift) :);
+
+	return (u8)word32;
+
+legacy:
+	return generic_ror8(word, shift);
+}
+
+#define rol64(word, shift) variable_rol64(word, shift)
+#define ror64(word, shift) variable_ror64(word, shift)
+#define rol32(word, shift) variable_rol32(word, shift)
+#define ror32(word, shift) variable_ror32(word, shift)
+#define rol16(word, shift) variable_rol16(word, shift)
+#define ror16(word, shift) variable_ror16(word, shift)
+#define rol8(word, shift)  variable_rol8(word, shift)
+#define ror8(word, shift)  variable_ror8(word, shift)
+
 #endif /* !(defined(CONFIG_RISCV_ISA_ZBB) && defined(CONFIG_TOOLCHAIN_HAS_ZBB)) || defined(NO_ALTERNATIVE) */
 
 #include <asm-generic/bitops/ffz.h>
-- 
2.49.0
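
The pattern used by every helper in the patch is the same: an asm goto whose
ALTERNATIVE() defaults to a jump to the "legacy" label and is patched to a nop
at boot when the Zbb extension is detected, so execution falls through to the
Zbb instruction. As a rough behavioural sketch only (not from the patch; the
real code patches the branch once at boot instead of testing a flag, and
riscv_has_zbb() below is a hypothetical stand-in for that, with generic_ror32()
coming from the proposed <asm-generic/bitops/rotate.h>):
```
static inline u32 sketch_ror32(u32 word, unsigned int shift)
{
	if (!riscv_has_zbb())	/* hypothetical helper, for illustration only */
		return generic_ror32(word, shift);

	asm volatile(".option push\n"
		     ".option arch,+zbb\n"
		     "rorw %0, %1, %2\n"
		     ".option pop\n"
		     : "=r" (word) : "r" (word), "r" (shift));

	return word;
}
```
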
Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by Yury Norov 3 months, 2 weeks ago
On Fri, Jun 20, 2025 at 07:16:10PM +0800, cp0613@linux.alibaba.com wrote:
> From: Chen Pei <cp0613@linux.alibaba.com>
> 
> The RISC-V Zbb extension[1] defines bitwise rotation instructions,
> which can be used to implement rotate related functions.
> 
> [1] https://github.com/riscv/riscv-bitmanip/
> 
> Signed-off-by: Chen Pei <cp0613@linux.alibaba.com>
> ---
>  arch/riscv/include/asm/bitops.h | 172 ++++++++++++++++++++++++++++++++
>  1 file changed, 172 insertions(+)
> 
> diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
> index d59310f74c2b..be247ef9e686 100644
> --- a/arch/riscv/include/asm/bitops.h
> +++ b/arch/riscv/include/asm/bitops.h
> @@ -20,17 +20,20 @@
>  #include <asm-generic/bitops/__fls.h>
>  #include <asm-generic/bitops/ffs.h>
>  #include <asm-generic/bitops/fls.h>
> +#include <asm-generic/bitops/rotate.h>
>  
>  #else
>  #define __HAVE_ARCH___FFS
>  #define __HAVE_ARCH___FLS
>  #define __HAVE_ARCH_FFS
>  #define __HAVE_ARCH_FLS
> +#define __HAVE_ARCH_ROTATE
>  
>  #include <asm-generic/bitops/__ffs.h>
>  #include <asm-generic/bitops/__fls.h>
>  #include <asm-generic/bitops/ffs.h>
>  #include <asm-generic/bitops/fls.h>
> +#include <asm-generic/bitops/rotate.h>
>  
>  #include <asm/alternative-macros.h>
>  #include <asm/hwcap.h>
> @@ -175,6 +178,175 @@ static __always_inline int variable_fls(unsigned int x)
>  	 variable_fls(x_);					\
>  })

...

> +static inline u8 variable_ror8(u8 word, unsigned int shift)
> +{
> +	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;

Can you add a comment about what is happening here? Are you sure it's
optimized out in case of the 'legacy' alternative?

> +
> +	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
> +				      RISCV_ISA_EXT_ZBB, 1)
> +			  : : : : legacy);
> +
> +	asm volatile(
> +		".option push\n"
> +		".option arch,+zbb\n"
> +		"rorw %0, %1, %2\n"
> +		".option pop\n"
> +		: "=r" (word32) : "r" (word32), "r" (shift) :);
> +
> +	return (u8)word32;
> +
> +legacy:
> +	return generic_ror8(word, shift);
> +}
> +
> +#define rol64(word, shift) variable_rol64(word, shift)
> +#define ror64(word, shift) variable_ror64(word, shift)
> +#define rol32(word, shift) variable_rol32(word, shift)
> +#define ror32(word, shift) variable_ror32(word, shift)
> +#define rol16(word, shift) variable_rol16(word, shift)
> +#define ror16(word, shift) variable_ror16(word, shift)
> +#define rol8(word, shift)  variable_rol8(word, shift)
> +#define ror8(word, shift)  variable_ror8(word, shift)

Here you wire ror/rol() to the variable_ror/rol() unconditionally, and
that breaks compile-time rotation if the parameter is known at compile
time.

I believe, generic implementation will allow compiler to handle this
case better. Can you do a similar thing to what fls() does in the same
file?
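
For reference, an fls()-style dispatch for the rotate helpers could look
roughly like the sketch below (not code from the series; it assumes the
generic_ror32() helper from the proposed <asm-generic/bitops/rotate.h> can
itself be evaluated at compile time):
```
#define ror32(word, shift)						\
({									\
	u32 w_ = (word);						\
	unsigned int s_ = (shift);					\
	(__builtin_constant_p(w_) && __builtin_constant_p(s_)) ?	\
		generic_ror32(w_, s_) : variable_ror32(w_, s_);		\
})
```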

Thanks,
Yury

> +
>  #endif /* !(defined(CONFIG_RISCV_ISA_ZBB) && defined(CONFIG_TOOLCHAIN_HAS_ZBB)) || defined(NO_ALTERNATIVE) */
>  
>  #include <asm-generic/bitops/ffz.h>
> -- 
> 2.49.0
Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by cp0613@linux.alibaba.com 3 months, 1 week ago
On Fri, 20 Jun 2025 12:20:47 -0400, yury.norov@gmail.com wrote:

> Can you add a comment about what is happening here? Are you sure it's
> optimized out in case of the 'legacy' alternative?

Thank you for your review. Yes, I referred to the existing variable__fls()
implementation, which should be fine.

> Here you wire ror/rol() to the variable_ror/rol() unconditionally, and
> that breaks compile-time rotation if the parameter is known at compile
> time.
> 
> I believe, generic implementation will allow compiler to handle this
> case better. Can you do a similar thing to what fls() does in the same
> file?

I did consider it, but I did not find any toolchain that provides an
implementation similar to __builtin_ror or __builtin_rol. If there is one,
please help point it out.
In addition, I did not consider it carefully before. If the rotate function
is to be genericized, all arches need to include <asm-generic/bitops/rotate.h>.
I missed this step.
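
(Patch 1/2, which adds <asm-generic/bitops/rotate.h>, is not included in this
excerpt; presumably its generic_*() helpers are the existing rol/ror helpers
from include/linux/bitops.h renamed, roughly as in this sketch, with the 64-
and 16-bit variants analogous:)
```
static inline u32 generic_rol32(u32 word, unsigned int shift)
{
	return (word << (shift & 31)) | (word >> ((-shift) & 31));
}

static inline u32 generic_ror32(u32 word, unsigned int shift)
{
	return (word >> (shift & 31)) | (word << ((-shift) & 31));
}

static inline u8 generic_ror8(u8 word, unsigned int shift)
{
	return (word >> (shift & 7)) | (word << ((-shift) & 7));
}
```
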
Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by Yury Norov 3 months, 1 week ago
On Sat, Jun 28, 2025 at 07:13:57PM +0800, cp0613@linux.alibaba.com wrote:
> On Fri, 20 Jun 2025 12:20:47 -0400, yury.norov@gmail.com wrote:
> 
> > Can you add a comment about what is happening here? Are you sure it's
> > optimized out in case of the 'legacy' alternative?
> 
> Thank you for your review. Yes, I referred to the existing variable__fls()
> implementation, which should be fine.

No, it's not fine. Because you trimmed your original email completely,
so there's no way to understand what I'm asking about; and because you
didn't answer my question. So I'll ask again: what exactly are you doing
in the line you've trimmed out?
 
> > Here you wire ror/rol() to the variable_ror/rol() unconditionally, and
> > that breaks compile-time rotation if the parameter is known at compile
> > time.
> > 
> > I believe, generic implementation will allow compiler to handle this
> > case better. Can you do a similar thing to what fls() does in the same
> > file?
> 
> I did consider it, but I did not find any toolchain that provides an
> implementation similar to __builtin_ror or __builtin_rol. If there is one,
> please help point it out.

This is the example of the toolchain you're looking for:

  /**
   * rol64 - rotate a 64-bit value left
   * @word: value to rotate
   * @shift: bits to roll
   */
  static inline __u64 rol64(__u64 word, unsigned int shift)
  {
          return (word << (shift & 63)) | (word >> ((-shift) & 63));
  }

What I'm asking is: please show me that compile-time rol/ror is still
calculated at compile time, i.e. ror64(1234, 12) is evaluated at
compile time. 

> In addition, I did not consider it carefully before. If the rotate function
> is to be genericized, all arches need to include <asm-generic/bitops/rotate.h>.
> I missed this step.

Sorry, I'm lost here about what you've considered and what not. I'm OK
about accelerating ror/rol, but I want to make sure that:

1. The most trivial compile-case is actually evaluated at compile time; and
2. Any arch-specific code is well explained; and
3. legacy case optimized just as well as non-legacy.

Thanks,
Yury
Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by cp0613@linux.alibaba.com 3 months, 1 week ago
On Sat, 28 Jun 2025 21:48:02 -0400, yury.norov@gmail.com wrote:

> > > > +static inline u8 variable_ror8(u8 word, unsigned int shift)
> > > > +{
> > > > +	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
> > > 
> > > Can you add a comment about what is happening here? Are you sure it's
> > > optimized out in case of the 'legacy' alternative?
> > 
> > Thank you for your review. Yes, I referred to the existing variable__fls()
> > implementation, which should be fine.
> 
> No, it's not fine. Because you trimmed your original email completely,
> so there's no way to understand what I'm asking about; and because you
> didn't answer my question. So I'll ask again: what exactly are you doing
> in the line you've trimmed out?

Sorry, I misunderstood your question. I have now restored the trimmed context
from the original email. Here is my answer: the RISC-V Zbb extension only
provides 64-bit rotation instructions (rol/ror) and 32-bit rotation
instructions (rolw/rorw). Therefore, for 16-bit and 8-bit data, the value is
replicated across the 32-bit register so the rolw/rorw instructions can be
used, and the result is truncated back to the original width afterwards.
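
As a quick sanity check of that replication (a worked example, not from the
original mail): because the replicated 32-bit pattern repeats every 8 bits,
rotating the 32-bit value by any amount leaves the 8-bit rotate result in the
low byte:
```
#include <assert.h>
#include <stdint.h>

/* ror8 emulated with a plain 32-bit rotate of the replicated byte. */
static uint8_t ror8_via_ror32(uint8_t b, unsigned int shift)
{
	uint32_t w = 0x01010101u * b;	/* e.g. 0xab -> 0xabababab */

	w = (w >> (shift & 31)) | (w << ((-shift) & 31));
	return (uint8_t)w;		/* low byte is ror8(b, shift) */
}

int main(void)
{
	assert(ror8_via_ror32(0xab, 4) == 0xba);	/* 1010_1011 -> 1011_1010 */
	assert(ror8_via_ror32(0x12, 1) == 0x09);	/* 0001_0010 -> 0000_1001 */
	return 0;
}
```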

This data preparation does introduce additional operations. To compare against
the generic implementation, I used the web tool David pointed to.

The two functions that need to be compared are briefly summarized as follows:
```
unsigned char generic_ror8(unsigned char word, unsigned int shift)
{
	return (word >> (shift & 7)) | (word << ((-shift) & 7));
}

unsigned char zbb_opt_ror8(unsigned char word, unsigned int shift)
{
	unsigned int word32 = ((unsigned int)word << 24) | \
	    ((unsigned int)word << 16) | ((unsigned int)word << 8) | word;
#ifdef __riscv
	__asm__ volatile("nop"); // ALTERNATIVE(nop)

	__asm__ volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rorw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word32) : "r" (word32), "r" (shift) :);
#endif
	return (unsigned char)word32;
}
```
The disassembly obtained is:
```
generic_ror8:
    andi    a1,a1,7
    negw    a5,a1
    andi    a5,a5,7
    sllw    a5,a0,a5
    srlw    a0,a0,a1
    or      a0,a0,a5
    andi    a0,a0,0xff
    ret

zbb_opt_ror8:
    slli    a5,a0,8
    add     a0,a5,a0
    slliw   a5,a0,16
    addw    a5,a5,a0
    nop
    rorw a5, a5, a1
    andi    a0,a5,0xff
    ret
```
Looking at the total instruction count, zbb_opt_ror8 has one more instruction than
generic_ror8, but one of them is a nop, so the difference should be very small;
alternatively, the solution David suggested may be better for non-x86.

> > I did consider it, but I did not find any toolchain that provides an
> > implementation similar to __builtin_ror or __builtin_rol. If there is one,
> > please help point it out.
> 
> This is the example of the toolchain you're looking for:
> 
>   /**
>    * rol64 - rotate a 64-bit value left
>    * @word: value to rotate
>    * @shift: bits to roll
>    */
>   static inline __u64 rol64(__u64 word, unsigned int shift)
>   {
>           return (word << (shift & 63)) | (word >> ((-shift) & 63));
>   }
> 
> What I'm asking is: please show me that compile-time rol/ror is still
> calculated at compile time, i.e. ror64(1234, 12) is evaluated at
> compile time.

I see what you mean, I didn't consider the case of constants being evaluated
at compile time, as you pointed out earlier:
"you wire ror/rol() to the variable_ror/rol() unconditionally, and that breaks
compile-time rotation if the parameter is known at compile time."

In the absence of compiler built-in function support, I think it can be handled
like this:
```
#define rol16(word, shift) \
	(__builtin_constant_p(word) && __builtin_constant_p(shift) ? \
	generic_rol16(word, shift) : variable_rol16(word, shift))
```
What do you think?

> > In addition, I did not consider it carefully before. If the rotate function
> > is to be genericized, all arches need to include <asm-generic/bitops/rotate.h>.
> > I missed this step.
> 
> Sorry, I'm lost here about what you've considered and what not. I'm OK
> about accelerating ror/rol, but I want to make sure that:
> 
> 1. The most trivial compile-case is actually evaluated at compile time; and
> 2. Any arch-specific code is well explained; and
> 3. legacy case optimized just as well as non-legacy.

1. As in the above reply, use the generic implementation when compile-time evaluation
   is possible.
2. I will improve the comments later.
3. As mentioned before, only the 8-bit rotation is expected to see no gain; 16-bit
   and above see a significant improvement.

Thanks,
Pei

Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by Yury Norov 3 months, 1 week ago
On Mon, Jun 30, 2025 at 08:04:53PM +0800, cp0613@linux.alibaba.com wrote:
> On Sat, 28 Jun 2025 21:48:02 -0400, yury.norov@gmail.com wrote:
> 
> > > > > +static inline u8 variable_ror8(u8 word, unsigned int shift)
> > > > > +{
> > > > > +	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
> > > > 
> > > > Can you add a comment about what is happening here? Are you sure it's
> > > > optimized out in case of the 'legacy' alternative?
> > > 
> > > Thank you for your review. Yes, I referred to the existing variable__fls()
> > > implementation, which should be fine.
> > 
> > No, it's not fine. Because you trimmed your original email completely,
> > so there's no way to understand what I'm asking about; and because you
> > didn't answer my question. So I'll ask again: what exactly are you doing
> > in the line you've trimmed out?
> 
> Sorry, I misunderstood your question. I have now restored the trimmed context
> from the original email. Here is my answer: the RISC-V Zbb extension only
> provides 64-bit rotation instructions (rol/ror) and 32-bit rotation
> instructions (rolw/rorw). Therefore, for 16-bit and 8-bit data, the value is
> replicated across the 32-bit register so the rolw/rorw instructions can be
> used, and the result is truncated back to the original width afterwards.
> 
> This data preparation does introduce additional operations. To compare against
> the generic implementation, I used the web tool David pointed to.
> 
> The two functions that need to be compared are briefly summarized as follows:
> ```
> unsigned char generic_ror8(unsigned char word, unsigned int shift)
> {
> 	return (word >> (shift & 7)) | (word << ((-shift) & 7));
> }
> 
> unsigned char zbb_opt_ror8(unsigned char word, unsigned int shift)
> {
> 	unsigned int word32 = ((unsigned int)word << 24) | \
> 	    ((unsigned int)word << 16) | ((unsigned int)word << 8) | word;
> #ifdef __riscv
> 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> 
> 	__asm__ volatile(
> 		".option push\n"
> 		".option arch,+zbb\n"
> 		"rorw %0, %1, %2\n"
> 		".option pop\n"
> 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> #endif
> 	return (unsigned char)word32;
> }
> ```
> The disassembly obtained is:
> ```
> generic_ror8:
>     andi    a1,a1,7
>     negw    a5,a1
>     andi    a5,a5,7
>     sllw    a5,a0,a5
>     srlw    a0,a0,a1
>     or      a0,a0,a5
>     andi    a0,a0,0xff
>     ret
> 
> zbb_opt_ror8:
>     slli    a5,a0,8
>     add     a0,a5,a0
>     slliw   a5,a0,16
>     addw    a5,a5,a0
>     nop
>     rorw a5, a5, a1
>     andi    a0,a5,0xff
>     ret
> ```
> Looking at the total instruction count, zbb_opt_ror8 has one more instruction than
> generic_ror8, but one of them is a nop, so the difference should be very small;
> alternatively, the solution David suggested may be better for non-x86.

And what about performance?

> > > I did consider it, but I did not find any toolchain that provides an
> > > implementation similar to __builtin_ror or __builtin_rol. If there is one,
> > > please help point it out.
> > 
> > This is the example of the toolchain you're looking for:
> > 
> >   /**
> >    * rol64 - rotate a 64-bit value left
> >    * @word: value to rotate
> >    * @shift: bits to roll
> >    */
> >   static inline __u64 rol64(__u64 word, unsigned int shift)
> >   {
> >           return (word << (shift & 63)) | (word >> ((-shift) & 63));
> >   }
> > 
> > What I'm asking is: please show me that compile-time rol/ror is still
> > calculated at compile time, i.e. ror64(1234, 12) is evaluated at
> > compile time.
> 
> I see what you mean, I didn't consider the case of constants being evaluated
> at compile time, as you pointed out earlier:
> "you wire ror/rol() to the variable_ror/rol() unconditionally, and that breaks
> compile-time rotation if the parameter is known at compile time."
> 
> In the absence of compiler built-in function support, I think it can be handled
> like this:
> ```
> #define rol16(word, shift) \
> 	(__builtin_constant_p(word) && __builtin_constant_p(shift) ? \
> 	generic_rol16(word, shift) : variable_rol16(word, shift))
> ```
> What do you think?

That's what I meant.
 
> > > In addition, I did not consider it carefully before. If the rotate function
> > is to be genericized, all arches need to include <asm-generic/bitops/rotate.h>.
> > > I missed this step.
> > 
> > Sorry, I'm lost here about what you've considered and what not. I'm OK
> > about accelerating ror/rol, but I want to make sure that:
> > 
> > 1. The most trivial compile-case is actually evaluated at compile time; and
> > 2. Any arch-specific code is well explained; and
> > 3. legacy case optimized just as well as non-legacy.
> 
> 1. As in the above reply, use the generic implementation when compile-time evaluation
>    is possible.
> 2. I will improve the comments later.

I'm particularly interested in ror8/rol8 case:

        u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;

When you expand it to a 32-bit word and want to rotate, you obviously
need to copy the lower quarterword to the higher one:

        0xab -> 0xab0000ab

That way generic (u8)ror32(0xab, shift) would work. But I don't understand
why you copy the lower 8 bits to inner quarterwords. Is that a hardware
requirement? Can you point to any arch documentation?

> 3. As mentioned before, only the 8-bit rotation is expected to see no gain; 16-bit
>    and above see a significant improvement.

I asked you about the asm goto ("legacy") thing: you calculate that
complex word32 _before_ evaluating the goto. So this word32 may get
unused, and you waste cycles. I want to make sure this isn't the case.
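
For illustration (not the posted code), one way to guarantee the replication
never costs anything on the legacy path is to build word32 only after the asm
goto has selected the Zbb path:
```
static inline u8 variable_ror8(u8 word, unsigned int shift)
{
	u32 word32;

	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
			     RISCV_ISA_EXT_ZBB, 1)
		 : : : : legacy);

	/*
	 * Only reached on Zbb-capable CPUs: replicate the byte into all
	 * four lanes so rorw leaves ror8(word, shift) in the low byte.
	 */
	word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;

	asm volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rorw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word32) : "r" (word32), "r" (shift) :);

	return (u8)word32;

legacy:
	return generic_ror8(word, shift);
}
```
Whether the compiler already sinks that computation past the branch in the
posted version is exactly the open question; writing it this way just makes
the intent explicit.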

Please find attached a test for compile-time ror/rol evaluation.
Please consider prepending your series with it.

Thanks,
Yury

From 5c5be22117a2bd0a656efae0efd6ed159d4168c5 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 30 Jun 2025 12:07:47 -0400
Subject: [PATCH] bitops: add compile-time test for ror() and rol()

If the parameters of these functions are known at compile time, the compiler
must calculate the result at compile time as well.

Now that architectures introduce accelerated implementations for bit
rotation, we must make sure that they don't break compile-time
evaluation.

This patch adds a test for it, similarly to test_bitmap_const_eval().

Tested on x86_64.

Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
 lib/test_bitops.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/lib/test_bitops.c b/lib/test_bitops.c
index 55669624bb28..aa4c06df34f7 100644
--- a/lib/test_bitops.c
+++ b/lib/test_bitops.c
@@ -76,6 +76,56 @@ static int __init test_fns(void)
 	return 0;
 }
 
+static void __init test_bitops_const_eval(void)
+{
+	/*
+	 * ror/rol operations on parameters known at compile-time must be
+	 * optimized to compile-time constants on any supported optimization
+	 * level (-O2, -Os) and all architectures. Otherwise, trigger a build
+	 * bug.
+	 */
+
+	u64 r64 = ror64(0x1234567890abcdefull, 24);
+
+	BUILD_BUG_ON(!__builtin_constant_p(r64));
+	BUILD_BUG_ON(r64 != 0xabcdef1234567890ull);
+
+	u64 l64 = rol64(0x1234567890abcdefull, 24);
+
+	BUILD_BUG_ON(!__builtin_constant_p(l64));
+	BUILD_BUG_ON(l64 != 0x7890abcdef123456ull);
+
+	u32 r32 = ror32(0x12345678, 24);
+
+	BUILD_BUG_ON(!__builtin_constant_p(r32));
+	BUILD_BUG_ON(r32 != 0x34567812);
+
+	u32 l32 = rol32(0x12345678, 24);
+
+	BUILD_BUG_ON(!__builtin_constant_p(l32));
+	BUILD_BUG_ON(l32 != 0x78123456);
+
+	u16 r16 = ror16(0x1234, 12);
+
+	BUILD_BUG_ON(!__builtin_constant_p(r16));
+	BUILD_BUG_ON(r16 != 0x2341);
+
+	u16 l16 = rol16(0x1234, 12);
+
+	BUILD_BUG_ON(!__builtin_constant_p(l16));
+	BUILD_BUG_ON(l16 != 0x4123);
+
+	u8 r8 = ror8(0x12, 6);
+
+	BUILD_BUG_ON(!__builtin_constant_p(r8));
+	BUILD_BUG_ON(r8 != 0x48);
+
+	u8 l8 = rol8(0x12, 6);
+
+	BUILD_BUG_ON(!__builtin_constant_p(l8));
+	BUILD_BUG_ON(l8 != 0x84);
+}
+
 static int __init test_bitops_startup(void)
 {
 	int i, bit_set;
@@ -121,6 +171,7 @@ static int __init test_bitops_startup(void)
 		pr_err("ERROR: FOUND SET BIT %d\n", bit_set);
 
 	test_fns();
+	test_bitops_const_eval();
 
 	pr_info("Completed bitops test\n");
 
-- 
2.43.0

Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by David Laight 3 months, 2 weeks ago
On Fri, 20 Jun 2025 12:20:47 -0400
Yury Norov <yury.norov@gmail.com> wrote:

> On Fri, Jun 20, 2025 at 07:16:10PM +0800, cp0613@linux.alibaba.com wrote:
> > From: Chen Pei <cp0613@linux.alibaba.com>
> > 
> > The RISC-V Zbb extension[1] defines bitwise rotation instructions,
> > which can be used to implement rotate related functions.
> > 
> > [1] https://github.com/riscv/riscv-bitmanip/
> > 
> > Signed-off-by: Chen Pei <cp0613@linux.alibaba.com>
> > ---
> >  arch/riscv/include/asm/bitops.h | 172 ++++++++++++++++++++++++++++++++
> >  1 file changed, 172 insertions(+)
> > 
> > diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
> > index d59310f74c2b..be247ef9e686 100644
> > --- a/arch/riscv/include/asm/bitops.h
> > +++ b/arch/riscv/include/asm/bitops.h
> > @@ -20,17 +20,20 @@
> >  #include <asm-generic/bitops/__fls.h>
> >  #include <asm-generic/bitops/ffs.h>
> >  #include <asm-generic/bitops/fls.h>
> > +#include <asm-generic/bitops/rotate.h>
> >  
> >  #else
> >  #define __HAVE_ARCH___FFS
> >  #define __HAVE_ARCH___FLS
> >  #define __HAVE_ARCH_FFS
> >  #define __HAVE_ARCH_FLS
> > +#define __HAVE_ARCH_ROTATE
> >  
> >  #include <asm-generic/bitops/__ffs.h>
> >  #include <asm-generic/bitops/__fls.h>
> >  #include <asm-generic/bitops/ffs.h>
> >  #include <asm-generic/bitops/fls.h>
> > +#include <asm-generic/bitops/rotate.h>
> >  
> >  #include <asm/alternative-macros.h>
> >  #include <asm/hwcap.h>
> > @@ -175,6 +178,175 @@ static __always_inline int variable_fls(unsigned int x)
> >  	 variable_fls(x_);					\
> >  })  
> 
> ...
> 
> > +static inline u8 variable_ror8(u8 word, unsigned int shift)
> > +{
> > +	u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;  
> 
> Can you add a comment about what is happening here? Are you sure it's
> optimized out in case of the 'legacy' alternative?

Is it even a gain in the zbb case?
The "rorw" is only ever going to help full word rotates.
Here you might as well do ((word << 8 | word) >> shift).

For "rol8" you'd need ((word << 24 | word) 'rol' shift).
I still bet the generic code is faster (but see below).

Same for 16bit rotates.

Actually the generic version is (probably) horrid for everything except x86.
See https://www.godbolt.org/z/xTxYj57To

unsigned char rol(unsigned char v, unsigned int shift)
{
    return (v << 8 | v) << shift >> 8;
}

unsigned char ror(unsigned char v, unsigned int shift)
{
    return (v << 8 | v) >> shift;
}

	David
Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by cp0613@linux.alibaba.com 3 months, 1 week ago
On Wed, 25 Jun 2025 17:02:34 +0100, david.laight.linux@gmail.com wrote:

> Is it even a gain in the zbb case?
> The "rorw" is only ever going to help full word rotates.
> Here you might as well do ((word << 8 | word) >> shift).
> 
> For "rol8" you'd need ((word << 24 | word) 'rol' shift).
> I still bet the generic code is faster (but see below).
> 
> Same for 16bit rotates.
> 
> Actually the generic version is (probably) horrid for everything except x86.
> See https://www.godbolt.org/z/xTxYj57To

Thanks for your suggestion, this website is very inspiring. According to the
results, the generic version is indeed the most friendly to x86. I think this
is also a reason why other architectures should be optimized. Take the riscv64
ror32 implementation as an example, compare the number of assembly instructions
of the following two functions:
```
u32 zbb_opt_ror32(u32 word, unsigned int shift)
{
	asm volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rorw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word) : "r" (word), "r" (shift) :);

	return word;
}

u16 generic_ror32(u16 word, unsigned int shift)
{
	return (word >> (shift & 31)) | (word << ((-shift) & 31));
}
```
Their disassembly is:
```
zbb_opt_ror32:
<+0>:     addi    sp,sp,-16
<+2>:     sd      s0,0(sp)
<+4>:     sd      ra,8(sp)
<+6>:     addi    s0,sp,16
<+8>:     .insn   4, 0x60b5553b
<+12>:    ld      ra,8(sp)
<+14>:    ld      s0,0(sp)
<+16>:    sext.w  a0,a0
<+18>:    addi    sp,sp,16
<+20>:    ret

generic_ror32:
<+0>:     addi    sp,sp,-16
<+2>:     andi    a1,a1,31
<+4>:     sd      s0,0(sp)
<+6>:     sd      ra,8(sp)
<+8>:     addi    s0,sp,16
<+10>:    negw    a5,a1
<+14>:    sllw    a5,a0,a5
<+18>:    ld      ra,8(sp)
<+20>:    ld      s0,0(sp)
<+22>:    srlw    a0,a0,a1
<+26>:    or      a0,a0,a5
<+28>:    slli    a0,a0,0x30
<+30>:    srli    a0,a0,0x30
<+32>:    addi    sp,sp,16
<+34>:    ret
```
It can be found that the zbb optimized implementation uses fewer instructions,
even for 16-bit and 8-bit data.
Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by David Laight 3 months, 1 week ago
On Sat, 28 Jun 2025 20:08:16 +0800
cp0613@linux.alibaba.com wrote:

> On Wed, 25 Jun 2025 17:02:34 +0100, david.laight.linux@gmail.com wrote:
> 
> > Is it even a gain in the zbb case?
> > The "rorw" is only ever going to help full word rotates.
> > Here you might as well do ((word << 8 | word) >> shift).
> > 
> > For "rol8" you'd need ((word << 24 | word) 'rol' shift).
> > I still bet the generic code is faster (but see below).
> > 
> > Same for 16bit rotates.
> > 
> > Actually the generic version is (probably) horrid for everything except x86.
> > See https://www.godbolt.org/z/xTxYj57To  
> 
> Thanks for your suggestion, this website is very inspiring. According to the
> results, the generic version is indeed the most friendly to x86. I think this
> is also a reason why other architectures should be optimized. Take the riscv64
> ror32 implementation as an example, compare the number of assembly instructions
> of the following two functions:
> ```
> u32 zbb_opt_ror32(u32 word, unsigned int shift)
> {
> 	asm volatile(
> 		".option push\n"
> 		".option arch,+zbb\n"
> 		"rorw %0, %1, %2\n"
> 		".option pop\n"
> 		: "=r" (word) : "r" (word), "r" (shift) :);
> 
> 	return word;
> }
> 
> u16 generic_ror32(u16 word, unsigned int shift)
> {
> 	return (word >> (shift & 31)) | (word << ((-shift) & 31));
> }
> ```
> Their disassembly is:
> ```
> zbb_opt_ror32:
> <+0>:     addi    sp,sp,-16
> <+2>:     sd      s0,0(sp)
> <+4>:     sd      ra,8(sp)
> <+6>:     addi    s0,sp,16
> <+8>:     .insn   4, 0x60b5553b
> <+12>:    ld      ra,8(sp)
> <+14>:    ld      s0,0(sp)
> <+16>:    sext.w  a0,a0
> <+18>:    addi    sp,sp,16
> <+20>:    ret
> 
> generic_ror32:
> <+0>:     addi    sp,sp,-16
> <+2>:     andi    a1,a1,31
> <+4>:     sd      s0,0(sp)
> <+6>:     sd      ra,8(sp)
> <+8>:     addi    s0,sp,16
> <+10>:    negw    a5,a1
> <+14>:    sllw    a5,a0,a5
> <+18>:    ld      ra,8(sp)
> <+20>:    ld      s0,0(sp)
> <+22>:    srlw    a0,a0,a1
> <+26>:    or      a0,a0,a5
> <+28>:    slli    a0,a0,0x30
> <+30>:    srli    a0,a0,0x30
> <+32>:    addi    sp,sp,16
> <+34>:    ret
> ```
> It can be found that the zbb optimized implementation uses fewer instructions,
> even for 16-bit and 8-bit data.

Far too many register spills to stack.
I think you've forgotten to specify -O2

	David
Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by cp0613@linux.alibaba.com 3 months, 1 week ago
On Sun, 29 Jun 2025 11:38:40 +0100, david.laight.linux@gmail.com wrote:

> > It can be found that the zbb optimized implementation uses fewer instructions,
> > even for 16-bit and 8-bit data.
> 
> Far too many register spills to stack.
> I think you've forgotten to specify -O2

Yes, I extracted it from the vmlinux disassembly, without compiling with -O2, and
I used the web tool you provided as follows:
```
unsigned int generic_ror32(unsigned int word, unsigned int shift)
{
	return (word >> (shift & 31)) | (word << ((-shift) & 31));
}

unsigned int zbb_opt_ror32(unsigned int word, unsigned int shift)
{
#ifdef __riscv
	__asm__ volatile("nop"); // ALTERNATIVE(nop)

	__asm__ volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rorw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word) : "r" (word), "r" (shift) :);
#endif
	return word;
}

unsigned short generic_ror16(unsigned short word, unsigned int shift)
{
	return (word >> (shift & 15)) | (word << ((-shift) & 15));
}

unsigned short zbb_opt_ror16(unsigned short word, unsigned int shift)
{
	unsigned int word32 = ((unsigned int)word << 16) | word;
#ifdef __riscv
	__asm__ volatile("nop"); // ALTERNATIVE(nop)

	__asm__ volatile(
		".option push\n"
		".option arch,+zbb\n"
		"rorw %0, %1, %2\n"
		".option pop\n"
		: "=r" (word32) : "r" (word32), "r" (shift) :);
#endif
	return (unsigned short)word;
}
```
The disassembly obtained is:
```
generic_ror32:
    andi    a1,a1,31
    negw    a5,a1
    sllw    a5,a0,a5
    srlw    a0,a0,a1
    or      a0,a5,a0
    ret

zbb_opt_ror32:
    nop
    rorw a0, a0, a1
    sext.w  a0,a0
    ret

generic_ror16:
    andi    a1,a1,15
    negw    a5,a1
    andi    a5,a5,15
    sllw    a5,a0,a5
    srlw    a0,a0,a1
    or      a0,a0,a5
    slli    a0,a0,48
    srli    a0,a0,48
    ret

zbb_opt_ror16:
    slliw   a5,a0,16
    addw    a5,a5,a0
    nop
    rorw a5, a5, a1
    ret
```

Thanks,
Pei
Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by David Laight 3 months, 1 week ago
On Mon, 30 Jun 2025 20:14:30 +0800
cp0613@linux.alibaba.com wrote:

> On Sun, 29 Jun 2025 11:38:40 +0100, david.laight.linux@gmail.com wrote:
> 
> > > It can be found that the zbb optimized implementation uses fewer instructions,
> > > even for 16-bit and 8-bit data.  
> > 
> > Far too many register spills to stack.
> > I think you've forgotten to specify -O2  
> 
> Yes, I extracted it from the vmlinux disassembly, without compiling with -O2, and
> I used the web tool you provided as follows:
> ```
> unsigned int generic_ror32(unsigned int word, unsigned int shift)
> {
> 	return (word >> (shift & 31)) | (word << ((-shift) & 31));
> }
> 
> unsigned int zbb_opt_ror32(unsigned int word, unsigned int shift)
> {
> #ifdef __riscv
> 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> 
> 	__asm__ volatile(
> 		".option push\n"
> 		".option arch,+zbb\n"
> 		"rorw %0, %1, %2\n"
> 		".option pop\n"
> 		: "=r" (word) : "r" (word), "r" (shift) :);
> #endif
> 	return word;
> }
> 
> unsigned short generic_ror16(unsigned short word, unsigned int shift)
> {
> 	return (word >> (shift & 15)) | (word << ((-shift) & 15));
> }
> 
> unsigned short zbb_opt_ror16(unsigned short word, unsigned int shift)
> {
> 	unsigned int word32 = ((unsigned int)word << 16) | word;
> #ifdef __riscv
> 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> 
> 	__asm__ volatile(
> 		".option push\n"
> 		".option arch,+zbb\n"
> 		"rorw %0, %1, %2\n"
> 		".option pop\n"
> 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> #endif
> 	return (unsigned short)word;
> }
> ```
> The disassembly obtained is:
> ```
> generic_ror32:
>     andi    a1,a1,31

The compiler shouldn't be generating that mask.
After all it knows the negated value doesn't need the same mask.
(I'd guess the cpu just ignores the high bits of the shift - most do.)

>     negw    a5,a1
>     sllw    a5,a0,a5
>     srlw    a0,a0,a1
>     or      a0,a5,a0
>     ret
> 
> zbb_opt_ror32:
>     nop
>     rorw a0, a0, a1
>     sext.w  a0,a0

Is that a sign extend?
Why is it there?
If it is related to the (broken) 'feature' of riscv-64 that 32bit results
are sign extended, why isn't there one in the example above.

You also need to consider the code for non-zbb cpu.

>     ret
> 
> generic_ror16:
>     andi    a1,a1,15
>     negw    a5,a1
>     andi    a5,a5,15
>     sllw    a5,a0,a5
>     srlw    a0,a0,a1
>     or      a0,a0,a5
>     slli    a0,a0,48
>     srli    a0,a0,48

The last two instructions mask the result with 0xffff.
If that is necessary it is missing from the zbb version below.

>     ret
> 
> zbb_opt_ror16:
>     slliw   a5,a0,16
>     addw    a5,a5,a0

At this point you can just do a 'shift right' on all cpu.
For rol16 you can do a variable shift left and a 16 bit
shift right on all cpu.
If the zbb version ends up with a nop (as below) then it is
likely to be much the same speed.
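
In other words (a sketch of the suggestion above, with made-up names, not code
from the thread): once the halfword is replicated into a 32-bit register,
ror16 reduces to a plain right shift and rol16 to a variable left shift
followed by a fixed 16-bit right shift, so no rotate instruction is needed on
any CPU:
```
static inline unsigned short sketch_ror16(unsigned short word, unsigned int shift)
{
	unsigned int w = ((unsigned int)word << 16) | word;

	return (unsigned short)(w >> (shift & 15));
}

static inline unsigned short sketch_rol16(unsigned short word, unsigned int shift)
{
	unsigned int w = ((unsigned int)word << 16) | word;

	return (unsigned short)((w << (shift & 15)) >> 16);
}
```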

	David

>     nop
>     rorw a5, a5, a1
>     ret
> ```
> 
> Thanks,
> Pei
Re: [PATCH 2/2] bitops: rotate: Add riscv implementation using Zbb extension
Posted by cp0613@linux.alibaba.com 3 months, 1 week ago
On Mon, 30 Jun 2025 18:35:34 +0100, david.laight.linux@gmail.com wrote:

> > On Sun, 29 Jun 2025 11:38:40 +0100, david.laight.linux@gmail.com wrote:
> > 
> > > > It can be found that the zbb optimized implementation uses fewer instructions,
> > > > even for 16-bit and 8-bit data.  
> > > 
> > > Far too many register spills to stack.
> > > I think you've forgotten to specify -O2  
> > 
> > Yes, I extracted it from the vmlinux disassembly, without compiling with -O2, and
> > I used the web tool you provided as follows:
> > ```
> > unsigned int generic_ror32(unsigned int word, unsigned int shift)
> > {
> > 	return (word >> (shift & 31)) | (word << ((-shift) & 31));
> > }
> > 
> > unsigned int zbb_opt_ror32(unsigned int word, unsigned int shift)
> > {
> > #ifdef __riscv
> > 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> > 
> > 	__asm__ volatile(
> > 		".option push\n"
> > 		".option arch,+zbb\n"
> > 		"rorw %0, %1, %2\n"
> > 		".option pop\n"
> > 		: "=r" (word) : "r" (word), "r" (shift) :);
> > #endif
> > 	return word;
> > }
> > 
> > unsigned short generic_ror16(unsigned short word, unsigned int shift)
> > {
> > 	return (word >> (shift & 15)) | (word << ((-shift) & 15));
> > }
> > 
> > unsigned short zbb_opt_ror16(unsigned short word, unsigned int shift)
> > {
> > 	unsigned int word32 = ((unsigned int)word << 16) | word;
> > #ifdef __riscv
> > 	__asm__ volatile("nop"); // ALTERNATIVE(nop)
> > 
> > 	__asm__ volatile(
> > 		".option push\n"
> > 		".option arch,+zbb\n"
> > 		"rorw %0, %1, %2\n"
> > 		".option pop\n"
> > 		: "=r" (word32) : "r" (word32), "r" (shift) :);
> > #endif
> > 	return (unsigned short)word;
> > }
> > ```
> > The disassembly obtained is:
> > ```
> > generic_ror32:
> >     andi    a1,a1,31
> 
> The compiler shouldn't be generating that mask.
> After all it knows the negated value doesn't need the same mask.
> (I'd guess the cpu just ignores the high bits of the shift - most do.)
> 
> >     negw    a5,a1
> >     sllw    a5,a0,a5
> >     srlw    a0,a0,a1
> >     or      a0,a5,a0
> >     ret
> > 
> > zbb_opt_ror32:
> >     nop
> >     rorw a0, a0, a1
> >     sext.w  a0,a0
> 
> Is that a sign extend?
> Why is it there?
> If it is related to the (broken) 'feature' of riscv-64 that 32bit results
> are sign extended, why isn't there one in the example above.
> 
> You also need to consider the code for non-zbb cpu.
> 
> >     ret
> > 
> > generic_ror16:
> >     andi    a1,a1,15
> >     negw    a5,a1
> >     andi    a5,a5,15
> >     sllw    a5,a0,a5
> >     srlw    a0,a0,a1
> >     or      a0,a0,a5
> >     slli    a0,a0,48
> >     srli    a0,a0,48
> 
> The last two instructions mask the result with 0xffff.
> If that is necessary it is missing from the zbb version below.
> 
> >     ret
> > 
> > zbb_opt_ror16:
> >     slliw   a5,a0,16
> >     addw    a5,a5,a0
> 
> At this point you can just do a 'shift right' on all cpu.
> For rol16 you can do a variable shift left and a 16 bit
> shift right on all cpu.
> If the zbb version ends up with a nop (as below) then it is
> likely to be much the same speed.
> 
> 	David
> 
> >     nop
> >     rorw a5, a5, a1
> >     ret
> > ```

Sorry, let me reply to all of the above in one place. I did not check the
correctness of the assembly above; I only generated it with the web tool you
pointed to earlier. In fact, I think disassembling from vmlinux is closer to
the actual situation. In addition, the code here is simplified; the complete
implementation does take into account processors that do not support Zbb or
do not have it enabled.