From: Chen Pei <cp0613@linux.alibaba.com>
The RISC-V Zbb extension[1] defines bitwise rotation instructions,
which can be used to implement rotate-related functions.
[1] https://github.com/riscv/riscv-bitmanip/
Signed-off-by: Chen Pei <cp0613@linux.alibaba.com>
---
arch/riscv/include/asm/bitops.h | 172 ++++++++++++++++++++++++++++++++
1 file changed, 172 insertions(+)
diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index d59310f74c2b..be247ef9e686 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -20,17 +20,20 @@
#include <asm-generic/bitops/__fls.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/rotate.h>
#else
#define __HAVE_ARCH___FFS
#define __HAVE_ARCH___FLS
#define __HAVE_ARCH_FFS
#define __HAVE_ARCH_FLS
+#define __HAVE_ARCH_ROTATE
#include <asm-generic/bitops/__ffs.h>
#include <asm-generic/bitops/__fls.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/rotate.h>
#include <asm/alternative-macros.h>
#include <asm/hwcap.h>
@@ -175,6 +178,175 @@ static __always_inline int variable_fls(unsigned int x)
variable_fls(x_); \
})
+static __always_inline u64 variable_rol64(u64 word, unsigned int shift)
+{
+ asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ : : : : legacy);
+
+ asm volatile(
+ ".option push\n"
+ ".option arch,+zbb\n"
+ "rol %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (word) : "r" (word), "r" (shift) :);
+
+ return word;
+
+legacy:
+ return generic_rol64(word, shift);
+}
+
+static inline u64 variable_ror64(u64 word, unsigned int shift)
+{
+ asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ : : : : legacy);
+
+ asm volatile(
+ ".option push\n"
+ ".option arch,+zbb\n"
+ "ror %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (word) : "r" (word), "r" (shift) :);
+
+ return word;
+
+legacy:
+ return generic_ror64(word, shift);
+}
+
+static inline u32 variable_rol32(u32 word, unsigned int shift)
+{
+ asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ : : : : legacy);
+
+ asm volatile(
+ ".option push\n"
+ ".option arch,+zbb\n"
+ "rolw %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (word) : "r" (word), "r" (shift) :);
+
+ return word;
+
+legacy:
+ return generic_rol32(word, shift);
+}
+
+static inline u32 variable_ror32(u32 word, unsigned int shift)
+{
+ asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ : : : : legacy);
+
+ asm volatile(
+ ".option push\n"
+ ".option arch,+zbb\n"
+ "rorw %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (word) : "r" (word), "r" (shift) :);
+
+ return word;
+
+legacy:
+ return generic_ror32(word, shift);
+}
+
+static inline u16 variable_rol16(u16 word, unsigned int shift)
+{
+ u32 word32 = ((u32)word << 16) | word;
+
+ asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ : : : : legacy);
+
+ asm volatile(
+ ".option push\n"
+ ".option arch,+zbb\n"
+ "rolw %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (word32) : "r" (word32), "r" (shift) :);
+
+ return (u16)word32;
+
+legacy:
+ return generic_rol16(word, shift);
+}
+
+static inline u16 variable_ror16(u16 word, unsigned int shift)
+{
+ u32 word32 = ((u32)word << 16) | word;
+
+ asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ : : : : legacy);
+
+ asm volatile(
+ ".option push\n"
+ ".option arch,+zbb\n"
+ "rorw %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (word32) : "r" (word32), "r" (shift) :);
+
+ return (u16)word32;
+
+legacy:
+ return generic_ror16(word, shift);
+}
+
+static inline u8 variable_rol8(u8 word, unsigned int shift)
+{
+ u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
+
+ asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ : : : : legacy);
+
+ asm volatile(
+ ".option push\n"
+ ".option arch,+zbb\n"
+ "rolw %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (word32) : "r" (word32), "r" (shift) :);
+
+ return (u8)word32;
+
+legacy:
+ return generic_rol8(word, shift);
+}
+
+static inline u8 variable_ror8(u8 word, unsigned int shift)
+{
+ u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word;
+
+ asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+ RISCV_ISA_EXT_ZBB, 1)
+ : : : : legacy);
+
+ asm volatile(
+ ".option push\n"
+ ".option arch,+zbb\n"
+ "rorw %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (word32) : "r" (word32), "r" (shift) :);
+
+ return (u8)word32;
+
+legacy:
+ return generic_ror8(word, shift);
+}
+
+#define rol64(word, shift) variable_rol64(word, shift)
+#define ror64(word, shift) variable_ror64(word, shift)
+#define rol32(word, shift) variable_rol32(word, shift)
+#define ror32(word, shift) variable_ror32(word, shift)
+#define rol16(word, shift) variable_rol16(word, shift)
+#define ror16(word, shift) variable_ror16(word, shift)
+#define rol8(word, shift) variable_rol8(word, shift)
+#define ror8(word, shift) variable_ror8(word, shift)
+
#endif /* !(defined(CONFIG_RISCV_ISA_ZBB) && defined(CONFIG_TOOLCHAIN_HAS_ZBB)) || defined(NO_ALTERNATIVE) */
#include <asm-generic/bitops/ffz.h>
--
2.49.0
On Fri, Jun 20, 2025 at 07:16:10PM +0800, cp0613@linux.alibaba.com wrote: > From: Chen Pei <cp0613@linux.alibaba.com> > > The RISC-V Zbb extension[1] defines bitwise rotation instructions, > which can be used to implement rotate related functions. > > [1] https://github.com/riscv/riscv-bitmanip/ > > Signed-off-by: Chen Pei <cp0613@linux.alibaba.com> > --- > arch/riscv/include/asm/bitops.h | 172 ++++++++++++++++++++++++++++++++ > 1 file changed, 172 insertions(+) > > diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h > index d59310f74c2b..be247ef9e686 100644 > --- a/arch/riscv/include/asm/bitops.h > +++ b/arch/riscv/include/asm/bitops.h > @@ -20,17 +20,20 @@ > #include <asm-generic/bitops/__fls.h> > #include <asm-generic/bitops/ffs.h> > #include <asm-generic/bitops/fls.h> > +#include <asm-generic/bitops/rotate.h> > > #else > #define __HAVE_ARCH___FFS > #define __HAVE_ARCH___FLS > #define __HAVE_ARCH_FFS > #define __HAVE_ARCH_FLS > +#define __HAVE_ARCH_ROTATE > > #include <asm-generic/bitops/__ffs.h> > #include <asm-generic/bitops/__fls.h> > #include <asm-generic/bitops/ffs.h> > #include <asm-generic/bitops/fls.h> > +#include <asm-generic/bitops/rotate.h> > > #include <asm/alternative-macros.h> > #include <asm/hwcap.h> > @@ -175,6 +178,175 @@ static __always_inline int variable_fls(unsigned int x) > variable_fls(x_); \ > }) ... > +static inline u8 variable_ror8(u8 word, unsigned int shift) > +{ > + u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word; Can you add a comment about what is happening here? Are you sure it's optimized out in case of the 'legacy' alternative? > + > + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, > + RISCV_ISA_EXT_ZBB, 1) > + : : : : legacy); > + > + asm volatile( > + ".option push\n" > + ".option arch,+zbb\n" > + "rorw %0, %1, %2\n" > + ".option pop\n" > + : "=r" (word32) : "r" (word32), "r" (shift) :); > + > + return (u8)word32; > + > +legacy: > + return generic_ror8(word, shift); > +} > + > +#define rol64(word, shift) variable_rol64(word, shift) > +#define ror64(word, shift) variable_ror64(word, shift) > +#define rol32(word, shift) variable_rol32(word, shift) > +#define ror32(word, shift) variable_ror32(word, shift) > +#define rol16(word, shift) variable_rol16(word, shift) > +#define ror16(word, shift) variable_ror16(word, shift) > +#define rol8(word, shift) variable_rol8(word, shift) > +#define ror8(word, shift) variable_ror8(word, shift) Here you wire ror/rol() to the variable_ror/rol() unconditionally, and that breaks compile-time rotation if the parameter is known at compile time. I believe, generic implementation will allow compiler to handle this case better. Can you do a similar thing to what fls() does in the same file? Thanks, Yury > + > #endif /* !(defined(CONFIG_RISCV_ISA_ZBB) && defined(CONFIG_TOOLCHAIN_HAS_ZBB)) || defined(NO_ALTERNATIVE) */ > > #include <asm-generic/bitops/ffz.h> > -- > 2.49.0
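For readers without the file at hand, the fls() pattern Yury refers to dispatches on __builtin_constant_p(), so constant inputs are folded by the compiler and only runtime values reach the ALTERNATIVE-patched helper. A paraphrased sketch of that wrapper (not a verbatim quote of arch/riscv/include/asm/bitops.h):

```c
/*
 * Paraphrased sketch of the existing fls() wrapper in this file:
 * constant inputs are computed with a plain builtin expression at
 * compile time, everything else goes through the runtime-patched
 * variable_fls().
 */
#define fls(x)							\
({								\
	typeof(x) x_ = (x);					\
	__builtin_constant_p(x_) ?				\
		(int)((x_ != 0) ? (32 - __builtin_clz(x_)) : 0) :	\
		variable_fls(x_);				\
})
```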
On Fri, 20 Jun 2025 12:20:47 -0400, yury.norov@gmail.com wrote: > Can you add a comment about what is happening here? Are you sure it's > optimized out in case of the 'legacy' alternative? Thank you for your review. Yes, I referred to the existing variable__fls() implementation, which should be fine. > Here you wire ror/rol() to the variable_ror/rol() unconditionally, and > that breaks compile-time rotation if the parameter is known at compile > time. > > I believe, generic implementation will allow compiler to handle this > case better. Can you do a similar thing to what fls() does in the same > file? I did consider it, but I did not find any toolchain that provides an implementation similar to __builtin_ror or __builtin_rol. If there is one, please point it out. In addition, I did not consider this carefully before: if the rotate functions are to be genericized, all architectures need to include <asm-generic/bitops/rotate.h>. I missed this step.
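For context, the generic_*() fallbacks used throughout the patch come from the new <asm-generic/bitops/rotate.h> added earlier in this series, which is not shown in this thread. A minimal sketch of the assumed shape of those helpers, mirroring the existing rol64()/ror64() in include/linux/bitops.h (the header contents and naming are assumptions here; the remaining widths follow the same pattern):

```c
/*
 * Sketch only: assumed shape of the generic fallbacks referenced by the
 * patch; the actual asm-generic/bitops/rotate.h from the series may differ.
 */
static inline u32 generic_ror32(u32 word, unsigned int shift)
{
	/* Masking keeps both shift counts within the type width. */
	return (word >> (shift & 31)) | (word << ((-shift) & 31));
}

static inline u32 generic_rol32(u32 word, unsigned int shift)
{
	return (word << (shift & 31)) | (word >> ((-shift) & 31));
}

static inline u8 generic_ror8(u8 word, unsigned int shift)
{
	return (word >> (shift & 7)) | (word << ((-shift) & 7));
}
```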
On Sat, Jun 28, 2025 at 07:13:57PM +0800, cp0613@linux.alibaba.com wrote: > On Fri, 20 Jun 2025 12:20:47 -0400, yury.norov@gmail.com wrote: > > > Can you add a comment about what is happening here? Are you sure it's > > optimized out in case of the 'legacy' alternative? > > Thank you for your review. Yes, I referred to the existing variable__fls() > implementation, which should be fine. No, it's not fine. Because you trimmed your original email completely, so there's no way to understand what I'm asking about; and because you didn't answer my question. So I'll ask again: what exactly you are doing in the line you've trimmed out? > > Here you wire ror/rol() to the variable_ror/rol() unconditionally, and > > that breaks compile-time rotation if the parameter is known at compile > > time. > > > > I believe, generic implementation will allow compiler to handle this > > case better. Can you do a similar thing to what fls() does in the same > > file? > > I did consider it, but I did not find any toolchain that provides an > implementation similar to __builtin_ror or __builtin_rol. If there is one, > please help point it out. This is the example of the toolchain you're looking for: /** * rol64 - rotate a 64-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u64 rol64(__u64 word, unsigned int shift) { return (word << (shift & 63)) | (word >> ((-shift) & 63)); } What I'm asking is: please show me that compile-time rol/ror is still calculated at compile time, i.e. ror64(1234, 12) is evaluated at compile time. > In addition, I did not consider it carefully before. If the rotate function > is to be genericized, all archneed to include <asm-generic/bitops/rotate.h>. > I missed this step. Sorry, I'm lost here about what you've considered and what not. I'm OK about accelerating ror/rol, but I want to make sure that; 1. The most trivial compile-case is actually evaluated at compile time; and 2. Any arch-specific code is well explained; and 3. legacy case optimized just as well as non-legacy. Thanks, Yury
On Sat, 28 Jun 2025 21:48:02 -0400, yury.norov@gmail.com wrote: > > > > +static inline u8 variable_ror8(u8 word, unsigned int shift) > > > > +{ > > > > + u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word; > > > > > > Can you add a comment about what is happening here? Are you sure it's > > > optimized out in case of the 'legacy' alternative? > > > > Thank you for your review. Yes, I referred to the existing variable__fls() > > implementation, which should be fine. > > No, it's not fine. Because you trimmed your original email completely, > so there's no way to understand what I'm asking about; and because you > didn't answer my question. So I'll ask again: what exactly you are doing > in the line you've trimmed out? Sorry, I misunderstood your question. I have now restored the original email context that was trimmed; here is my answer. The RISC-V Zbb extension only provides the 64-bit rotation instructions rol/ror and the 32-bit rotation instructions rolw/rorw. Therefore, for 16-bit and 8-bit data, in order to use rolw/rorw, the value is replicated to fill a 32-bit word, and the result is truncated back to the original width afterwards. This data preparation does introduce additional operations. To compare it with the generic implementation, I used the web tool David provided. The two functions to compare are, briefly: ``` unsigned char generic_ror8(unsigned char word, unsigned int shift) { return (word >> (shift & 7)) | (word << ((-shift) & 7)); } unsigned char zbb_opt_ror8(unsigned char word, unsigned int shift) { unsigned int word32 = ((unsigned int)word << 24) | \ ((unsigned int)word << 16) | ((unsigned int)word << 8) | word; #ifdef __riscv __asm__ volatile("nop"); // ALTERNATIVE(nop) __asm__ volatile( ".option push\n" ".option arch,+zbb\n" "rorw %0, %1, %2\n" ".option pop\n" : "=r" (word32) : "r" (word32), "r" (shift) :); #endif return (unsigned char)word32; } ``` The disassembly obtained is: ``` generic_ror8: andi a1,a1,7 negw a5,a1 andi a5,a5,7 sllw a5,a0,a5 srlw a0,a0,a1 or a0,a0,a5 andi a0,a0,0xff ret zbb_opt_ror8: slli a5,a0,8 add a0,a5,a0 slliw a5,a0,16 addw a5,a5,a0 nop rorw a5, a5, a1 andi a0,a5,0xff ret ``` Counting instructions, zbb_opt_ror8 has one more than generic_ror8, but one of them is a nop, so the difference should be very small; alternatively, the approach David suggested may be better for non-x86. > > I did consider it, but I did not find any toolchain that provides an > > implementation similar to __builtin_ror or __builtin_rol. If there is one, > > please help point it out. > > This is the example of the toolchain you're looking for: > > /** > * rol64 - rotate a 64-bit value left > * @word: value to rotate > * @shift: bits to roll > */ > static inline __u64 rol64(__u64 word, unsigned int shift) > { > return (word << (shift & 63)) | (word >> ((-shift) & 63)); > } > > What I'm asking is: please show me that compile-time rol/ror is still > calculated at compile time, i.e. ror64(1234, 12) is evaluated at > compile time. I see what you mean; I didn't consider the case of constants being evaluated at compile time, as you pointed out earlier: "you wire ror/rol() to the variable_ror/rol() unconditionally, and that breaks compile-time rotation if the parameter is known at compile time."
In the absence of compiler built-in function support, I think it can be handled like this: ``` #define rol16(word, shift) \ (__builtin_constant_p(word) && __builtin_constant_p(shift) ? \ generic_ror16(word, shift) : variable_rol16(word, shift)) ``` What do you think? > > In addition, I did not consider it carefully before. If the rotate function > > is to be genericized, all archneed to include <asm-generic/bitops/rotate.h>. > > I missed this step. > > Sorry, I'm lost here about what you've considered and what not. I'm OK > about accelerating ror/rol, but I want to make sure that; > > 1. The most trivial compile-case is actually evaluated at compile time; and > 2. Any arch-specific code is well explained; and > 3. legacy case optimized just as well as non-legacy. 1. As in the reply above, use the generic implementation when compile-time evaluation is possible. 2. I will improve the comments later. 3. As mentioned before, only the 8-bit rotation should see no optimization benefit; 16-bit and above show a significant improvement. Thanks, Pei
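For concreteness, the same dispatch written out consistently (the snippet above pairs rol16() with generic_ror16(), presumably a slip for generic_rol16()), in the statement-expression style of the existing fls() wrapper so each argument is evaluated only once. This is an illustrative sketch, not the final patch:

```c
/*
 * Sketch only: constant arguments are folded through the plain C generic
 * helper at compile time; runtime values take the Zbb-accelerated
 * variable_rol16() path.
 */
#define rol16(word, shift)						\
({									\
	u16 word_ = (word);						\
	unsigned int shift_ = (shift);					\
	(__builtin_constant_p(word_) && __builtin_constant_p(shift_)) ?	\
		generic_rol16(word_, shift_) :				\
		variable_rol16(word_, shift_);				\
})
```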
On Mon, Jun 30, 2025 at 08:04:53PM +0800, cp0613@linux.alibaba.com wrote: > On Sat, 28 Jun 2025 21:48:02 -0400, yury.norov@gmail.com wrote: > > > > > > +static inline u8 variable_ror8(u8 word, unsigned int shift) > > > > > +{ > > > > > + u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word; > > > > > > > > Can you add a comment about what is happening here? Are you sure it's > > > > optimized out in case of the 'legacy' alternative? > > > > > > Thank you for your review. Yes, I referred to the existing variable__fls() > > > implementation, which should be fine. > > > > No, it's not fine. Because you trimmed your original email completely, > > so there's no way to understand what I'm asking about; and because you > > didn't answer my question. So I'll ask again: what exactly you are doing > > in the line you've trimmed out? > > Sorry, I misunderstood your question. Now I have made up for the lost original > email. This is my answer. The RISC-V Zbb extension only provides 64-bit data > rotation instructions rol/ror and 32-bit data rotation instructions rolw/rorw. > Therefore, for 16-bit and 8-bit data, in order to use the rolw/rorw instruction > optimization, the data is cyclically spliced here, and the corresponding number > of bits is truncated after processing to achieve the function. > > This data preparation process does introduce additional operations. Compared with > genneric's implementation, I use the web tool provided by David to illustrate. > > The two functions that need to be compared are briefly summarized as follows: > ``` > unsigned char generic_ror8(unsigned char word, unsigned int shift) > { > return (word >> (shift & 7)) | (word << ((-shift) & 7)); > } > > unsigned char zbb_opt_ror8(unsigned char word, unsigned int shift) > { > unsigned int word32 = ((unsigned int)word << 24) | \ > ((unsigned int)word << 16) | ((unsigned int)word << 8) | word; > #ifdef __riscv > __asm__ volatile("nop"); // ALTERNATIVE(nop) > > __asm__ volatile( > ".option push\n" > ".option arch,+zbb\n" > "rorw %0, %1, %2\n" > ".option pop\n" > : "=r" (word32) : "r" (word32), "r" (shift) :); > #endif > return (unsigned char)word32; > } > ``` > The disassembly obtained is: > ``` > generic_ror8: > andi a1,a1,7 > negw a5,a1 > andi a5,a5,7 > sllw a5,a0,a5 > srlw a0,a0,a1 > or a0,a0,a5 > andi a0,a0,0xff > ret > > zbb_opt_ror8: > slli a5,a0,8 > add a0,a5,a0 > slliw a5,a0,16 > addw a5,a5,a0 > nop > rorw a5, a5, a1 > andi a0,a5,0xff > ret > ``` > From the perspective of the total number of instructions, although zbb_opt_ror8 has > one more instruction, one of them is a nop, so the difference with generic_ror8 should > be very small, or using the solution provided by David would be better for non-x86. And what about performance? > > > I did consider it, but I did not find any toolchain that provides an > > > implementation similar to __builtin_ror or __builtin_rol. If there is one, > > > please help point it out. > > > > This is the example of the toolchain you're looking for: > > > > /** > > * rol64 - rotate a 64-bit value left > > * @word: value to rotate > > * @shift: bits to roll > > */ > > static inline __u64 rol64(__u64 word, unsigned int shift) > > { > > return (word << (shift & 63)) | (word >> ((-shift) & 63)); > > } > > > > What I'm asking is: please show me that compile-time rol/ror is still > > calculated at compile time, i.e. ror64(1234, 12) is evaluated at > > compile time. 
> > I see what you mean, I didn't consider the case of constants being evaluated > at compile time, as you pointed out earlier: > "you wire ror/rol() to the variable_ror/rol() unconditionally, and that breaks > compile-time rotation if the parameter is known at compile time." > > In the absence of compiler built-in function support, I think it can be handled > like this: > ``` > #define rol16(word, shift) \ > (__builtin_constant_p(word) && __builtin_constant_p(shift) ? \ > generic_ror16(word, shift) : variable_rol16(word, shift)) > ``` > How do you see? That's what I meant. > > > In addition, I did not consider it carefully before. If the rotate function > > > is to be genericized, all archneed to include <asm-generic/bitops/rotate.h>. > > > I missed this step. > > > > Sorry, I'm lost here about what you've considered and what not. I'm OK > > about accelerating ror/rol, but I want to make sure that; > > > > 1. The most trivial compile-case is actually evaluated at compile time; and > > 2. Any arch-specific code is well explained; and > > 3. legacy case optimized just as well as non-legacy. > > 1. As in the above reply, use the generic implementation when compile-time evaluation > is possible。 > 2. I will improve the comments later. I'm particularly interested in ror8/rol8 case: u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word; When you expand it to 32-bit word, and want to rotate, you obviously need to copy lower quarterword to the higher one: 0xab >> 0xab0000ab That way generic (u8)ror32(0xab, shift) would work. But I don't understand why you copy the lower 8 bits to inner quarterwords. Is that a hardware requirement? Can you point to any arch documentation > 3. As mentioned before, only 8-bit rotation should have no optimization effect, and > 16-bit and above will have significant optimization. I asked you about the asm goto ("legacy") thing: you calculate that complex word32 _before_ evaluating the goto. So this word32 may get unused, and you waste cycles. I want to make sure this isn't the case. Please find attached a test for compile-time ror/rol evaluation. Please consider prepending your series with it. Thanks, Yury From 5c5be22117a2bd0a656efae0efd6ed159d4168c5 Mon Sep 17 00:00:00 2001 From: Yury Norov <yury.norov@gmail.com> Date: Mon, 30 Jun 2025 12:07:47 -0400 Subject: [PATCH] bitops: add compile-time test for ror() and rol() If parameters for the functions are passed at compile time, the compiler must calculate the result at compile time, as well. Now that architectures introduce accelerated implementations for bit rotation, we must make sure that they don't break compile-time evaluation. This patch adds a test for it, similarly to test_bitmap_const_eval(). Tested on x86_64. Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com> --- lib/test_bitops.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/lib/test_bitops.c b/lib/test_bitops.c index 55669624bb28..aa4c06df34f7 100644 --- a/lib/test_bitops.c +++ b/lib/test_bitops.c @@ -76,6 +76,56 @@ static int __init test_fns(void) return 0; } +static void __init test_bitops_const_eval(void) +{ + /* + * ror/rol operations on parameters known at compile-time must be + * optimized to compile-time constants on any supported optimization + * level (-O2, -Os) and all architectures. Otherwise, trigger a build + * bug. 
+ */ + + u64 r64 = ror64(0x1234567890abcdefull, 24); + + BUILD_BUG_ON(!__builtin_constant_p(r64)); + BUILD_BUG_ON(r64 != 0xabcdef1234567890ull); + + u64 l64 = rol64(0x1234567890abcdefull, 24); + + BUILD_BUG_ON(!__builtin_constant_p(l64)); + BUILD_BUG_ON(l64 != 0x7890abcdef123456ull); + + u32 r32 = ror32(0x12345678, 24); + + BUILD_BUG_ON(!__builtin_constant_p(r32)); + BUILD_BUG_ON(r32 != 0x34567812); + + u32 l32 = rol32(0x12345678, 24); + + BUILD_BUG_ON(!__builtin_constant_p(l32)); + BUILD_BUG_ON(l32 != 0x78123456); + + u16 r16 = ror16(0x1234, 12); + + BUILD_BUG_ON(!__builtin_constant_p(r16)); + BUILD_BUG_ON(r16 != 0x2341); + + u16 l16 = rol16(0x1234, 12); + + BUILD_BUG_ON(!__builtin_constant_p(l16)); + BUILD_BUG_ON(l16 != 0x4123); + + u8 r8 = ror8(0x12, 6); + + BUILD_BUG_ON(!__builtin_constant_p(r8)); + BUILD_BUG_ON(r8 != 0x48); + + u8 l8 = rol8(0x12, 6); + + BUILD_BUG_ON(!__builtin_constant_p(l8)); + BUILD_BUG_ON(l8 != 0x84); +} + static int __init test_bitops_startup(void) { int i, bit_set; @@ -121,6 +171,7 @@ static int __init test_bitops_startup(void) pr_err("ERROR: FOUND SET BIT %d\n", bit_set); test_fns(); + test_bitops_const_eval(); pr_info("Completed bitops test\n"); -- 2.43.0
On Fri, 20 Jun 2025 12:20:47 -0400 Yury Norov <yury.norov@gmail.com> wrote: > On Fri, Jun 20, 2025 at 07:16:10PM +0800, cp0613@linux.alibaba.com wrote: > > From: Chen Pei <cp0613@linux.alibaba.com> > > > > The RISC-V Zbb extension[1] defines bitwise rotation instructions, > > which can be used to implement rotate related functions. > > > > [1] https://github.com/riscv/riscv-bitmanip/ > > > > Signed-off-by: Chen Pei <cp0613@linux.alibaba.com> > > --- > > arch/riscv/include/asm/bitops.h | 172 ++++++++++++++++++++++++++++++++ > > 1 file changed, 172 insertions(+) > > > > diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h > > index d59310f74c2b..be247ef9e686 100644 > > --- a/arch/riscv/include/asm/bitops.h > > +++ b/arch/riscv/include/asm/bitops.h > > @@ -20,17 +20,20 @@ > > #include <asm-generic/bitops/__fls.h> > > #include <asm-generic/bitops/ffs.h> > > #include <asm-generic/bitops/fls.h> > > +#include <asm-generic/bitops/rotate.h> > > > > #else > > #define __HAVE_ARCH___FFS > > #define __HAVE_ARCH___FLS > > #define __HAVE_ARCH_FFS > > #define __HAVE_ARCH_FLS > > +#define __HAVE_ARCH_ROTATE > > > > #include <asm-generic/bitops/__ffs.h> > > #include <asm-generic/bitops/__fls.h> > > #include <asm-generic/bitops/ffs.h> > > #include <asm-generic/bitops/fls.h> > > +#include <asm-generic/bitops/rotate.h> > > > > #include <asm/alternative-macros.h> > > #include <asm/hwcap.h> > > @@ -175,6 +178,175 @@ static __always_inline int variable_fls(unsigned int x) > > variable_fls(x_); \ > > }) > > ... > > > +static inline u8 variable_ror8(u8 word, unsigned int shift) > > +{ > > + u32 word32 = ((u32)word << 24) | ((u32)word << 16) | ((u32)word << 8) | word; > > Can you add a comment about what is happening here? Are you sure it's > optimized out in case of the 'legacy' alternative? Is it even a gain in the zbb case? The "rorw" is only ever going to help full word rotates. Here you might as well do ((word << 8 | word) >> shift). For "rol8" you'd need ((word << 24 | word) 'rol' shift). I still bet the generic code is faster (but see below). Same for 16bit rotates. Actually the generic version is (probably) horrid for everything except x86. See https://www.godbolt.org/z/xTxYj57To unsigned char rol(unsigned char v, unsigned int shift) { return (v << 8 | v) << shift >> 8; } unsigned char ror(unsigned char v, unsigned int shift) { return (v << 8 | v) >> shift; } David
On Wed, 25 Jun 2025 17:02:34 +0100, david.laight.linux@gmail.com wrote: > Is it even a gain in the zbb case? > The "rorw" is only ever going to help full word rotates. > Here you might as well do ((word << 8 | word) >> shift). > > For "rol8" you'd need ((word << 24 | word) 'rol' shift). > I still bet the generic code is faster (but see below). > > Same for 16bit rotates. > > Actually the generic version is (probably) horrid for everything except x86. > See https://www.godbolt.org/z/xTxYj57To Thanks for your suggestion, this website is very inspiring. According to the results, the generic version is indeed the most friendly to x86. I think this is also a reason why other architectures should be optimized. Take the riscv64 ror32 implementation as an example, compare the number of assembly instructions of the following two functions: ``` u32 zbb_opt_ror32(u32 word, unsigned int shift) { asm volatile( ".option push\n" ".option arch,+zbb\n" "rorw %0, %1, %2\n" ".option pop\n" : "=r" (word) : "r" (word), "r" (shift) :); return word; } u16 generic_ror32(u16 word, unsigned int shift) { return (word >> (shift & 31)) | (word << ((-shift) & 31)); } ``` Their disassembly is: ``` zbb_opt_ror32: <+0>: addi sp,sp,-16 <+2>: sd s0,0(sp) <+4>: sd ra,8(sp) <+6>: addi s0,sp,16 <+8>: .insn 4, 0x60b5553b <+12>: ld ra,8(sp) <+14>: ld s0,0(sp) <+16>: sext.w a0,a0 <+18>: addi sp,sp,16 <+20>: ret generic_ror32: <+0>: addi sp,sp,-16 <+2>: andi a1,a1,31 <+4>: sd s0,0(sp) <+6>: sd ra,8(sp) <+8>: addi s0,sp,16 <+10>: negw a5,a1 <+14>: sllw a5,a0,a5 <+18>: ld ra,8(sp) <+20>: ld s0,0(sp) <+22>: srlw a0,a0,a1 <+26>: or a0,a0,a5 <+28>: slli a0,a0,0x30 <+30>: srli a0,a0,0x30 <+32>: addi sp,sp,16 <+34>: ret ``` It can be found that the zbb optimized implementation uses fewer instructions, even for 16-bit and 8-bit data.
On Sat, 28 Jun 2025 20:08:16 +0800 cp0613@linux.alibaba.com wrote: > On Wed, 25 Jun 2025 17:02:34 +0100, david.laight.linux@gmail.com wrote: > > > Is it even a gain in the zbb case? > > The "rorw" is only ever going to help full word rotates. > > Here you might as well do ((word << 8 | word) >> shift). > > > > For "rol8" you'd need ((word << 24 | word) 'rol' shift). > > I still bet the generic code is faster (but see below). > > > > Same for 16bit rotates. > > > > Actually the generic version is (probably) horrid for everything except x86. > > See https://www.godbolt.org/z/xTxYj57To > > Thanks for your suggestion, this website is very inspiring. According to the > results, the generic version is indeed the most friendly to x86. I think this > is also a reason why other architectures should be optimized. Take the riscv64 > ror32 implementation as an example, compare the number of assembly instructions > of the following two functions: > ``` > u32 zbb_opt_ror32(u32 word, unsigned int shift) > { > asm volatile( > ".option push\n" > ".option arch,+zbb\n" > "rorw %0, %1, %2\n" > ".option pop\n" > : "=r" (word) : "r" (word), "r" (shift) :); > > return word; > } > > u16 generic_ror32(u16 word, unsigned int shift) > { > return (word >> (shift & 31)) | (word << ((-shift) & 31)); > } > ``` > Their disassembly is: > ``` > zbb_opt_ror32: > <+0>: addi sp,sp,-16 > <+2>: sd s0,0(sp) > <+4>: sd ra,8(sp) > <+6>: addi s0,sp,16 > <+8>: .insn 4, 0x60b5553b > <+12>: ld ra,8(sp) > <+14>: ld s0,0(sp) > <+16>: sext.w a0,a0 > <+18>: addi sp,sp,16 > <+20>: ret > > generic_ror32: > <+0>: addi sp,sp,-16 > <+2>: andi a1,a1,31 > <+4>: sd s0,0(sp) > <+6>: sd ra,8(sp) > <+8>: addi s0,sp,16 > <+10>: negw a5,a1 > <+14>: sllw a5,a0,a5 > <+18>: ld ra,8(sp) > <+20>: ld s0,0(sp) > <+22>: srlw a0,a0,a1 > <+26>: or a0,a0,a5 > <+28>: slli a0,a0,0x30 > <+30>: srli a0,a0,0x30 > <+32>: addi sp,sp,16 > <+34>: ret > ``` > It can be found that the zbb optimized implementation uses fewer instructions, > even for 16-bit and 8-bit data. Far too many register spills to stack. I think you've forgotten to specify -O2 David
On Sun, 29 Jun 2025 11:38:40 +0100, david.laight.linux@gmail.com wrote: > > It can be found that the zbb optimized implementation uses fewer instructions, > > even for 16-bit and 8-bit data. > > Far too many register spills to stack. > I think you've forgotten to specify -O2 Yes, I extracted it from the vmlinux disassembly, without compiling with -O2, and I used the web tool you provided as follows: ``` unsigned int generic_ror32(unsigned int word, unsigned int shift) { return (word >> (shift & 31)) | (word << ((-shift) & 31)); } unsigned int zbb_opt_ror32(unsigned int word, unsigned int shift) { #ifdef __riscv __asm__ volatile("nop"); // ALTERNATIVE(nop) __asm__ volatile( ".option push\n" ".option arch,+zbb\n" "rorw %0, %1, %2\n" ".option pop\n" : "=r" (word) : "r" (word), "r" (shift) :); #endif return word; } unsigned short generic_ror16(unsigned short word, unsigned int shift) { return (word >> (shift & 15)) | (word << ((-shift) & 15)); } unsigned short zbb_opt_ror16(unsigned short word, unsigned int shift) { unsigned int word32 = ((unsigned int)word << 16) | word; #ifdef __riscv __asm__ volatile("nop"); // ALTERNATIVE(nop) __asm__ volatile( ".option push\n" ".option arch,+zbb\n" "rorw %0, %1, %2\n" ".option pop\n" : "=r" (word32) : "r" (word32), "r" (shift) :); #endif return (unsigned short)word; } ``` The disassembly obtained is: ``` generic_ror32: andi a1,a1,31 negw a5,a1 sllw a5,a0,a5 srlw a0,a0,a1 or a0,a5,a0 ret zbb_opt_ror32: nop rorw a0, a0, a1 sext.w a0,a0 ret generic_ror16: andi a1,a1,15 negw a5,a1 andi a5,a5,15 sllw a5,a0,a5 srlw a0,a0,a1 or a0,a0,a5 slli a0,a0,48 srli a0,a0,48 ret zbb_opt_ror16: slliw a5,a0,16 addw a5,a5,a0 nop rorw a5, a5, a1 ret ``` Thanks, Pei
On Mon, 30 Jun 2025 20:14:30 +0800 cp0613@linux.alibaba.com wrote: > On Sun, 29 Jun 2025 11:38:40 +0100, david.laight.linux@gmail.com wrote: > > > > It can be found that the zbb optimized implementation uses fewer instructions, > > > even for 16-bit and 8-bit data. > > > > Far too many register spills to stack. > > I think you've forgotten to specify -O2 > > Yes, I extracted it from the vmlinux disassembly, without compiling with -O2, and > I used the web tool you provided as follows: > ``` > unsigned int generic_ror32(unsigned int word, unsigned int shift) > { > return (word >> (shift & 31)) | (word << ((-shift) & 31)); > } > > unsigned int zbb_opt_ror32(unsigned int word, unsigned int shift) > { > #ifdef __riscv > __asm__ volatile("nop"); // ALTERNATIVE(nop) > > __asm__ volatile( > ".option push\n" > ".option arch,+zbb\n" > "rorw %0, %1, %2\n" > ".option pop\n" > : "=r" (word) : "r" (word), "r" (shift) :); > #endif > return word; > } > > unsigned short generic_ror16(unsigned short word, unsigned int shift) > { > return (word >> (shift & 15)) | (word << ((-shift) & 15)); > } > > unsigned short zbb_opt_ror16(unsigned short word, unsigned int shift) > { > unsigned int word32 = ((unsigned int)word << 16) | word; > #ifdef __riscv > __asm__ volatile("nop"); // ALTERNATIVE(nop) > > __asm__ volatile( > ".option push\n" > ".option arch,+zbb\n" > "rorw %0, %1, %2\n" > ".option pop\n" > : "=r" (word32) : "r" (word32), "r" (shift) :); > #endif > return (unsigned short)word; > } > ``` > The disassembly obtained is: > ``` > generic_ror32: > andi a1,a1,31 The compiler shouldn't be generating that mask. After all it knows the negated value doesn't need the same mask. (I'd guess the cpu just ignores the high bits of the shift - most do.) > negw a5,a1 > sllw a5,a0,a5 > srlw a0,a0,a1 > or a0,a5,a0 > ret > > zbb_opt_ror32: > nop > rorw a0, a0, a1 > sext.w a0,a0 Is that a sign extend? Why is it there? If it is related to the (broken) 'feature' of riscv-64 that 32bit results are sign extended, why isn't there one in the example above. You also need to consider the code for non-zbb cpu. > ret > > generic_ror16: > andi a1,a1,15 > negw a5,a1 > andi a5,a5,15 > sllw a5,a0,a5 > srlw a0,a0,a1 > or a0,a0,a5 > slli a0,a0,48 > srli a0,a0,48 The last two instructions mask the result with 0xffff. If that is necessary it is missing from the zbb version below. > ret > > zbb_opt_ror16: > slliw a5,a0,16 > addw a5,a5,a0 At this point you can just do a 'shift right' on all cpu. For rol16 you can do a variable shift left and a 16 bit shift right on all cpu. If the zbb version ends up with a nop (as below) then it is likely to be much the same speed. David > nop > rorw a5, a5, a1 > ret > ``` > > Thanks, > Pei
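To make David's suggestion concrete: once the 16-bit value has been replicated into both halves of a 32-bit word, ror16 becomes a single variable right shift and rol16 a variable left shift followed by a fixed 16-bit right shift, on any CPU. A sketch assuming shift has already been reduced to the 0..15 range (the helper names are illustrative):

```c
/* Shift-based 16-bit rotates as described above; shift is assumed to be 0..15. */
static inline u16 ror16_by_shift(u16 word, unsigned int shift)
{
	u32 w = ((u32)word << 16) | word;	/* e.g. 0x1234 -> 0x12341234 */

	return (u16)(w >> shift);		/* rotated value is in the low 16 bits */
}

static inline u16 rol16_by_shift(u16 word, unsigned int shift)
{
	u32 w = ((u32)word << 16) | word;

	return (u16)((w << shift) >> 16);	/* rotated value lands in bits 16..31 */
}
```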
On Mon, 30 Jun 2025 18:35:34 +0100, david.laight.linux@gmail.com wrote: > > On Sun, 29 Jun 2025 11:38:40 +0100, david.laight.linux@gmail.com wrote: > > > > > > It can be found that the zbb optimized implementation uses fewer instructions, > > > > even for 16-bit and 8-bit data. > > > > > > Far too many register spills to stack. > > > I think you've forgotten to specify -O2 > > > > Yes, I extracted it from the vmlinux disassembly, without compiling with -O2, and > > I used the web tool you provided as follows: > > ``` > > unsigned int generic_ror32(unsigned int word, unsigned int shift) > > { > > return (word >> (shift & 31)) | (word << ((-shift) & 31)); > > } > > > > unsigned int zbb_opt_ror32(unsigned int word, unsigned int shift) > > { > > #ifdef __riscv > > __asm__ volatile("nop"); // ALTERNATIVE(nop) > > > > __asm__ volatile( > > ".option push\n" > > ".option arch,+zbb\n" > > "rorw %0, %1, %2\n" > > ".option pop\n" > > : "=r" (word) : "r" (word), "r" (shift) :); > > #endif > > return word; > > } > > > > unsigned short generic_ror16(unsigned short word, unsigned int shift) > > { > > return (word >> (shift & 15)) | (word << ((-shift) & 15)); > > } > > > > unsigned short zbb_opt_ror16(unsigned short word, unsigned int shift) > > { > > unsigned int word32 = ((unsigned int)word << 16) | word; > > #ifdef __riscv > > __asm__ volatile("nop"); // ALTERNATIVE(nop) > > > > __asm__ volatile( > > ".option push\n" > > ".option arch,+zbb\n" > > "rorw %0, %1, %2\n" > > ".option pop\n" > > : "=r" (word32) : "r" (word32), "r" (shift) :); > > #endif > > return (unsigned short)word; > > } > > ``` > > The disassembly obtained is: > > ``` > > generic_ror32: > > andi a1,a1,31 > > The compiler shouldn't be generating that mask. > After all it knows the negated value doesn't need the same mask. > (I'd guess the cpu just ignores the high bits of the shift - most do.) > > > negw a5,a1 > > sllw a5,a0,a5 > > srlw a0,a0,a1 > > or a0,a5,a0 > > ret > > > > zbb_opt_ror32: > > nop > > rorw a0, a0, a1 > > sext.w a0,a0 > > Is that a sign extend? > Why is it there? > If it is related to the (broken) 'feature' of riscv-64 that 32bit results > are sign extended, why isn't there one in the example above. > > You also need to consider the code for non-zbb cpu. > > > ret > > > > generic_ror16: > > andi a1,a1,15 > > negw a5,a1 > > andi a5,a5,15 > > sllw a5,a0,a5 > > srlw a0,a0,a1 > > or a0,a0,a5 > > slli a0,a0,48 > > srli a0,a0,48 > > The last two instructions mask the result with 0xffff. > If that is necessary it is missing from the zbb version below. > > > ret > > > > zbb_opt_ror16: > > slliw a5,a0,16 > > addw a5,a5,a0 > > At this point you can just do a 'shift right' on all cpu. > For rol16 you can do a variable shift left and a 16 bit > shift right on all cpu. > If the zbb version ends up with a nop (as below) then it is > likely to be much the same speed. > > David > > > nop > > rorw a5, a5, a1 > > ret > > ``` Sorry, please allow me to reply in a unified way. I did not check the rationality of the above assembly, but only used the web tool you provided before to generate it. In fact, I think it is more in line with the actual situation to disassemble it from vmlinux. In addition, the code is simplified here, and the complete implementation takes into account the processor that does not support or has not enabled zbb.