[tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

tip-bot2 for Uros Bizjak posted 1 patch 1 year, 10 months ago
arch/x86/include/asm/percpu.h | 32 +++++++++++---------------------
1 file changed, 11 insertions(+), 21 deletions(-)
[tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
Posted by tip-bot2 for Uros Bizjak 1 year, 10 months ago
The following commit has been merged into the x86/percpu branch of tip:

Commit-ID:     0539084639f3835c8d0b798e6659ec14a266b4f1
Gitweb:        https://git.kernel.org/tip/0539084639f3835c8d0b798e6659ec14a266b4f1
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 20 Mar 2024 09:30:40 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 20 Mar 2024 12:29:02 +01:00

x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

Rewrite percpu_xchg_op() using generic percpu primitives instead
of using asm. The new implementation is similar to local_xchg() and
allows the compiler to perform various optimizations: e.g. the
compiler is able to create fast path through the loop, according
to likely/unlikely annotations in percpu_try_cmpxchg_op().

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20240320083127.493250-1-ubizjak@gmail.com
---
 arch/x86/include/asm/percpu.h | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958eb..de991e6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,25 +230,15 @@ do {									\
 })
 
 /*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix.  The processor cannot prefetch
- * cachelines if xchg is used.
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
  */
-#define percpu_xchg_op(size, qual, _var, _nval)				\
+#define this_percpu_xchg_op(_var, _nval)				\
 ({									\
-	__pcpu_type_##size pxo_old__;					\
-	__pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval);	\
-	asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]),		\
-				    "%[oval]")				\
-		  "\n1:\t"						\
-		  __pcpu_op2_##size("cmpxchg", "%[nval]",		\
-				    __percpu_arg([var]))		\
-		  "\n\tjnz 1b"						\
-		  : [oval] "=&a" (pxo_old__),				\
-		    [var] "+m" (__my_cpu_var(_var))			\
-		  : [nval] __pcpu_reg_##size(, pxo_new__)		\
-		  : "memory");						\
-	(typeof(_var))(unsigned long) pxo_old__;			\
+	typeof(_var) pxo_old__ = this_cpu_read(_var);			\
+	do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval));	\
+	pxo_old__;							\
 })
 
 /*
@@ -534,9 +524,9 @@ do {									\
 #define this_cpu_or_1(pcp, val)		percpu_to_op(1, volatile, "or", (pcp), val)
 #define this_cpu_or_2(pcp, val)		percpu_to_op(2, volatile, "or", (pcp), val)
 #define this_cpu_or_4(pcp, val)		percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval)	this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	this_percpu_xchg_op(pcp, nval)
 
 #define raw_cpu_add_return_1(pcp, val)		percpu_add_return_op(1, , pcp, val)
 #define raw_cpu_add_return_2(pcp, val)		percpu_add_return_op(2, , pcp, val)
@@ -575,7 +565,7 @@ do {									\
 #define this_cpu_and_8(pcp, val)		percpu_to_op(8, volatile, "and", (pcp), val)
 #define this_cpu_or_8(pcp, val)			percpu_to_op(8, volatile, "or", (pcp), val)
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)		this_percpu_xchg_op(pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
 #define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif
Re: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
Posted by Ingo Molnar 1 year, 10 months ago
* tip-bot2 for Uros Bizjak <tip-bot2@linutronix.de> wrote:

> The following commit has been merged into the x86/percpu branch of tip:
> 
> Commit-ID:     0539084639f3835c8d0b798e6659ec14a266b4f1
> Gitweb:        https://git.kernel.org/tip/0539084639f3835c8d0b798e6659ec14a266b4f1
> Author:        Uros Bizjak <ubizjak@gmail.com>
> AuthorDate:    Wed, 20 Mar 2024 09:30:40 +01:00
> Committer:     Ingo Molnar <mingo@kernel.org>
> CommitterDate: Wed, 20 Mar 2024 12:29:02 +01:00
> 
> x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
> 
> Rewrite percpu_xchg_op() using generic percpu primitives instead
> of using asm. The new implementation is similar to local_xchg() and
> allows the compiler to perform various optimizations: e.g. the
> compiler is able to create fast path through the loop, according
> to likely/unlikely annotations in percpu_try_cmpxchg_op().

So, while at it, there's two other x86 percpu code generation details I was 
wondering about:

1)

Right now it's GCC-only:

  config CC_HAS_NAMED_AS
          def_bool CC_IS_GCC && GCC_VERSION >= 120100

Because we wanted to create a stable core of known-working functionality.

I suppose we have already established that with the current merge window, 
so it might be time to expand it.

Clang claims to be compatible:

  https://releases.llvm.org/9.0.0/tools/clang/docs/LanguageExtensions.html

  "You can also use the GCC compatibility macros __seg_fs and __seg_gs for the
   same purpose. The preprocessor symbols __SEG_FS and __SEG_GS indicate their
   support."

I haven't tried it yet though.

2)

Also, is the GCC_VERSION cutoff accurate - are previous GCC versions 
known-buggy, or was it primarily a risk-reduction cutoff?

Thanks,

	Ingo
Re: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
Posted by Uros Bizjak 1 year, 10 months ago
On Wed, Mar 20, 2024 at 12:45 PM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * tip-bot2 for Uros Bizjak <tip-bot2@linutronix.de> wrote:
>
> > The following commit has been merged into the x86/percpu branch of tip:
> >
> > Commit-ID:     0539084639f3835c8d0b798e6659ec14a266b4f1
> > Gitweb:        https://git.kernel.org/tip/0539084639f3835c8d0b798e6659ec14a266b4f1
> > Author:        Uros Bizjak <ubizjak@gmail.com>
> > AuthorDate:    Wed, 20 Mar 2024 09:30:40 +01:00
> > Committer:     Ingo Molnar <mingo@kernel.org>
> > CommitterDate: Wed, 20 Mar 2024 12:29:02 +01:00
> >
> > x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
> >
> > Rewrite percpu_xchg_op() using generic percpu primitives instead
> > of using asm. The new implementation is similar to local_xchg() and
> > allows the compiler to perform various optimizations: e.g. the
> > compiler is able to create fast path through the loop, according
> > to likely/unlikely annotations in percpu_try_cmpxchg_op().
>
> So, while at it, there's two other x86 percpu code generation details I was
> wondering about:
>
> 1)
>
> Right now it's GCC-only:
>
>   config CC_HAS_NAMED_AS
>           def_bool CC_IS_GCC && GCC_VERSION >= 120100
>
> Because we wanted to create a stable core of known-working functionality.
>
> I suppose we have already established that with the current merge window,
> so it might be time to expand it.

Please note the KASAN incompatibility issue with GCC < 13.3. This
issue was fixed in the meantime, so I have posted a patch to re-enable
the named AS feature for gcc-13.3+ [1].

[1] https://lore.kernel.org/lkml/20240320124603.566923-1-ubizjak@gmail.com/

> Clang claims to be compatible:
>
>   https://releases.llvm.org/9.0.0/tools/clang/docs/LanguageExtensions.html
>
>   "You can also use the GCC compatibility macros __seg_fs and __seg_gs for the
>    same purpose. The preprocessor symbols __SEG_FS and __SEG_GS indicate their
>    support."
>
> I haven't tried it yet though.

In the RFC submission, the support was determined by the functional
check [2]. Perhaps we should re-introduce this instead of checking for
known compiler versions:

+config CC_HAS_NAMED_AS
+ def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC)
-x c - -c -o /dev/null)

[2] https://lore.kernel.org/lkml/20231001131620.112484-3-ubizjak@gmail.com/

> 2)
>
> Also, is the GCC_VERSION cutoff accurate - are previous GCC versions
> known-buggy, or was it primarily a risk-reduction cutoff?

This approach was chosen from our discussion [3]. The version cutoff
is arbitrary, it was later set to gcc-12.1, because it is the version
of the compiler you used at the time ;) I have also tried gcc-11 and
gcc-10 in the past, and the compiler produced bootable image. Saying
that, the usage of named address spaces in the kernel is somehow basic
(from the compiler PoV), so I think we could try the above approach
with the functional check and see if and what breaks. We can always
disable the USE_X86_SEG_SUPPORT config variable for known bad compiler
versions.

[3] https://lore.kernel.org/lkml/ZRwZOtANkcwtL+5B@gmail.com/

BTW: Related to percpu series is the patch that fixes the issue with
%rip-relative addressing for PER_CPU_VAR in BPF. IMHO, this issue
should be fixed before rc1, otherwise call thunks will be unusable
with BPF.

[4] https://lore.kernel.org/lkml/20240316232104.368561-1-joanbrugueram@gmail.com/

Thanks,
Uros.
Re: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
Posted by Nathan Chancellor 1 year, 10 months ago
On Wed, Mar 20, 2024 at 02:12:14PM +0100, Uros Bizjak wrote:
> On Wed, Mar 20, 2024 at 12:45 PM Ingo Molnar <mingo@kernel.org> wrote:
> > Clang claims to be compatible:
> >
> >   https://releases.llvm.org/9.0.0/tools/clang/docs/LanguageExtensions.html
> >
> >   "You can also use the GCC compatibility macros __seg_fs and __seg_gs for the
> >    same purpose. The preprocessor symbols __SEG_FS and __SEG_GS indicate their
> >    support."
> >
> > I haven't tried it yet though.
> 
> In the RFC submission, the support was determined by the functional
> check [2]. Perhaps we should re-introduce this instead of checking for
> known compiler versions:
> 
> +config CC_HAS_NAMED_AS
> + def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC)
> -x c - -c -o /dev/null)
> 
> [2] https://lore.kernel.org/lkml/20231001131620.112484-3-ubizjak@gmail.com/

I applied this change on top of current mainline (a4145ce1e7bc) and
built ARCH=x86_64 defconfig with LLVM 17.0.6 from [1] but it doesn't get
too far :)

  In file included from arch/x86/kernel/asm-offsets.c:9:
  In file included from include/linux/crypto.h:15:
  In file included from include/linux/completion.h:12:
  In file included from include/linux/swait.h:7:
  In file included from include/linux/spinlock.h:56:
  In file included from include/linux/preempt.h:79:
  In file included from arch/x86/include/asm/preempt.h:7:
  arch/x86/include/asm/current.h:47:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
     47 |                 return this_cpu_read_const(const_pcpu_hot.current_task);
        |                        ^
  arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
    471 | #define this_cpu_read_const(pcp)        __raw_cpu_read(, pcp)
        |                                         ^
  arch/x86/include/asm/percpu.h:441:30: note: expanded from macro '__raw_cpu_read'
    441 |         *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp));               \
        |                                     ^
  arch/x86/include/asm/percpu.h:105:28: note: expanded from macro '__my_cpu_ptr'
    105 | #define __my_cpu_ptr(ptr)       (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
        |                                  ^
  arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
    104 | #define __my_cpu_type(var)      typeof(var) __percpu_seg_override
        |                                             ^
  arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
     45 | #define __percpu_seg_override   __seg_gs
        |                                 ^
  <built-in>:338:33: note: expanded from macro '__seg_gs'
    338 | #define __seg_gs __attribute__((address_space(256)))
        |                                 ^
  In file included from arch/x86/kernel/asm-offsets.c:9:
  In file included from include/linux/crypto.h:15:
  In file included from include/linux/completion.h:12:
  In file included from include/linux/swait.h:7:
  In file included from include/linux/spinlock.h:56:
  In file included from include/linux/preempt.h:79:
  In file included from arch/x86/include/asm/preempt.h:7:
  arch/x86/include/asm/current.h:47:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
  arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
    471 | #define this_cpu_read_const(pcp)        __raw_cpu_read(, pcp)
        |                                         ^
  arch/x86/include/asm/percpu.h:441:9: note: expanded from macro '__raw_cpu_read'
    441 |         *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp));               \
        |                ^
  arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
    104 | #define __my_cpu_type(var)      typeof(var) __percpu_seg_override
        |                                             ^
  arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
     45 | #define __percpu_seg_override   __seg_gs
        |                                 ^
  <built-in>:338:33: note: expanded from macro '__seg_gs'
    338 | #define __seg_gs __attribute__((address_space(256)))
        |                                 ^
  In file included from arch/x86/kernel/asm-offsets.c:9:
  In file included from include/linux/crypto.h:15:
  In file included from include/linux/completion.h:12:
  In file included from include/linux/swait.h:7:
  In file included from include/linux/spinlock.h:60:
  In file included from include/linux/thread_info.h:60:
  In file included from arch/x86/include/asm/thread_info.h:59:
  In file included from arch/x86/include/asm/cpufeature.h:5:
  arch/x86/include/asm/processor.h:530:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
    530 |                 return this_cpu_read_const(const_pcpu_hot.top_of_stack);
        |                        ^
  arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
    471 | #define this_cpu_read_const(pcp)        __raw_cpu_read(, pcp)
        |                                         ^
  arch/x86/include/asm/percpu.h:441:30: note: expanded from macro '__raw_cpu_read'
    441 |         *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp));               \
        |                                     ^
  arch/x86/include/asm/percpu.h:105:28: note: expanded from macro '__my_cpu_ptr'
    105 | #define __my_cpu_ptr(ptr)       (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
        |                                  ^
  arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
    104 | #define __my_cpu_type(var)      typeof(var) __percpu_seg_override
        |                                             ^
  arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
     45 | #define __percpu_seg_override   __seg_gs
        |                                 ^
  <built-in>:338:33: note: expanded from macro '__seg_gs'
    338 | #define __seg_gs __attribute__((address_space(256)))
        |                                 ^
  In file included from arch/x86/kernel/asm-offsets.c:9:
  In file included from include/linux/crypto.h:15:
  In file included from include/linux/completion.h:12:
  In file included from include/linux/swait.h:7:
  In file included from include/linux/spinlock.h:60:
  In file included from include/linux/thread_info.h:60:
  In file included from arch/x86/include/asm/thread_info.h:59:
  In file included from arch/x86/include/asm/cpufeature.h:5:
  arch/x86/include/asm/processor.h:530:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
  arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
    471 | #define this_cpu_read_const(pcp)        __raw_cpu_read(, pcp)
        |                                         ^
  arch/x86/include/asm/percpu.h:441:9: note: expanded from macro '__raw_cpu_read'
    441 |         *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp));               \
        |                ^
  arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
    104 | #define __my_cpu_type(var)      typeof(var) __percpu_seg_override
        |                                             ^
  arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
     45 | #define __percpu_seg_override   __seg_gs
        |                                 ^
  <built-in>:338:33: note: expanded from macro '__seg_gs'
    338 | #define __seg_gs __attribute__((address_space(256)))
        |                                 ^
  4 errors generated.

[1]: https://mirrors.edge.kernel.org/pub/tools/llvm/

Cheers,
Nathan