[PATCH v2] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
Posted by Uros Bizjak 1 year, 3 months ago
The _percpu_read() and _percpu_write() macros call the __percpu_read()
and __percpu_write() static inline functions, each of which results in
a single assembly instruction. The percpu infrastructure expects its
leaf definitions to encode the size of their percpu variable, so the
patch merges the asm clauses from the static inline functions into the
corresponding leaf macros.

A secondary effect of this change is to avoid explicit __percpu
function arguments. Currently, the __percpu macro is defined in
include/linux/compiler_types.h, but with the proposed patch [1], the
__percpu definition will need macros from include/asm-generic/percpu.h,
creating a forward dependency loop.

The proposed solution is the same as the one the x86 architecture uses.

The patch is compile-tested only.

[1] https://lore.kernel.org/lkml/20240812115945.484051-4-ubizjak@gmail.com/

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Xi Ruoyao <xry111@xry111.site>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
v2: Add a missing cast and a missing comma in the asm template,
    reported by kernel test robot. Some formatting changes.
---
 arch/loongarch/include/asm/percpu.h | 130 +++++++++-------------------
 1 file changed, 41 insertions(+), 89 deletions(-)

diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
index 8f290e5546cf..2bb84227a8c5 100644
--- a/arch/loongarch/include/asm/percpu.h
+++ b/arch/loongarch/include/asm/percpu.h
@@ -68,75 +68,6 @@ PERCPU_OP(and, and, &)
 PERCPU_OP(or, or, |)
 #undef PERCPU_OP
 
-static __always_inline unsigned long __percpu_read(void __percpu *ptr, int size)
-{
-	unsigned long ret;
-
-	switch (size) {
-	case 1:
-		__asm__ __volatile__ ("ldx.b %[ret], $r21, %[ptr]	\n"
-		: [ret] "=&r"(ret)
-		: [ptr] "r"(ptr)
-		: "memory");
-		break;
-	case 2:
-		__asm__ __volatile__ ("ldx.h %[ret], $r21, %[ptr]	\n"
-		: [ret] "=&r"(ret)
-		: [ptr] "r"(ptr)
-		: "memory");
-		break;
-	case 4:
-		__asm__ __volatile__ ("ldx.w %[ret], $r21, %[ptr]	\n"
-		: [ret] "=&r"(ret)
-		: [ptr] "r"(ptr)
-		: "memory");
-		break;
-	case 8:
-		__asm__ __volatile__ ("ldx.d %[ret], $r21, %[ptr]	\n"
-		: [ret] "=&r"(ret)
-		: [ptr] "r"(ptr)
-		: "memory");
-		break;
-	default:
-		ret = 0;
-		BUILD_BUG();
-	}
-
-	return ret;
-}
-
-static __always_inline void __percpu_write(void __percpu *ptr, unsigned long val, int size)
-{
-	switch (size) {
-	case 1:
-		__asm__ __volatile__("stx.b %[val], $r21, %[ptr]	\n"
-		:
-		: [val] "r" (val), [ptr] "r" (ptr)
-		: "memory");
-		break;
-	case 2:
-		__asm__ __volatile__("stx.h %[val], $r21, %[ptr]	\n"
-		:
-		: [val] "r" (val), [ptr] "r" (ptr)
-		: "memory");
-		break;
-	case 4:
-		__asm__ __volatile__("stx.w %[val], $r21, %[ptr]	\n"
-		:
-		: [val] "r" (val), [ptr] "r" (ptr)
-		: "memory");
-		break;
-	case 8:
-		__asm__ __volatile__("stx.d %[val], $r21, %[ptr]	\n"
-		:
-		: [val] "r" (val), [ptr] "r" (ptr)
-		: "memory");
-		break;
-	default:
-		BUILD_BUG();
-	}
-}
-
 static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val, int size)
 {
 	switch (size) {
@@ -157,6 +88,39 @@ static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
 	return 0;
 }
 
+#define __pcpu_op_1(op)		op ".b "
+#define __pcpu_op_2(op)		op ".h "
+#define __pcpu_op_4(op)		op ".w "
+#define __pcpu_op_8(op)		op ".d "
+
+#define _percpu_read(size, _pcp)					\
+({									\
+	unsigned long __pcp_ret;					\
+									\
+	__asm__ __volatile__(						\
+		__pcpu_op_##size("ldx") "%[ret], $r21, %[ptr]	\n"	\
+		: [ret] "=&r"(__pcp_ret)				\
+		: [ptr] "r"(&(_pcp))					\
+		: "memory");						\
+	(typeof(_pcp))__pcp_ret;					\
+})
+
+#define __pcpu_cast_1(val)	(((unsigned long) val) & 0xff)
+#define __pcpu_cast_2(val)	(((unsigned long) val) & 0xffff)
+#define __pcpu_cast_4(val)	(((unsigned long) val) & 0xffffffff)
+#define __pcpu_cast_8(val)	((unsigned long) val)
+
+#define _percpu_write(size, _pcp, _val)					\
+do {									\
+	unsigned long __pcp_val = __pcpu_cast_##size(_val);		\
+									\
+	__asm__ __volatile__(						\
+		__pcpu_op_##size("stx") "%[val], $r21, %[ptr]	\n"	\
+		:							\
+		: [val] "r"(__pcp_val), [ptr] "r"(&(_pcp))		\
+		: "memory");						\
+} while (0)
+
 /* this_cpu_cmpxchg */
 #define _protect_cmpxchg_local(pcp, o, n)			\
 ({								\
@@ -167,18 +131,6 @@ static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
 	__ret;							\
 })
 
-#define _percpu_read(pcp)						\
-({									\
-	typeof(pcp) __retval;						\
-	__retval = (typeof(pcp))__percpu_read(&(pcp), sizeof(pcp));	\
-	__retval;							\
-})
-
-#define _percpu_write(pcp, val)						\
-do {									\
-	__percpu_write(&(pcp), (unsigned long)(val), sizeof(pcp));	\
-} while (0)								\
-
 #define _pcp_protect(operation, pcp, val)			\
 ({								\
 	typeof(pcp) __retval;					\
@@ -215,15 +167,15 @@ do {									\
 #define this_cpu_or_4(pcp, val) _percpu_or(pcp, val)
 #define this_cpu_or_8(pcp, val) _percpu_or(pcp, val)
 
-#define this_cpu_read_1(pcp) _percpu_read(pcp)
-#define this_cpu_read_2(pcp) _percpu_read(pcp)
-#define this_cpu_read_4(pcp) _percpu_read(pcp)
-#define this_cpu_read_8(pcp) _percpu_read(pcp)
+#define this_cpu_read_1(pcp) _percpu_read(1, pcp)
+#define this_cpu_read_2(pcp) _percpu_read(2, pcp)
+#define this_cpu_read_4(pcp) _percpu_read(4, pcp)
+#define this_cpu_read_8(pcp) _percpu_read(8, pcp)
 
-#define this_cpu_write_1(pcp, val) _percpu_write(pcp, val)
-#define this_cpu_write_2(pcp, val) _percpu_write(pcp, val)
-#define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
-#define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
+#define this_cpu_write_1(pcp, val) _percpu_write(1, pcp, val)
+#define this_cpu_write_2(pcp, val) _percpu_write(2, pcp, val)
+#define this_cpu_write_4(pcp, val) _percpu_write(4, pcp, val)
+#define this_cpu_write_8(pcp, val) _percpu_write(8, pcp, val)
 
 #define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
 #define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
-- 
2.46.0
Re: [PATCH v2] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
Posted by Huacai Chen 1 year, 3 months ago
Hi, Uros,

Thank you for your patch.

On Wed, Sep 4, 2024 at 5:52 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> The _percpu_read() and _percpu_write() macros call the __percpu_read()
> and __percpu_write() static inline functions, each of which results in
> a single assembly instruction. The percpu infrastructure expects its
> leaf definitions to encode the size of their percpu variable, so the
> patch merges the asm clauses from the static inline functions into the
> corresponding leaf macros.
It seems in some other places we prefer inline functions rather than
macros, but this patch is the opposite...

>
> A secondary effect of this change is to avoid explicit __percpu
> function arguments. Currently, the __percpu macro is defined in
> include/linux/compiler_types.h, but with the proposed patch [1], the
> __percpu definition will need macros from include/asm-generic/percpu.h,
> creating a forward dependency loop.
Macros don't check types, so you are using macros to drop the
"__percpu" checking? Seems a little strange.

>
> The proposed solution is the same as the one the x86 architecture uses.
>
> The patch is compile-tested only.
>
> [1] https://lore.kernel.org/lkml/20240812115945.484051-4-ubizjak@gmail.com/
>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Huacai Chen <chenhuacai@kernel.org>
> Cc: WANG Xuerui <kernel@xen0n.name>
> Cc: Xi Ruoyao <xry111@xry111.site>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> ---
> v2: Add a missing cast and a missing comma in the asm template,
>     reported by kernel test robot. Some formatting changes.
> ---
>  arch/loongarch/include/asm/percpu.h | 130 +++++++++-------------------
>  1 file changed, 41 insertions(+), 89 deletions(-)
>
> diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
> index 8f290e5546cf..2bb84227a8c5 100644
> --- a/arch/loongarch/include/asm/percpu.h
> +++ b/arch/loongarch/include/asm/percpu.h
> @@ -68,75 +68,6 @@ PERCPU_OP(and, and, &)
>  PERCPU_OP(or, or, |)
>  #undef PERCPU_OP
>
> -static __always_inline unsigned long __percpu_read(void __percpu *ptr, int size)
> -{
> -       unsigned long ret;
> -
> -       switch (size) {
> -       case 1:
> -               __asm__ __volatile__ ("ldx.b %[ret], $r21, %[ptr]       \n"
> -               : [ret] "=&r"(ret)
> -               : [ptr] "r"(ptr)
> -               : "memory");
> -               break;
> -       case 2:
> -               __asm__ __volatile__ ("ldx.h %[ret], $r21, %[ptr]       \n"
> -               : [ret] "=&r"(ret)
> -               : [ptr] "r"(ptr)
> -               : "memory");
> -               break;
> -       case 4:
> -               __asm__ __volatile__ ("ldx.w %[ret], $r21, %[ptr]       \n"
> -               : [ret] "=&r"(ret)
> -               : [ptr] "r"(ptr)
> -               : "memory");
> -               break;
> -       case 8:
> -               __asm__ __volatile__ ("ldx.d %[ret], $r21, %[ptr]       \n"
> -               : [ret] "=&r"(ret)
> -               : [ptr] "r"(ptr)
> -               : "memory");
> -               break;
> -       default:
> -               ret = 0;
> -               BUILD_BUG();
> -       }
> -
> -       return ret;
> -}
> -
> -static __always_inline void __percpu_write(void __percpu *ptr, unsigned long val, int size)
> -{
> -       switch (size) {
> -       case 1:
> -               __asm__ __volatile__("stx.b %[val], $r21, %[ptr]        \n"
> -               :
> -               : [val] "r" (val), [ptr] "r" (ptr)
> -               : "memory");
> -               break;
> -       case 2:
> -               __asm__ __volatile__("stx.h %[val], $r21, %[ptr]        \n"
> -               :
> -               : [val] "r" (val), [ptr] "r" (ptr)
> -               : "memory");
> -               break;
> -       case 4:
> -               __asm__ __volatile__("stx.w %[val], $r21, %[ptr]        \n"
> -               :
> -               : [val] "r" (val), [ptr] "r" (ptr)
> -               : "memory");
> -               break;
> -       case 8:
> -               __asm__ __volatile__("stx.d %[val], $r21, %[ptr]        \n"
> -               :
> -               : [val] "r" (val), [ptr] "r" (ptr)
> -               : "memory");
> -               break;
> -       default:
> -               BUILD_BUG();
> -       }
> -}
> -
>  static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val, int size)
>  {
>         switch (size) {
> @@ -157,6 +88,39 @@ static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
>         return 0;
>  }
>
> +#define __pcpu_op_1(op)                op ".b "
> +#define __pcpu_op_2(op)                op ".h "
> +#define __pcpu_op_4(op)                op ".w "
> +#define __pcpu_op_8(op)                op ".d "
> +
> +#define _percpu_read(size, _pcp)                                       \
> +({                                                                     \
> +       unsigned long __pcp_ret;                                        \
> +                                                                       \
> +       __asm__ __volatile__(                                           \
> +               __pcpu_op_##size("ldx") "%[ret], $r21, %[ptr]   \n"     \
> +               : [ret] "=&r"(__pcp_ret)                                \
> +               : [ptr] "r"(&(_pcp))                                    \
> +               : "memory");                                            \
> +       (typeof(_pcp))__pcp_ret;                                        \
> +})
> +
> +#define __pcpu_cast_1(val)     (((unsigned long) val) & 0xff)
> +#define __pcpu_cast_2(val)     (((unsigned long) val) & 0xffff)
> +#define __pcpu_cast_4(val)     (((unsigned long) val) & 0xffffffff)
> +#define __pcpu_cast_8(val)     ((unsigned long) val)
Maybe using "((unsigned long) val)" is enough for all the casts?

Huacai

> +
> +#define _percpu_write(size, _pcp, _val)                                        \
> +do {                                                                   \
> +       unsigned long __pcp_val = __pcpu_cast_##size(_val);             \
> +                                                                       \
> +       __asm__ __volatile__(                                           \
> +               __pcpu_op_##size("stx") "%[val], $r21, %[ptr]   \n"     \
> +               :                                                       \
> +               : [val] "r"(__pcp_val), [ptr] "r"(&(_pcp))              \
> +               : "memory");                                            \
> +} while (0)
> +
>  /* this_cpu_cmpxchg */
>  #define _protect_cmpxchg_local(pcp, o, n)                      \
>  ({                                                             \
> @@ -167,18 +131,6 @@ static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
>         __ret;                                                  \
>  })
>
> -#define _percpu_read(pcp)                                              \
> -({                                                                     \
> -       typeof(pcp) __retval;                                           \
> -       __retval = (typeof(pcp))__percpu_read(&(pcp), sizeof(pcp));     \
> -       __retval;                                                       \
> -})
> -
> -#define _percpu_write(pcp, val)                                                \
> -do {                                                                   \
> -       __percpu_write(&(pcp), (unsigned long)(val), sizeof(pcp));      \
> -} while (0)                                                            \
> -
>  #define _pcp_protect(operation, pcp, val)                      \
>  ({                                                             \
>         typeof(pcp) __retval;                                   \
> @@ -215,15 +167,15 @@ do {                                                                      \
>  #define this_cpu_or_4(pcp, val) _percpu_or(pcp, val)
>  #define this_cpu_or_8(pcp, val) _percpu_or(pcp, val)
>
> -#define this_cpu_read_1(pcp) _percpu_read(pcp)
> -#define this_cpu_read_2(pcp) _percpu_read(pcp)
> -#define this_cpu_read_4(pcp) _percpu_read(pcp)
> -#define this_cpu_read_8(pcp) _percpu_read(pcp)
> +#define this_cpu_read_1(pcp) _percpu_read(1, pcp)
> +#define this_cpu_read_2(pcp) _percpu_read(2, pcp)
> +#define this_cpu_read_4(pcp) _percpu_read(4, pcp)
> +#define this_cpu_read_8(pcp) _percpu_read(8, pcp)
>
> -#define this_cpu_write_1(pcp, val) _percpu_write(pcp, val)
> -#define this_cpu_write_2(pcp, val) _percpu_write(pcp, val)
> -#define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
> -#define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
> +#define this_cpu_write_1(pcp, val) _percpu_write(1, pcp, val)
> +#define this_cpu_write_2(pcp, val) _percpu_write(2, pcp, val)
> +#define this_cpu_write_4(pcp, val) _percpu_write(4, pcp, val)
> +#define this_cpu_write_8(pcp, val) _percpu_write(8, pcp, val)
>
>  #define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
>  #define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
> --
> 2.46.0
>
Re: [PATCH v2] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
Posted by Uros Bizjak 1 year, 3 months ago
On Wed, Sep 4, 2024 at 5:02 PM Huacai Chen <chenhuacai@kernel.org> wrote:
>
> Hi, Uros,
>
> Thank you for your patch.
>
> On Wed, Sep 4, 2024 at 5:52 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > The _percpu_read() and _percpu_write() macros call the __percpu_read()
> > and __percpu_write() static inline functions, each of which results in
> > a single assembly instruction. The percpu infrastructure expects its
> > leaf definitions to encode the size of their percpu variable, so the
> > patch merges the asm clauses from the static inline functions into the
> > corresponding leaf macros.
> It seems in some other places we prefer inline functions rather than
> macros, but this patch is the opposite...

Please note that these are leaf macros (functions), always used
through the upper-level macro (see e.g. the definition of
raw_cpu_read() and __pcpu_size_call_return() in
include/linux/percpu-defs.h). These upper-level macros type-check
the pointer, so there is no need to do it again in the leaf macro.
The percpu address space checks on x86 depend on the presence of
these checks.
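
For reference, the dispatch looks roughly like this (abridged from
include/linux/percpu-defs.h):

/*
 * The upper-level macro verifies the __percpu pointer once, then
 * dispatches on sizeof() to the size-encoded leaf provided by the
 * architecture.
 */
#define __pcpu_size_call_return(stem, variable)				\
({									\
	typeof(variable) pscr_ret__;					\
	__verify_pcpu_ptr(&(variable));					\
	switch (sizeof(variable)) {					\
	case 1: pscr_ret__ = stem##1(variable); break;			\
	case 2: pscr_ret__ = stem##2(variable); break;			\
	case 4: pscr_ret__ = stem##4(variable); break;			\
	case 8: pscr_ret__ = stem##8(variable); break;			\
	default:							\
		__bad_size_call_parameter(); break;			\
	}								\
	pscr_ret__;							\
})

#define raw_cpu_read(pcp)	__pcpu_size_call_return(raw_cpu_read_, pcp)

so a raw_cpu_read_4() leaf only ever sees a pointer that has already
been verified one level up.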

> >
> > A secondary effect of this change is to avoid explicit __percpu
> > function arguments. Currently, the __percpu macro is defined in
> > include/linux/compiler_types.h, but with the proposed patch [1], the
> > __percpu definition will need macros from include/asm-generic/percpu.h,
> > creating a forward dependency loop.
> Macros don't check types, so you are using macros to drop the
> "__percpu" checking? Seems a little strange.

As explained above, types are checked in the upper-level macro (that
uses these leaf macros) through __verify_pcpu_ptr(). These checks
currently use sparse to check the __percpu tag, but x86 will soon use
the compiler infrastructure with much more powerful checks here.

So, there is really no need to also type-check the percpu pointer in
the leaf functions.
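
For completeness, the check itself looks like this (from
include/linux/percpu-defs.h, kernel comment trimmed):

/*
 * The dummy assignment is never executed; it only exists so that
 * sparse (or the compiler, with the x86 named address space checks)
 * can complain when @ptr lacks the __percpu annotation.
 */
#define __verify_pcpu_ptr(ptr)						\
do {									\
	const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL;	\
	(void)__vpp_verify;						\
} while (0)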

Best regards,
Uros.
Re: [PATCH v2] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
Posted by Uros Bizjak 1 year, 3 months ago
On Wed, Sep 4, 2024 at 5:24 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Sep 4, 2024 at 5:02 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> >
> > Hi, Uros,
> >
> > Thank you for your patch.
> >
> > On Wed, Sep 4, 2024 at 5:52 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > The _percpu_read() and _percpu_write() macros call the __percpu_read()
> > > and __percpu_write() static inline functions, each of which results in
> > > a single assembly instruction. The percpu infrastructure expects its
> > > leaf definitions to encode the size of their percpu variable, so the
> > > patch merges the asm clauses from the static inline functions into the
> > > corresponding leaf macros.
> > It seems in some other places we prefer inline functions rather than
> > macros, but this patch is the opposite...
>
> Please note that these are leaf macros (functions), always used
> through the upper-level macro (see e.g. the definition of
> raw_cpu_read() and __pcpu_size_call_return() in
> include/linux/percpu-defs.h). These upper-level macros type-check
> the pointer, so there is no need to do it again in the leaf macro.
> The percpu address space checks on x86 depend on the presence of
> these checks.
>
> > >
> > > A secondary effect of this change is to avoid explicit __percpu
> > > function arguments. Currently, the __percpu macro is defined in
> > > include/linux/compiler_types.h, but with the proposed patch [1], the
> > > __percpu definition will need macros from include/asm-generic/percpu.h,
> > > creating a forward dependency loop.
> > Macros don't check types, so you are using macros to drop the
> > "__percpu" checking? Seems a little strange.
>
> As explained above, types are checked in the upper-level macro (that
> uses these leaf macros) through __verify_pcpu_ptr(). These checks
> currently use sparse to check the __percpu tag, but x86 will soon use
> the compiler infrastructure with much more powerful checks here.
>
> So, there is really no need to also type-check the percpu pointer in
> the leaf functions.

OTOH, you are right, we should also type-check _val in the case of
_percpu_write(). We need to add:

	if (0) {						\
		typeof(_pcp) pto_tmp__;				\
		pto_tmp__ = (_val);				\
		(void)pto_tmp__;				\
	}							\

to the _percpu_write() macro.
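
The if (0) block is optimized away entirely, so it generates no code,
but the dead assignment still makes the compiler apply its usual
type-compatibility diagnostics between the percpu variable and the
value being written. A hypothetical illustration (names invented):

	DEFINE_PER_CPU(int, foo);

	this_cpu_write(foo, 42);	/* fine */
	this_cpu_write(foo, &foo);	/* with the check in place: warns
					   about making an integer from a
					   pointer */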

Let me test this amendment a bit, please expect the V3 patch tomorrow.

Thanks,
Uros.