[PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
According to:

  https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html

the use of asm pseudo directives in the asm template can confuse
the compiler into wrongly estimating the size of the generated
code.

The LOCK_PREFIX macro expands to several asm pseudo directives, so
its use in atomic locking insns makes the instruction length estimate
fail significantly (a specially instrumented compiler reports the
estimated length of these asm templates as 6 instructions).
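
For reference, on SMP builds LOCK_PREFIX expands roughly as follows (an
abridged sketch of the definition in arch/x86/include/asm/alternative.h);
the .pushsection/.balign/.long/.popsection pseudo directives record the
location of the lock prefix in the .smp_locks section, and they are what
inflates the compiler's line-based size estimate:

  #define LOCK_PREFIX_HERE \
	".pushsection .smp_locks,\"a\"\n" \
	".balign 4\n" \
	".long 671f - .\n" /* offset */ \
	".popsection\n" \
	"671:"

  #define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "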

This incorrect estimate in turn leads to suboptimal inlining decisions,
suboptimal instruction scheduling and suboptimal code block alignments
for functions that use these locking primitives.

Use asm_inline instead:

  https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html

which is a feature that makes GCC pretend some inline assembler code
is tiny, whereas with plain asm it would think it is huge.

For code size estimation, the size of the asm is then taken to be
the minimum size of one instruction, ignoring how many instructions
the compiler thinks it actually contains.
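
For context, asm_inline is a thin kernel wrapper around this compiler
feature: when the compiler supports the "asm inline" qualifier it is
defined along these lines, and it degrades to plain asm otherwise (a
sketch based on include/linux/compiler_types.h):

  #ifdef CONFIG_CC_HAS_ASM_INLINE
  #define asm_inline asm __inline
  #else
  #define asm_inline asm
  #endif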

The code size of the resulting x86_64 defconfig object file increases
by 33.264 kbytes, representing a 1.2% code size increase:

   text    data     bss     dec     hex filename
27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o

mainly due to different inlining decisions of the -O2 build.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/include/asm/atomic.h      | 14 +++++++-------
 arch/x86/include/asm/atomic64_64.h | 14 +++++++-------
 arch/x86/include/asm/bitops.h      | 14 +++++++-------
 arch/x86/include/asm/cmpxchg.h     | 24 ++++++++++++------------
 arch/x86/include/asm/cmpxchg_32.h  |  4 ++--
 arch/x86/include/asm/cmpxchg_64.h  |  4 ++--
 arch/x86/include/asm/rmwcc.h       |  2 +-
 7 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 55b4d24356ea..75743f1dfd4e 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -30,14 +30,14 @@ static __always_inline void arch_atomic_set(atomic_t *v, int i)
 
 static __always_inline void arch_atomic_add(int i, atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "addl %1,%0"
+	asm_inline volatile(LOCK_PREFIX "addl %1, %0"
 		     : "+m" (v->counter)
 		     : "ir" (i) : "memory");
 }
 
 static __always_inline void arch_atomic_sub(int i, atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "subl %1,%0"
+	asm_inline volatile(LOCK_PREFIX "subl %1, %0"
 		     : "+m" (v->counter)
 		     : "ir" (i) : "memory");
 }
@@ -50,14 +50,14 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
 
 static __always_inline void arch_atomic_inc(atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "incl %0"
+	asm_inline volatile(LOCK_PREFIX "incl %0"
 		     : "+m" (v->counter) :: "memory");
 }
 #define arch_atomic_inc arch_atomic_inc
 
 static __always_inline void arch_atomic_dec(atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "decl %0"
+	asm_inline volatile(LOCK_PREFIX "decl %0"
 		     : "+m" (v->counter) :: "memory");
 }
 #define arch_atomic_dec arch_atomic_dec
@@ -116,7 +116,7 @@ static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
 
 static __always_inline void arch_atomic_and(int i, atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "andl %1,%0"
+	asm_inline volatile(LOCK_PREFIX "andl %1, %0"
 			: "+m" (v->counter)
 			: "ir" (i)
 			: "memory");
@@ -134,7 +134,7 @@ static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v)
 
 static __always_inline void arch_atomic_or(int i, atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "orl %1,%0"
+	asm_inline volatile(LOCK_PREFIX "orl %1, %0"
 			: "+m" (v->counter)
 			: "ir" (i)
 			: "memory");
@@ -152,7 +152,7 @@ static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v)
 
 static __always_inline void arch_atomic_xor(int i, atomic_t *v)
 {
-	asm volatile(LOCK_PREFIX "xorl %1,%0"
+	asm_inline volatile(LOCK_PREFIX "xorl %1, %0"
 			: "+m" (v->counter)
 			: "ir" (i)
 			: "memory");
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index ae12acae5b06..87b496325b5b 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -22,14 +22,14 @@ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
 
 static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
 {
-	asm volatile(LOCK_PREFIX "addq %1,%0"
+	asm_inline volatile(LOCK_PREFIX "addq %1, %0"
 		     : "=m" (v->counter)
 		     : "er" (i), "m" (v->counter) : "memory");
 }
 
 static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
 {
-	asm volatile(LOCK_PREFIX "subq %1,%0"
+	asm_inline volatile(LOCK_PREFIX "subq %1, %0"
 		     : "=m" (v->counter)
 		     : "er" (i), "m" (v->counter) : "memory");
 }
@@ -42,7 +42,7 @@ static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
 
 static __always_inline void arch_atomic64_inc(atomic64_t *v)
 {
-	asm volatile(LOCK_PREFIX "incq %0"
+	asm_inline volatile(LOCK_PREFIX "incq %0"
 		     : "=m" (v->counter)
 		     : "m" (v->counter) : "memory");
 }
@@ -50,7 +50,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v)
 
 static __always_inline void arch_atomic64_dec(atomic64_t *v)
 {
-	asm volatile(LOCK_PREFIX "decq %0"
+	asm_inline volatile(LOCK_PREFIX "decq %0"
 		     : "=m" (v->counter)
 		     : "m" (v->counter) : "memory");
 }
@@ -110,7 +110,7 @@ static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
 
 static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
 {
-	asm volatile(LOCK_PREFIX "andq %1,%0"
+	asm_inline volatile(LOCK_PREFIX "andq %1, %0"
 			: "+m" (v->counter)
 			: "er" (i)
 			: "memory");
@@ -128,7 +128,7 @@ static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
 
 static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
 {
-	asm volatile(LOCK_PREFIX "orq %1,%0"
+	asm_inline volatile(LOCK_PREFIX "orq %1, %0"
 			: "+m" (v->counter)
 			: "er" (i)
 			: "memory");
@@ -146,7 +146,7 @@ static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
 
 static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
 {
-	asm volatile(LOCK_PREFIX "xorq %1,%0"
+	asm_inline volatile(LOCK_PREFIX "xorq %1, %0"
 			: "+m" (v->counter)
 			: "er" (i)
 			: "memory");
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index b96d45944c59..100413aff640 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -52,12 +52,12 @@ static __always_inline void
 arch_set_bit(long nr, volatile unsigned long *addr)
 {
 	if (__builtin_constant_p(nr)) {
-		asm volatile(LOCK_PREFIX "orb %b1,%0"
+		asm_inline volatile(LOCK_PREFIX "orb %b1,%0"
 			: CONST_MASK_ADDR(nr, addr)
 			: "iq" (CONST_MASK(nr))
 			: "memory");
 	} else {
-		asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
+		asm_inline volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
 			: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
 	}
 }
@@ -72,11 +72,11 @@ static __always_inline void
 arch_clear_bit(long nr, volatile unsigned long *addr)
 {
 	if (__builtin_constant_p(nr)) {
-		asm volatile(LOCK_PREFIX "andb %b1,%0"
+		asm_inline volatile(LOCK_PREFIX "andb %b1,%0"
 			: CONST_MASK_ADDR(nr, addr)
 			: "iq" (~CONST_MASK(nr)));
 	} else {
-		asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
+		asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
 			: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
 	}
 }
@@ -98,7 +98,7 @@ static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
 		volatile unsigned long *addr)
 {
 	bool negative;
-	asm volatile(LOCK_PREFIX "xorb %2,%1"
+	asm_inline volatile(LOCK_PREFIX "xorb %2,%1"
 		CC_SET(s)
 		: CC_OUT(s) (negative), WBYTE_ADDR(addr)
 		: "iq" ((char)mask) : "memory");
@@ -122,11 +122,11 @@ static __always_inline void
 arch_change_bit(long nr, volatile unsigned long *addr)
 {
 	if (__builtin_constant_p(nr)) {
-		asm volatile(LOCK_PREFIX "xorb %b1,%0"
+		asm_inline volatile(LOCK_PREFIX "xorb %b1,%0"
 			: CONST_MASK_ADDR(nr, addr)
 			: "iq" (CONST_MASK(nr)));
 	} else {
-		asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
+		asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
 			: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
 	}
 }
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index fd8afc1f5f6b..b61f32c3459f 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -44,22 +44,22 @@ extern void __add_wrong_size(void)
 	        __typeof__ (*(ptr)) __ret = (arg);			\
 		switch (sizeof(*(ptr))) {				\
 		case __X86_CASE_B:					\
-			asm volatile (lock #op "b %b0, %1\n"		\
+			asm_inline volatile (lock #op "b %b0, %1"	\
 				      : "+q" (__ret), "+m" (*(ptr))	\
 				      : : "memory", "cc");		\
 			break;						\
 		case __X86_CASE_W:					\
-			asm volatile (lock #op "w %w0, %1\n"		\
+			asm_inline volatile (lock #op "w %w0, %1"	\
 				      : "+r" (__ret), "+m" (*(ptr))	\
 				      : : "memory", "cc");		\
 			break;						\
 		case __X86_CASE_L:					\
-			asm volatile (lock #op "l %0, %1\n"		\
+			asm_inline volatile (lock #op "l %0, %1"	\
 				      : "+r" (__ret), "+m" (*(ptr))	\
 				      : : "memory", "cc");		\
 			break;						\
 		case __X86_CASE_Q:					\
-			asm volatile (lock #op "q %q0, %1\n"		\
+			asm_inline volatile (lock #op "q %q0, %1"	\
 				      : "+r" (__ret), "+m" (*(ptr))	\
 				      : : "memory", "cc");		\
 			break;						\
@@ -91,7 +91,7 @@ extern void __add_wrong_size(void)
 	case __X86_CASE_B:						\
 	{								\
 		volatile u8 *__ptr = (volatile u8 *)(ptr);		\
-		asm volatile(lock "cmpxchgb %2,%1"			\
+		asm_inline volatile(lock "cmpxchgb %2, %1"		\
 			     : "=a" (__ret), "+m" (*__ptr)		\
 			     : "q" (__new), "0" (__old)			\
 			     : "memory");				\
@@ -100,7 +100,7 @@ extern void __add_wrong_size(void)
 	case __X86_CASE_W:						\
 	{								\
 		volatile u16 *__ptr = (volatile u16 *)(ptr);		\
-		asm volatile(lock "cmpxchgw %2,%1"			\
+		asm_inline volatile(lock "cmpxchgw %2, %1"		\
 			     : "=a" (__ret), "+m" (*__ptr)		\
 			     : "r" (__new), "0" (__old)			\
 			     : "memory");				\
@@ -109,7 +109,7 @@ extern void __add_wrong_size(void)
 	case __X86_CASE_L:						\
 	{								\
 		volatile u32 *__ptr = (volatile u32 *)(ptr);		\
-		asm volatile(lock "cmpxchgl %2,%1"			\
+		asm_inline volatile(lock "cmpxchgl %2, %1"		\
 			     : "=a" (__ret), "+m" (*__ptr)		\
 			     : "r" (__new), "0" (__old)			\
 			     : "memory");				\
@@ -118,7 +118,7 @@ extern void __add_wrong_size(void)
 	case __X86_CASE_Q:						\
 	{								\
 		volatile u64 *__ptr = (volatile u64 *)(ptr);		\
-		asm volatile(lock "cmpxchgq %2,%1"			\
+		asm_inline volatile(lock "cmpxchgq %2, %1"		\
 			     : "=a" (__ret), "+m" (*__ptr)		\
 			     : "r" (__new), "0" (__old)			\
 			     : "memory");				\
@@ -165,7 +165,7 @@ extern void __add_wrong_size(void)
 	case __X86_CASE_B:						\
 	{								\
 		volatile u8 *__ptr = (volatile u8 *)(_ptr);		\
-		asm volatile(lock "cmpxchgb %[new], %[ptr]"		\
+		asm_inline volatile(lock "cmpxchgb %[new], %[ptr]"	\
 			     CC_SET(z)					\
 			     : CC_OUT(z) (success),			\
 			       [ptr] "+m" (*__ptr),			\
@@ -177,7 +177,7 @@ extern void __add_wrong_size(void)
 	case __X86_CASE_W:						\
 	{								\
 		volatile u16 *__ptr = (volatile u16 *)(_ptr);		\
-		asm volatile(lock "cmpxchgw %[new], %[ptr]"		\
+		asm_inline volatile(lock "cmpxchgw %[new], %[ptr]"	\
 			     CC_SET(z)					\
 			     : CC_OUT(z) (success),			\
 			       [ptr] "+m" (*__ptr),			\
@@ -189,7 +189,7 @@ extern void __add_wrong_size(void)
 	case __X86_CASE_L:						\
 	{								\
 		volatile u32 *__ptr = (volatile u32 *)(_ptr);		\
-		asm volatile(lock "cmpxchgl %[new], %[ptr]"		\
+		asm_inline volatile(lock "cmpxchgl %[new], %[ptr]"	\
 			     CC_SET(z)					\
 			     : CC_OUT(z) (success),			\
 			       [ptr] "+m" (*__ptr),			\
@@ -201,7 +201,7 @@ extern void __add_wrong_size(void)
 	case __X86_CASE_Q:						\
 	{								\
 		volatile u64 *__ptr = (volatile u64 *)(_ptr);		\
-		asm volatile(lock "cmpxchgq %[new], %[ptr]"		\
+		asm_inline volatile(lock "cmpxchgq %[new], %[ptr]"	\
 			     CC_SET(z)					\
 			     : CC_OUT(z) (success),			\
 			       [ptr] "+m" (*__ptr),			\
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 8806c646d452..3dd278593960 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -19,7 +19,7 @@ union __u64_halves {
 	union __u64_halves o = { .full = (_old), },			\
 			   n = { .full = (_new), };			\
 									\
-	asm volatile(_lock "cmpxchg8b %[ptr]"				\
+	asm_inline volatile(_lock "cmpxchg8b %[ptr]"			\
 		     : [ptr] "+m" (*(_ptr)),				\
 		       "+a" (o.low), "+d" (o.high)			\
 		     : "b" (n.low), "c" (n.high)			\
@@ -45,7 +45,7 @@ static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new
 			   n = { .full = (_new), };			\
 	bool ret;							\
 									\
-	asm volatile(_lock "cmpxchg8b %[ptr]"				\
+	asm_inline volatile(_lock "cmpxchg8b %[ptr]"			\
 		     CC_SET(e)						\
 		     : CC_OUT(e) (ret),					\
 		       [ptr] "+m" (*(_ptr)),				\
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 5e241306db26..71d1e72ed879 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -38,7 +38,7 @@ union __u128_halves {
 	union __u128_halves o = { .full = (_old), },			\
 			    n = { .full = (_new), };			\
 									\
-	asm volatile(_lock "cmpxchg16b %[ptr]"				\
+	asm_inline volatile(_lock "cmpxchg16b %[ptr]"			\
 		     : [ptr] "+m" (*(_ptr)),				\
 		       "+a" (o.low), "+d" (o.high)			\
 		     : "b" (n.low), "c" (n.high)			\
@@ -65,7 +65,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old,
 			    n = { .full = (_new), };			\
 	bool ret;							\
 									\
-	asm volatile(_lock "cmpxchg16b %[ptr]"				\
+	asm_inline volatile(_lock "cmpxchg16b %[ptr]"			\
 		     CC_SET(e)						\
 		     : CC_OUT(e) (ret),					\
 		       [ptr] "+m" (*(_ptr)),				\
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 363266cbcada..3821ee3fae35 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -29,7 +29,7 @@ cc_label:	c = true;						\
 #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...)			\
 ({									\
 	bool c;								\
-	asm volatile (fullop CC_SET(cc)					\
+	asm_inline volatile (fullop CC_SET(cc)				\
 			: [var] "+m" (_var), CC_OUT(cc) (c)		\
 			: __VA_ARGS__ : clobbers);			\
 	c;								\
-- 
2.48.1
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Ingo Molnar 9 months, 2 weeks ago
* Uros Bizjak <ubizjak@gmail.com> wrote:

> According to:
> 
>   https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html
> 
> the usage of asm pseudo directives in the asm template can confuse
> the compiler to wrongly estimate the size of the generated
> code.
> 
> The LOCK_PREFIX macro expands to several asm pseudo directives, so
> its usage in atomic locking insns causes instruction length estimate
> to fail significantly (the specially instrumented compiler reports
> the estimated length of these asm templates to be 6 instructions long).
> 
> This incorrect estimate further causes unoptimal inlining decisions,
> unoptimal instruction scheduling and unoptimal code block alignments
> for functions that use these locking primitives.
> 
> Use asm_inline instead:
> 
>   https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html
> 
> which is a feature that makes GCC pretend some inline assembler code
> is tiny (while it would think it is huge), instead of just asm.
> 
> For code size estimation, the size of the asm is then taken as
> the minimum size of one instruction, ignoring how many instructions
> compiler thinks it is.
> 
> The code size of the resulting x86_64 defconfig object file increases
> for 33.264 kbytes, representing 1.2% code size increase:
> 
>    text    data     bss     dec     hex filename
> 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
> 
> mainly due to different inlining decisions of -O2 build.

So my request here would be not more benchmark figures (I don't think 
it's a realistic expectation for contributors to be able to measure 
much of an effect with such a type of change, let alone be certain
that what a macro- or micro-benchmark measures is causally connected with
the patch), but I'd like to ask for some qualitative analysis on the
code generation side:

 - +1.2% code size increase is a lot, especially if it's under the 
   default build flags of the kernel. Where does the extra code come 
   from?

 - Is there any effect on Clang? Are its inlining decisions around 
   these asm() statements comparable, worse/better?

A couple of concrete examples would go a long way:

 - "Function XXX was inlined 3 times before the patch, and it was 
    inlined 30 times after the patch. I have reviewed two such inlining 
    locations, and they have added more code to unlikely or 
    failure-handling branches collected near the function epilogue, 
    while the fast-path of the function was more optimal."

Or you might end up finding:

 - "Function YYY was inlined 3x more frequently after the patch, but 
    the inlining decision increased register pressure and created less 
    optimal code in the fast-path, increasing both code size and likely 
    decreasing fast-path performance."

Obviously we'd be sad about the second case, but it's well within the 
spectrum of possibilities when we look at "+1.2% object code size 
increase".

What we cannot do is to throw up our hands and claim "-O2 trades 
performance for size, and thus this patch improves performance".
We don't know that for sure, and 30 years of kernel development
created a love-and-hate relationship and a fair level of distrust
between kernel developers and compiler inlining decisions,
especially around x86 asm() statements ...

So these are roughly the high level requirements around such patches.
Does this make sense?

Thanks,

	Ingo
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Thu, Mar 6, 2025 at 10:57 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > According to:
> >
> >   https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html
> >
> > the usage of asm pseudo directives in the asm template can confuse
> > the compiler to wrongly estimate the size of the generated
> > code.
> >
> > The LOCK_PREFIX macro expands to several asm pseudo directives, so
> > its usage in atomic locking insns causes instruction length estimate
> > to fail significantly (the specially instrumented compiler reports
> > the estimated length of these asm templates to be 6 instructions long).
> >
> > This incorrect estimate further causes unoptimal inlining decisions,
> > unoptimal instruction scheduling and unoptimal code block alignments
> > for functions that use these locking primitives.
> >
> > Use asm_inline instead:
> >
> >   https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html
> >
> > which is a feature that makes GCC pretend some inline assembler code
> > is tiny (while it would think it is huge), instead of just asm.
> >
> > For code size estimation, the size of the asm is then taken as
> > the minimum size of one instruction, ignoring how many instructions
> > compiler thinks it is.
> >
> > The code size of the resulting x86_64 defconfig object file increases
> > for 33.264 kbytes, representing 1.2% code size increase:
> >
> >    text    data     bss     dec     hex filename
> > 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> > 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
> >
> > mainly due to different inlining decisions of -O2 build.
>
> So my request here would be not more benchmark figures (I don't think
> it's a realistic expectation for contributors to be able to measure
> much of an effect with such a type of change, let alone be certain
> what a macro or micro-benchmark measures is causally connected with
> the patch), but I'd like to ask for some qualitative analysis on the
> code generation side:
>
>  - +1.2% code size increase is a lot, especially if it's under the
>    default build flags of the kernel. Where does the extra code come
>    from?
>
>  - Is there any effect on Clang? Are its inlining decisions around
>    these asm() statements comparable, worse/better?

FTR, clang recognizes "asm inline", but there was no difference in code sizes:

  text    data     bss     dec     hex filename
27577163        4503078  807732 32887973        1f5d4a5 vmlinux-clang-patched.o
27577181        4503078  807732 32887991        1f5d4b7 vmlinux-clang-unpatched.o

Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Thu, Mar 6, 2025 at 10:57 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > According to:
> >
> >   https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html
> >
> > the usage of asm pseudo directives in the asm template can confuse
> > the compiler to wrongly estimate the size of the generated
> > code.
> >
> > The LOCK_PREFIX macro expands to several asm pseudo directives, so
> > its usage in atomic locking insns causes instruction length estimate
> > to fail significantly (the specially instrumented compiler reports
> > the estimated length of these asm templates to be 6 instructions long).
> >
> > This incorrect estimate further causes unoptimal inlining decisions,
> > unoptimal instruction scheduling and unoptimal code block alignments
> > for functions that use these locking primitives.
> >
> > Use asm_inline instead:
> >
> >   https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html
> >
> > which is a feature that makes GCC pretend some inline assembler code
> > is tiny (while it would think it is huge), instead of just asm.
> >
> > For code size estimation, the size of the asm is then taken as
> > the minimum size of one instruction, ignoring how many instructions
> > compiler thinks it is.
> >
> > The code size of the resulting x86_64 defconfig object file increases
> > for 33.264 kbytes, representing 1.2% code size increase:
> >
> >    text    data     bss     dec     hex filename
> > 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> > 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
> >
> > mainly due to different inlining decisions of -O2 build.
>
> So my request here would be not more benchmark figures (I don't think
> it's a realistic expectation for contributors to be able to measure
> much of an effect with such a type of change, let alone be certain
> what a macro or micro-benchmark measures is causally connected with
> the patch), but I'd like to ask for some qualitative analysis on the
> code generation side:
>
>  - +1.2% code size increase is a lot, especially if it's under the
>    default build flags of the kernel. Where does the extra code come
>    from?
>
>  - Is there any effect on Clang? Are its inlining decisions around
>    these asm() statements comparable, worse/better?

Bah, apparently I can't calculate... the code increase is in fact
0.14%, but still 33k.

bloat-o-meter shows top grows (> 500 bytes):

add/remove: 82/283 grow/shrink: 870/372 up/down: 76272/-43618 (32654)
Function                                     old     new   delta
copy_process                                6465   10191   +3726
balance_dirty_pages_ratelimited_flags        237    2949   +2712
icl_plane_update_noarm                      5800    7969   +2169
samsung_input_mapping                       3375    5170   +1795
ext4_do_update_inode.isra                      -    1526   +1526
__schedule                                  2416    3472   +1056
__i915_vma_resource_unhold                     -     946    +946
sched_mm_cid_after_execve                    175    1097    +922
__do_sys_membarrier                            -     862    +862
filemap_fault                               2666    3462    +796
nl80211_send_wiphy                         11185   11874    +689
samsung_input_mapping.cold                   900    1500    +600
virtio_gpu_queue_fenced_ctrl_buffer          839    1410    +571
ilk_update_pipe_csc                         1201    1735    +534
enable_step                                    -     525    +525
icl_color_commit_noarm                      1334    1847    +513
tg3_read_bc_ver                                -     501    +501

and top shrinks (more than 500 bytes):

nl80211_send_iftype_data                     580       -    -580
samsung_gamepad_input_mapping.isra.cold      604       -    -604
virtio_gpu_queue_ctrl_sgs                    724       -    -724
tg3_get_invariants                          9218    8376    -842
__i915_vma_resource_unhold.part              899       -    -899
ext4_mark_iloc_dirty                        1735     106   -1629
samsung_gamepad_input_mapping.isra          2046       -   -2046
icl_program_input_csc                       2203       -   -2203
copy_mm                                     2242       -   -2242
balance_dirty_pages                         2657       -   -2657
Total: Before=22770320, After=22802974, chg +0.14%

> A couple of concrete examples would go a long way:
>
>  - "Function XXX was inlined 3 times before the patch, and it was
>     inlined 30 times after the patch. I have reviewed two such inlining
>     locations, and they have added more code to unlikely or
>     failure-handling branches collected near the function epilogue,
>     while the fast-path of the function was more optimal."
>
> Or you might end up finding:
>
>  - "Function YYY was inlined 3x more frequently after the patch, but
>     the inlining decision increased register pressure and created less
>     optimal code in the fast-path, increasing both code size and likely
>     decreasing fast-path performance."
>
> Obviously we'd be sad about the second case, but it's well within the
> spectrum of possibilities when we look at "+1.2% object code size
> increase".
>
> What we cannot do is to throw up our hands and claim "-O2 trades
> performance for size, and thus this patch improves performance".
> We don't know that for sure and 30 years of kernel development
> created a love-and-hate relationship and a fair level of distrust
> between kernel developers and compiler inlining decisions,
> especially around x86 asm() statements ...
>
> So these are roughly the high level requirements around such patches.
> Does this make sense?

In my opinion, the writing above makes perfect sense. As far as I'm
concerned, I'm able and can do the above code analysis, the
problematic part would be precise performance measurements. Although
with your instructions, I can also try that.

Thanks,
Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Ingo Molnar 9 months, 2 weeks ago
* Uros Bizjak <ubizjak@gmail.com> wrote:

> > So these are roughly the high level requirements around such patches.
> > Does this make sense?
> 
> In my opinion, the writing above makes perfect sense. As far as I'm 
> concerned, I'm able and can do the above code analysis, the 
> problematic part would be precise performance measurements. Although 
> with your instructions, I can also try that.

Yeah, so *personally* I find the kind of code generation analysis you 
routinely perform for your micro-optimization patches far more useful 
and persuasive, because it's basically a first principles argument: 
instructions removed are an inarguable positive in the overwhelming 
majority of cases, all other things being equal (as long as it doesn't come at 
the expense of more function calls or worse instructions, etc.).

For inlining decisions code generation analysis is arguably more 
complicated - but that's the nature of inlining related patches.

Performance measurements can back up such arguments, and being more 
proficient in perf tooling is a useful toolbox to have anyway, but it's 
fundamentally a stochastic argument for something as comparatively small 
as a +0.12% code size increase.

But if code generation analysis is inconclusive or even negative, then 
performance measurements can trump all of that, but it's a substantial 
barrier of entry as you noted - and I'm somewhat sceptical whether a 
0.12% code generation effect *can* even be measured reliably even with 
the best of expertise and infrastructure...

Also, to shorten build & test times you can use the x86-64 defconfig. 
It's a config more or less representative of what major distros enable, 
and it's even bootable on some systems and in VMs, but it builds in far 
less time.

Thanks,

	Ingo
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Ingo Molnar 9 months, 2 weeks ago
* Ingo Molnar <mingo@kernel.org> wrote:

> Also, to shorten build & test times you can use the x86-64 defconfig. 
> It's a config more or less representative of what major distros 
> enable, and it's even bootable on some systems and in VMs, but it 
> builds in far less time.

And if your primary test method is KVM+Qemu, then the following build 
method will give you a representative core kernel bzImage that will 
boot most cloud VM images as-is:

  $ make -j128 ARCH=x86 defconfig kvm_guest.config bzImage

  $ ll arch/x86/boot/bzImage 
    -rw-rw-r-- 1 mingo mingo 13788160 Mar  6 11:42 arch/x86/boot/bzImage

And you can boot up a .raw distro image in an xterm:

  $ RAW=your_cloud_image.raw
  $ PARTUUID=your_target_root_partition_UUID

  $ qemu-system-x86_64 \
   -enable-kvm \
   -smp cpus=4 \
   -m 2048 \
   -machine q35 \
   -cpu host \
   -global ICH9-LPC.disable_s3=1 \
   -net nic,model=virtio \
   -net user,hostfwd=tcp::8022-:22,hostfwd=tcp::8090-:80  \
   -drive "file=$RAW",if=none,format=raw,id=disk1 \
   -device virtio-blk-pci,drive=disk1,bootindex=1 \
   -serial mon:stdio \
   -nographic \
   -append "root=PARTUUID=$ID ro console=tty0 console=ttyS0,115200 earlyprintk=ttyS0,115200 consoleblank=0 ignore_loglevel" \
   -kernel arch/x86/boot/bzImage

This way you don't need any initrd build or modules nonsense - 
everything necessary is built in.

Bootable raw distro images can be found in numerous places, for example 
at:

  https://cloud.debian.org/images/cloud/bookworm-backports/

( And since I'm too lazy to figure it out the 'cloud way', I usually read the 
  root UUID from the bootlog of the first unsuccessful attempt. )

Thanks,

	Ingo
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Dave Hansen 9 months, 2 weeks ago
On 2/28/25 04:35, Uros Bizjak wrote:
> The code size of the resulting x86_64 defconfig object file increases
> for 33.264 kbytes, representing 1.2% code size increase:
> 
>    text    data     bss     dec     hex filename
> 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o

So, first of all, thank you for including some objective measurement of
the impact of your patches. It's much appreciated.

But I think the patches need to come with a solid theory of why they're
good. The minimum bar for that, I think, is *some* kind of actual
real-world performance test. I'm not picky. Just *something* that spends
a lot of time in the kernel and ideally where a profile points at some
of the code you're poking here.

I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.

But performance patches need to come with performance *numbers*.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by H. Peter Anvin 9 months, 1 week ago
On 2/28/25 08:48, Dave Hansen wrote:
> On 2/28/25 04:35, Uros Bizjak wrote:
>> The code size of the resulting x86_64 defconfig object file increases
>> for 33.264 kbytes, representing 1.2% code size increase:
>>
>>     text    data     bss     dec     hex filename
>> 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
>> 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
> 
> So, first of all, thank you for including some objective measurement of
> the impact if your patches. It's much appreciated.
> 
> But I think the patches need to come with a solid theory of why they're
> good. The minimum bar for that, I think, is *some* kind of actual
> real-world performance test. I'm not picky. Just *something* that spends
> a lot of time in the kernel and ideally where a profile points at some
> of the code you're poking here.
> 
> I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
> compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
> 
> But performance patches need to come with performance *numbers*.

Incidentally, this is exactly the reason why gcc added "asm inline" *at 
our request*. We just haven't caught up with it everywhere yet.

In fact, I would wonder if we shouldn't simply do:

#define asm __asm__ __inline__
#define asm_noinline __asm__

... in other words, to make asm inline an opt-out instead of an opt-in.
It is comparatively unusual that we do complex things in inline assembly 
that we would want gcc to treat as something that should be avoided.

	-hpa
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 1 week ago
On Sat, Mar 8, 2025 at 8:08 PM H. Peter Anvin <hpa@zytor.com> wrote:
>
> On 2/28/25 08:48, Dave Hansen wrote:
> > On 2/28/25 04:35, Uros Bizjak wrote:
> >> The code size of the resulting x86_64 defconfig object file increases
> >> for 33.264 kbytes, representing 1.2% code size increase:
> >>
> >>     text    data     bss     dec     hex filename
> >> 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> >> 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
> >
> > So, first of all, thank you for including some objective measurement of
> > the impact if your patches. It's much appreciated.
> >
> > But I think the patches need to come with a solid theory of why they're
> > good. The minimum bar for that, I think, is *some* kind of actual
> > real-world performance test. I'm not picky. Just *something* that spends
> > a lot of time in the kernel and ideally where a profile points at some
> > of the code you're poking here.
> >
> > I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
> > compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
> >
> > But performance patches need to come with performance *numbers*.
>
> Incidentally, this is exactly the reason why gcc added "asm inline" *at
> our request*. We just haven't caught up with it everywhere yet.
>
> In fact, I would wonder if we shouldn't simply do:
>
> #define asm __asm__ __inline__
> #define asm_noinline __asm__
>
> ... in other words, to make asm inline an opt-out instead of an opt-in.
> It is comparatively unusual that we do complex things in inline assembly
> that we would want gcc to treat as something that should be avoided.

I don't think we need such radical changes. There are only a few
groups of instructions, nicely hidden behind macros, that need asm
noinline. Alternatives (gcc counted them as 20 - 23 instructions) are
already using asm inline (please see
arch/x86/include/asm/alternative.h) in their high-level macros, and my
proposed patch converts all asm using LOCK_PREFIX by amending macros
in 7 header files.
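
For reference, the high-level alternative() wrapper in alternative.h
already uses asm inline, roughly like this (a sketch along the lines of
that header, not an exact quote):

  #define alternative(oldinstr, newinstr, ft_flags) \
	asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) \
			    : : : "memory")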

Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by David Laight 9 months, 1 week ago
On Sun, 9 Mar 2025 08:50:08 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:

> On Sat, Mar 8, 2025 at 8:08 PM H. Peter Anvin <hpa@zytor.com> wrote:
...
> > In fact, I would wonder if we shouldn't simply do:
> >
> > #define asm __asm__ __inline__
> > #define asm_noinline __asm__
> >
> > ... in other words, to make asm inline an opt-out instead of an opt-in.

The asm statements themselves get inlined (typically they are in an
always_inline wrapper); the size affects whether the calling code is inlined.
You are now in the 'games' of I$ fetches, overall code size and TLB lookups
(not helped by the speculative execution mitigations for 'ret').

> > It is comparatively unusual that we do complex things in inline assembly
> > that we would want gcc to treat as something that should be avoided.
> 
> I don't think we need such radical changes. There are only a few
> groups of instructions, nicely hidden behind macros, that need asm
> noinline. Alternatives (gcc counted them as 20 - 23 instructions) are
> already using asm inline (please see
> arch/x86/include/asm/alternative.h) in their high-level macros, and my
> proposed patch converts all asm using LOCK_PREFIX by amending macros
> in 7 header files.

The other ones that are likely to get mis-sized are the ones that change
the section to add annotations.
Maybe keeping the size overestimate is better for those, in order to
reduce the number of annotations?

	David
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 1 week ago
On Sun, Mar 9, 2025 at 10:46 AM David Laight
<david.laight.linux@gmail.com> wrote:
>
> On Sun, 9 Mar 2025 08:50:08 +0100
> Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > On Sat, Mar 8, 2025 at 8:08 PM H. Peter Anvin <hpa@zytor.com> wrote:
> ...
> > > In fact, I would wonder if we shouldn't simply do:
> > >
> > > #define asm __asm__ __inline__
> > > #define asm_noinline __asm__
> > >
> > > ... in other words, to make asm inline an opt-out instead of an opt-in.
>
> The asm statements themselves get inlined (typically they are in an
> always_inline wrapper), the size affects whether the calling code is inlined.
> You are now in the 'games' of I$ fetches, overall code size and TLB lookups
> (not helped by the speculative execution mitigations for 'ret').
>
> > > It is comparatively unusual that we do complex things in inline assembly
> > > that we would want gcc to treat as something that should be avoided.
> >
> > I don't think we need such radical changes. There are only a few
> > groups of instructions, nicely hidden behind macros, that need asm
> > noinline. Alternatives (gcc counted them as 20 - 23 instructions) are
> > already using asm inline (please see
> > arch/x86/include/asm/alternative.h) in their high-level macros, and my
> > proposed patch converts all asm using LOCK_PREFIX by amending macros
> > in 7 header files.
>
> The other ones that are likely to get mis-sized are the ones that change
> the section to add annotations.
> The size overestimate may be better in order to reduce the number of
> annotations?

Yes, this is why I think it is better to use asm inline on a
case-by-case basis. While ALTERNATIVE and LOCK_PREFIX_HERE instructions result in one insn
(and can be trivially amended with asm inline to instruct the compiler
about this fact), this may not always be the case.

Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Fri, Feb 28, 2025 at 5:48 PM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 2/28/25 04:35, Uros Bizjak wrote:
> > The code size of the resulting x86_64 defconfig object file increases
> > for 33.264 kbytes, representing 1.2% code size increase:
> >
> >    text    data     bss     dec     hex filename
> > 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> > 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
>
> So, first of all, thank you for including some objective measurement of
> the impact if your patches. It's much appreciated.
>
> But I think the patches need to come with a solid theory of why they're
> good. The minimum bar for that, I think, is *some* kind of actual
> real-world performance test. I'm not picky. Just *something* that spends
> a lot of time in the kernel and ideally where a profile points at some
> of the code you're poking here.
>
> I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
> compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
>
> But performance patches need to come with performance *numbers*.

Please find lmbench results from unpatched (fedora.0) and patched
(fedora.1) fedora-41 6.13.5 kernels.

lmbench is from [1]

[1] https://fedora.pkgs.org/41/rpm-sphere-x86_64/lmbench-3.0-0.a9.3.x86_64.rpm.html

Some tests show quite different results, but I'd appreciate some help
in interpreting the results. Maybe they show that the price of 33
kbytes is worth the improvement, or they will motivate someone
experienced in kernel benchmarks to benchmark the patch in a more
scientific way.

Thanks,
Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Sun, Mar 2, 2025 at 9:56 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Feb 28, 2025 at 5:48 PM Dave Hansen <dave.hansen@intel.com> wrote:
> >
> > On 2/28/25 04:35, Uros Bizjak wrote:
> > > The code size of the resulting x86_64 defconfig object file increases
> > > for 33.264 kbytes, representing 1.2% code size increase:
> > >
> > >    text    data     bss     dec     hex filename
> > > 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> > > 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
> >
> > So, first of all, thank you for including some objective measurement of
> > the impact if your patches. It's much appreciated.
> >
> > But I think the patches need to come with a solid theory of why they're
> > good. The minimum bar for that, I think, is *some* kind of actual
> > real-world performance test. I'm not picky. Just *something* that spends
> > a lot of time in the kernel and ideally where a profile points at some
> > of the code you're poking here.
> >
> > I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
> > compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
> >
> > But performance patches need to come with performance *numbers*.
>
> Please find lmbench results from unpatched (fedora.0) and patched
> (fedora.1) fedora-41 6.13.5 kernels.
>
> lmbench is from [1]
>
> [1] https://fedora.pkgs.org/41/rpm-sphere-x86_64/lmbench-3.0-0.a9.3.x86_64.rpm.html
>
> Some tests show quite different results, but I'd appreciate some help
> in interpreting the results. Maybe they show that the price of 33
> kbytes is worth the improvement, or they will motivate someone
> experienced in kernel benchmarks to benchmark the patch in a more
> scientific way.

These go from:

Process fork+exit: 270.0952 microseconds
Process fork+execve: 2620.3333 microseconds
Process fork+/bin/sh -c: 6781.0000 microseconds
File /usr/tmp/XXX write bandwidth: 1780350 KB/sec
Pagefaults on /usr/tmp/XXX: 0.3875 microseconds

to:

Process fork+exit: 298.6842 microseconds
Process fork+execve: 1662.7500 microseconds
Process fork+/bin/sh -c: 2127.6667 microseconds
File /usr/tmp/XXX write bandwidth: 1950077 KB/sec
Pagefaults on /usr/tmp/XXX: 0.1958 microseconds

and from:

Socket bandwidth using localhost
0.000001 2.52 MB/sec
0.000064 163.02 MB/sec
0.000128 321.70 MB/sec
0.000256 630.06 MB/sec
0.000512 1207.07 MB/sec
0.001024 2004.06 MB/sec
0.001437 2475.43 MB/sec
10.000000 5817.34 MB/sec

Avg xfer: 3.2KB, 41.8KB in 1.2230 millisecs, 34.15 MB/sec
AF_UNIX sock stream bandwidth: 9850.01 MB/sec
Pipe bandwidth: 4631.28 MB/sec

to:

Socket bandwidth using localhost
0.000001 3.13 MB/sec
0.000064 187.08 MB/sec
0.000128 324.12 MB/sec
0.000256 618.51 MB/sec
0.000512 1137.13 MB/sec
0.001024 1962.95 MB/sec
0.001437 2458.27 MB/sec
10.000000 6168.08 MB/sec

Avg xfer: 3.2KB, 41.8KB in 1.0060 millisecs, 41.52 MB/sec
AF_UNIX sock stream bandwidth: 9921.68 MB/sec
Pipe bandwidth: 4649.96 MB/sec

Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Fri, Feb 28, 2025 at 5:48 PM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 2/28/25 04:35, Uros Bizjak wrote:
> > The code size of the resulting x86_64 defconfig object file increases
> > for 33.264 kbytes, representing 1.2% code size increase:
> >
> >    text    data     bss     dec     hex filename
> > 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> > 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
>
> So, first of all, thank you for including some objective measurement of
> the impact if your patches. It's much appreciated.
>
> But I think the patches need to come with a solid theory of why they're
> good. The minimum bar for that, I think, is *some* kind of actual
> real-world performance test. I'm not picky. Just *something* that spends
> a lot of time in the kernel and ideally where a profile points at some
> of the code you're poking here.
>
> I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
> compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
>
> But performance patches need to come with performance *numbers*.

I don't consider this patch a performance patch; it is more of a patch
that fixes a correctness issue. The compiler wrongly estimates the
number of instructions in the asm template, so the patch instructs the
compiler that everything in the template in fact results in a single
instruction, no matter the pseudo directives there. The correct estimation then
allows the compiler to do its job better (e.g. better scheduling,
better inlining decisions, etc...).

The metric of code size is excellent for an -Os compile, but not so good
for an -O2 compile, and the measured results mirror that. In the -O2 case,
we actually asked the compiler to prioritize performance,
not code size, so the code size measurements are only of limited
relevance. The purpose of these measurements is to show that the
effect of the patch is limited to the expected 1% code size
difference.

I don't expect some noticeable performance changes from the
non-algorithmic patch like this. TBH, I would be surprised if they
were outside the measurement noise. Nevertheless, I'll try to provide
some performance numbers.

Thanks,
Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Dave Hansen 9 months, 2 weeks ago
On 2/28/25 14:31, Uros Bizjak wrote:
> On Fri, Feb 28, 2025 at 5:48 PM Dave Hansen <dave.hansen@intel.com> wrote:
>>
>> On 2/28/25 04:35, Uros Bizjak wrote:
>>> The code size of the resulting x86_64 defconfig object file increases
>>> for 33.264 kbytes, representing 1.2% code size increase:
>>>
>>>    text    data     bss     dec     hex filename
>>> 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
>>> 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
>>
>> So, first of all, thank you for including some objective measurement of
>> the impact if your patches. It's much appreciated.
>>
>> But I think the patches need to come with a solid theory of why they're
>> good. The minimum bar for that, I think, is *some* kind of actual
>> real-world performance test. I'm not picky. Just *something* that spends
>> a lot of time in the kernel and ideally where a profile points at some
>> of the code you're poking here.
>>
>> I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
>> compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
>>
>> But performance patches need to come with performance *numbers*.
> 
> I don't consider this patch a performance patch, it is more a patch
> that fixes a correctness issue. The compiler estimates the number of
> instructions in the asm template wrong, so the patch instructs the
> compiler that everything in the template in fact results in a single
> instruction, no matter the pseudos there. The correct estimation then
> allows the compiler to do its job better (e.g. better scheduling,
> better inlining decisions, etc...).

Why does it matter if the compiler does its job better?

I'll let the other folks who maintain this code chime in if they think
I'm off my rocker. But, *I* consider this -- and all of these, frankly
-- performance patches.

> I don't expect some noticeable performance changes from the
> non-algorithmic patch like this. TBH, I would be surprised if they
> were outside the measurement noise. Nevertheless, I'll try to provide
> some performance numbers.

That would be appreciated, thanks!
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by David Laight 9 months, 2 weeks ago
On Fri, 28 Feb 2025 14:58:47 -0800
Dave Hansen <dave.hansen@intel.com> wrote:

> On 2/28/25 14:31, Uros Bizjak wrote:
> > On Fri, Feb 28, 2025 at 5:48 PM Dave Hansen <dave.hansen@intel.com> wrote:  
> >>
> >> On 2/28/25 04:35, Uros Bizjak wrote:  
> >>> The code size of the resulting x86_64 defconfig object file increases
> >>> for 33.264 kbytes, representing 1.2% code size increase:
> >>>
> >>>    text    data     bss     dec     hex filename
> >>> 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> >>> 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o  
> >>
> >> So, first of all, thank you for including some objective measurement of
> >> the impact if your patches. It's much appreciated.
> >>
> >> But I think the patches need to come with a solid theory of why they're
> >> good. The minimum bar for that, I think, is *some* kind of actual
> >> real-world performance test. I'm not picky. Just *something* that spends
> >> a lot of time in the kernel and ideally where a profile points at some
> >> of the code you're poking here.
> >>
> >> I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
> >> compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
> >>
> >> But performance patches need to come with performance *numbers*.  
> > 
> > I don't consider this patch a performance patch, it is more a patch
> > that fixes a correctness issue. The compiler estimates the number of
> > instructions in the asm template wrong, so the patch instructs the
> > compiler that everything in the template in fact results in a single
> > instruction, no matter the pseudos there. The correct estimation then
> > allows the compiler to do its job better (e.g. better scheduling,
> > better inlining decisions, etc...).  
> 
> Why does it matter if the compiler does its job better?
> 
> I'll let the other folks who maintain this code chime in if they think
> I'm off my rocker. But, *I* consider this -- and all of these, frankly
> -- performance patches.

I was looking at some size changes related to a different 'trivial'
code change.
It caused gcc to make apparently unrelated inlining decisions that caused
some functions to grow/shrink by +/-100+ bytes even though the actual
change would mostly only add/remove a single instruction.

I've lost the patch for this one, but if the asm block does expand to a
single instruction it is likely to make gcc decide to inline one of the
functions that uses it - so increasing overall code size.
Whether that helps or hinders performance is difficult to say.

	David
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Fri, Feb 28, 2025 at 11:58 PM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 2/28/25 14:31, Uros Bizjak wrote:
> > On Fri, Feb 28, 2025 at 5:48 PM Dave Hansen <dave.hansen@intel.com> wrote:
> >>
> >> On 2/28/25 04:35, Uros Bizjak wrote:
> >>> The code size of the resulting x86_64 defconfig object file increases
> >>> for 33.264 kbytes, representing 1.2% code size increase:
> >>>
> >>>    text    data     bss     dec     hex filename
> >>> 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> >>> 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
> >>
> >> So, first of all, thank you for including some objective measurement of
> >> the impact if your patches. It's much appreciated.
> >>
> >> But I think the patches need to come with a solid theory of why they're
> >> good. The minimum bar for that, I think, is *some* kind of actual
> >> real-world performance test. I'm not picky. Just *something* that spends
> >> a lot of time in the kernel and ideally where a profile points at some
> >> of the code you're poking here.
> >>
> >> I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
> >> compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
> >>
> >> But performance patches need to come with performance *numbers*.
> >
> > I don't consider this patch a performance patch, it is more a patch
> > that fixes a correctness issue. The compiler estimates the number of
> > instructions in the asm template wrong, so the patch instructs the
> > compiler that everything in the template in fact results in a single
> > instruction, no matter the pseudos there. The correct estimation then
> > allows the compiler to do its job better (e.g. better scheduling,
> > better inlining decisions, etc...).
>
> Why does it matter if the compiler does its job better?

Please read the long thread [1], especially part [1.1], which was the
reason for gcc to implement asm inline [2].

[1] https://lore.kernel.org/lkml/20181003213100.189959-1-namit@vmware.com/
[1.1] https://lore.kernel.org/lkml/20181007091805.GA30687@zn.tnic/
[2] https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html

Accurate inlining decisions are just one of the compiler optimizations
that depend on the code growth factor; tail duplication [3] is another
one, and there are also code hoisting, function cloning, block
reordering and basic block copying, to name a few off the top of my head.

[3] https://gcc.gnu.org/projects/sched-treegion.html

These all work better with accurate input data. These optimizations
are also the reason for 1% code growth with -O2: additional code
blocks now fall under the code size threshold that enables the
mentioned optimizations, under the assumption of -O2 code
size/performance tradeoffs. OTOH, -Os, where different code
size/performance heuristics are used, now performs better w.r.t code
size.

Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Borislav Petkov 9 months, 2 weeks ago
On Sat, Mar 01, 2025 at 10:05:56AM +0100, Uros Bizjak wrote:
> OTOH, -Os, where different code size/performance heuristics are used, now
> performs better w.r.t code size.

Did anything change since:

281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
3a55fb0d9fe8 ("Tell the world we gave up on pushing CC_OPTIMIZE_FOR_SIZE")

wrt -Os?

Because if not, we still don't love -Os and you can drop the -Os argument.

And without any perf data showing any improvement, this patch does nothing but
enlarge -O2 size...

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Sat, Mar 1, 2025 at 1:38 PM Borislav Petkov <bp@alien8.de> wrote:
>
> On Sat, Mar 01, 2025 at 10:05:56AM +0100, Uros Bizjak wrote:
> > OTOH, -Os, where different code size/performance heuristics are used, now
> > performs better w.r.t code size.
>
> Did anything change since:
>
> 281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
> 3a55fb0d9fe8 ("Tell the world we gave up on pushing CC_OPTIMIZE_FOR_SIZE")
>
> wrt -Os?
>
> Because if not, we still don't love -Os and you can drop the -Os argument.

The -Os argument was to show the effect of the patch when the compiler
is instructed to take care of the overall size. Giving the compiler
-O2 and then looking at the overall size of the produced binary is
just wrong.

> And without any perf data showing any improvement, this patch does nothing but
> enlarge -O2 size...

Even to my surprise, the patch has some noticeable effects on the
performance, please see the attachment in [1] for LMBench data or [2]
for some excerpts from the data. So, I think the patch has potential
to improve the performance.

[1] https://lore.kernel.org/lkml/CAFULd4YBcG45bigHBox2pu+To+Y5BzbRxG+pUr42AVOWSnfKsg@mail.gmail.com/
[2] https://lore.kernel.org/lkml/CAFULd4ZsSKwJ4Dz3cCAgaVsa4ypbb0e2savO-3_Ltbs=1wzgKQ@mail.gmail.com/

Thanks,
Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Borislav Petkov 9 months, 2 weeks ago
On Wed, Mar 05, 2025 at 09:54:11AM +0100, Uros Bizjak wrote:
> The -Os argument was to show the effect of the patch when the compiler
> is instructed to take care of the overall size. Giving the compiler
> -O2 and then looking at the overall size of the produced binary is
> just wrong.

No one cares about -Os AFAICT. It might as well be non-existent. So the effect
doesn't matter.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Peter Zijlstra 9 months, 2 weeks ago
On Wed, Mar 05, 2025 at 09:36:33PM +0100, Borislav Petkov wrote:
> On Wed, Mar 05, 2025 at 09:54:11AM +0100, Uros Bizjak wrote:
> > The -Os argument was to show the effect of the patch when the compiler
> > is instructed to take care of the overall size. Giving the compiler
> > -O2 and then looking at the overall size of the produced binary is
> > just wrong.
> 
> No one cares about -Os AFAICT. It might as well be non-existent. So the effect
> doesn't matter.

Well, more people would care if it didn't stand for -Ostupid I suppose.
That is, traditionally GCC made some very questionable choices with -Os,
quite horrendous code-gen.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Wed, Mar 5, 2025 at 10:26 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Mar 05, 2025 at 09:36:33PM +0100, Borislav Petkov wrote:
> > On Wed, Mar 05, 2025 at 09:54:11AM +0100, Uros Bizjak wrote:
> > > The -Os argument was to show the effect of the patch when the compiler
> > > is instructed to take care of the overall size. Giving the compiler
> > > -O2 and then looking at the overall size of the produced binary is
> > > just wrong.
> >
> > No one cares about -Os AFAICT. It might as well be non-existent. So the effect
> > doesn't matter.
>
> Well, more people would care if it didn't stand for -Ostupid I suppose.
> That is, traditionally GCC made some very questionable choices with -Os,
> quite horrendous code-gen.

Size optimizations result in a 15% code size reduction (x86_64
defconfig, gcc-14.2), so they reflect what the user wanted:

  text    data     bss     dec     hex filename
27478996        4635807  814660 32929463        1f676b7 vmlinux-O2.o
23859143        4617419  814724 29291286        1bef316 vmlinux-Os.o

The compiler heuristics depend on tradeoffs, and -Os uses different
tradeoffs than -O2. Unfortunately, there is no
-Os-but-I-really-want-performance switch, but OTOH, tradeoffs can be
adjusted. The compiler is open-source, and these adjustments can be
discussed in public spaces (mailing lists and bugzilla) and eventually
re-tuned. We are aware that the world around us changes, so tunings
are not set in stone, but we also depend on user feedback.

Thanks,
Uros.
kernel: Current status of CONFIG_CC_OPTIMIZE_FOR_SIZE=y (was: Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns)
Posted by Ingo Molnar 9 months, 2 weeks ago
* Uros Bizjak <ubizjak@gmail.com> wrote:

> On Wed, Mar 5, 2025 at 10:26 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Mar 05, 2025 at 09:36:33PM +0100, Borislav Petkov wrote:
> > > On Wed, Mar 05, 2025 at 09:54:11AM +0100, Uros Bizjak wrote:
> > > > The -Os argument was to show the effect of the patch when the compiler
> > > > is instructed to take care of the overall size. Giving the compiler
> > > > -O2 and then looking at the overall size of the produced binary is
> > > > just wrong.
> > >
> > > No one cares about -Os AFAICT. It might as well be non-existent. So the effect
> > > doesn't matter.
> >
> > Well, more people would care if it didn't stand for -Ostupid I suppose.
> > That is, traditionally GCC made some very questionable choices with -Os,
> > quite horrendous code-gen.
> 
> Size optimizations result in a 15% code size reduction (x86_64
> defconfig, gcc-14.2), so they reflect what the user wanted:
> 
>   text    data     bss     dec     hex filename
> 27478996        4635807  814660 32929463        1f676b7 vmlinux-O2.o
> 23859143        4617419  814724 29291286        1bef316 vmlinux-Os.o
> 
> The compiler heuristics depend on tradeoffs, and -Os uses different
> tradeoffs than -O2. Unfortunately, there is no
> -Os-but-I-really-want-performance switch, but OTOH, tradeoffs can be
> adjusted. The compiler is open-source, and these adjustments can be
> discussed in public spaces (mailing lists and bugzilla) and eventually
> re-tuned. We are aware that the world around us changes, so tunings
> are not set in stone, but we also depend on user feedback.

So the best way to drive -Os forward is not to insist that it's good 
(it might still be crap), and not to insist that it's crap (it might 
have become better), but to dig out old problems and to look at what 
kind of code current compilers generate in the kernel with -Os.

There's been a few pathological GCC optimizations in the past, but also 
other problems, such as this one 9 years ago that hid useful warnings:

  =================>
  877417e6ffb9 Kbuild: change CC_OPTIMIZE_FOR_SIZE definition
  =================>

  From: Arnd Bergmann <arnd@arndb.de>
  Date: Mon, 25 Apr 2016 17:35:27 +0200
  Subject: [PATCH] Kbuild: change CC_OPTIMIZE_FOR_SIZE definition

    CC_OPTIMIZE_FOR_SIZE disables the often useful -Wmaybe-unused warning,
    because that causes a ridiculous amount of false positives when combined
    with -Os. 
      
    This means a lot of warnings don't show up in testing by the developers
    that should see them with an 'allmodconfig' kernel that has
    CC_OPTIMIZE_FOR_SIZE enabled, but only later in randconfig builds
    that don't.

And this one by Linus, 14 years ago:

  =================>
  281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
  =================>

  From: Linus Torvalds <torvalds@linux-foundation.org>
  Date: Sun, 22 May 2011 14:30:36 -0700
  Subject: [PATCH] Give up on pushing CC_OPTIMIZE_FOR_SIZE

    I still happen to believe that I$ miss costs are a major thing, but
    sadly, -Os doesn't seem to be the solution.  With or without it, gcc
    will miss some obvious code size improvements, and with it enabled gcc
    will sometimes make choices that aren't good even with high I$ miss
    ratios.

    For example, with -Os, gcc on x86 will turn a 20-byte constant memcpy
    into a "rep movsl".  While I sincerely hope that x86 CPU's will some day
    do a good job at that, they certainly don't do it yet, and the cost is
    higher than a L1 I$ miss would be.

    Some day I hope we can re-enable this.

    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

I'm quite sure there were more, but these were the ones that popped up
in a quick Git archeology.

And yeah, it was me who pushed for -Os originally 17 years ago, due to 
the positive I$ impact, in theory:

  =================>
  96fffeb4b413 ("make CC_OPTIMIZE_FOR_SIZE non-experimental")
  =================>

  From: Ingo Molnar <mingo@elte.hu>
  Date: Mon, 28 Apr 2008 01:39:43 +0200
  Subject: [PATCH] make CC_OPTIMIZE_FOR_SIZE non-experimental

    this option has been the default on a wide range of distributions
    for a long time - time to make it non-experimental.

    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

But practice disagreed with theory, and obviously in the kernel 
practice has supremacy.

But yes, I'd cautiously agree that reduced kernel size with a -Os build 
is a stochastic proxy metric for better code and better performance - 
but it comes with caveats and needs to be backed by other data or 
robust first principles arguments too.

Thanks,

	Ingo
Re: kernel: Current status of CONFIG_CC_OPTIMIZE_FOR_SIZE=y (was: Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns)
Posted by David Laight 9 months, 2 weeks ago
On Thu, 6 Mar 2025 10:43:26 +0100
Ingo Molnar <mingo@kernel.org> wrote:

> * Uros Bizjak <ubizjak@gmail.com> wrote:
...
> And this one by Linus, 14 years ago:
> 
>   =================>  
>   281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
>   =================>  
> 
>   From: Linus Torvalds <torvalds@linux-foundation.org>
>   Date: Sun, 22 May 2011 14:30:36 -0700
>   Subject: [PATCH] Give up on pushing CC_OPTIMIZE_FOR_SIZE
> 
>     I still happen to believe that I$ miss costs are a major thing, but
>     sadly, -Os doesn't seem to be the solution.  With or without it, gcc
>     will miss some obvious code size improvements, and with it enabled gcc
>     will sometimes make choices that aren't good even with high I$ miss
>     ratios.
> 
>     For example, with -Os, gcc on x86 will turn a 20-byte constant memcpy
>     into a "rep movsl".  While I sincerely hope that x86 CPU's will some day
>     do a good job at that, they certainly don't do it yet, and the cost is
>     higher than a L1 I$ miss would be.

Well 'rep movsb' is a lot better than it was then.
Even on Sandy bridge (IIRC) it is ~20 clocks for short transfers (of any length).
Unlike the P4 with a 140 clock overhead!
Still slower for short fixed sizes, but probably good for anything variable
because of the costs of the function call and the conditionals to select the
'best' algorithm.
OTOH if you know it is only a few bytes a code loop may be best - and gcc will
convert it to a memcpy() call for you!

The really silly one was 'push immd_byte; pop reg' to get a sign-extended value.

But I do remember -O2 being smaller than -Oz!
Just changing the inlining thresholds and code replication on loops
(and never unrolling loops) would probably be a good start.

	David
Re: kernel: Current status of CONFIG_CC_OPTIMIZE_FOR_SIZE=y (was: Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns)
Posted by Arnd Bergmann 9 months, 2 weeks ago
On Thu, Mar 6, 2025, at 10:43, Ingo Molnar wrote:
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>> On Wed, Mar 5, 2025 at 10:26 PM Peter Zijlstra <peterz@infradead.org> wrote:
> So the best way to drive -Os forward is not to insist that it's good 
> (it might still be crap), and not to insist that it's crap (it might 
> have become better), but to dig out old problems and to look at what 
> kind of code current compilers generate in the kernel with -Os.
>
> There's been a few pathological GCC optimizations in the past, but also 
> other problems, such as this one 9 years ago that hid useful warnings:
>
>   =================>
>   877417e6ffb9 Kbuild: change CC_OPTIMIZE_FOR_SIZE definition
>   =================>
>
>   From: Arnd Bergmann <arnd@arndb.de>
>   Date: Mon, 25 Apr 2016 17:35:27 +0200
>   Subject: [PATCH] Kbuild: change CC_OPTIMIZE_FOR_SIZE definition
>
>     CC_OPTIMIZE_FOR_SIZE disables the often useful -Wmaybe-unused warning,
>     because that causes a ridiculous amount of false positives when combined
>     with -Os. 

I think that should have said '-Wmaybe-uninitialized', which is a thing
of the past, since (IIRC) gcc-9 changed the default inlining rules and
Linus turned it off globally after the false positives started happening
with -O2 as well.
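
A hedged illustration of the class of false positive in question
(hypothetical code, not from the kernel; whether GCC warns here shifts
with optimization level, inlining decisions and GCC release):

/* Helper reports failure without touching *val. */
static int get_val(int flag, int *val)
{
	if (!flag)
		return -1;
	*val = 42;
	return 0;
}

int consume(int flag)
{
	int val;

	if (get_val(flag, &val))
		return 0;	/* the error path never reads val */
	return val;		/* may still trigger -Wmaybe-uninitialized */
}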

This is rather annoying of course, since we miss a lot of warnings
for real bugs. At least clang's -Wsometimes-uninitialized and smatch
still catch the important ones in the CI systems, but we keep seeing
new ones in linux-next and have to patch them.

> But yes, I'd cautiously agree that reduced kernel size with a -Os build 
> is a stochastic proxy metric for better code and better performance - 
> but it comes with caveats and needs to be backed by other data or 
> robust first principles arguments too.

My impression is that for the most part, the -Os and -O2 options do
exactly what they promise, picking either a smaller or faster object
code. Clearly there is no single ideal choice because if some
optimization step is a win for both performance and size it would
be turned on for both.

The crazy options (-Oz and -O3) on the other hand are intentionally
not offered by Kconfig.

      Arnd
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Ingo Molnar 9 months, 2 weeks ago
* Uros Bizjak <ubizjak@gmail.com> wrote:

> On Sat, Mar 1, 2025 at 1:38 PM Borislav Petkov <bp@alien8.de> wrote:
> >
> > On Sat, Mar 01, 2025 at 10:05:56AM +0100, Uros Bizjak wrote:
> > > OTOH, -Os, where different code size/performance heuristics are used, now
> > > performs better w.r.t code size.
> >
> > Did anything change since:
> >
> > 281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
> > 3a55fb0d9fe8 ("Tell the world we gave up on pushing CC_OPTIMIZE_FOR_SIZE")
> >
> > wrt -Os?
> >
> > Because if not, we still don't love -Os and you can drop the -Os argument.
> 
> The -Os argument was to show the effect of the patch when the compiler
> is instructed to take care of the overall size. Giving the compiler
> -O2 and then looking at the overall size of the produced binary is
> just wrong.
> 
> > And without any perf data showing any improvement, this patch does nothing but
> > enlarge -O2 size...
> 
> Even to my surprise, the patch has some noticeable effects on the
> performance, please see the attachment in [1] for LMBench data or [2]
> for some excerpts from the data. So, I think the patch has potential
> to improve the performance.
> 
> [1] https://lore.kernel.org/lkml/CAFULd4YBcG45bigHBox2pu+To+Y5BzbRxG+pUr42AVOWSnfKsg@mail.gmail.com/
> [2] https://lore.kernel.org/lkml/CAFULd4ZsSKwJ4Dz3cCAgaVsa4ypbb0e2savO-3_Ltbs=1wzgKQ@mail.gmail.com/

If you are measuring micro-costs, please make sure you pin the workload 
to a single CPU (via 'taskset' for example) and run 'perf stat --null 
--repeat 5' or so to measure the run-over-run noise of the benchmark.

Thanks,

	Ingo
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Ingo Molnar 9 months, 2 weeks ago
* Ingo Molnar <mingo@kernel.org> wrote:

> 
> * Uros Bizjak <ubizjak@gmail.com> wrote:
> 
> > On Sat, Mar 1, 2025 at 1:38 PM Borislav Petkov <bp@alien8.de> wrote:
> > >
> > > On Sat, Mar 01, 2025 at 10:05:56AM +0100, Uros Bizjak wrote:
> > > > OTOH, -Os, where different code size/performance heuristics are used, now
> > > > performs better w.r.t code size.
> > >
> > > Did anything change since:
> > >
> > > 281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
> > > 3a55fb0d9fe8 ("Tell the world we gave up on pushing CC_OPTIMIZE_FOR_SIZE")
> > >
> > > wrt -Os?
> > >
> > > Because if not, we still don't love -Os and you can drop the -Os argument.
> > 
> > The -Os argument was to show the effect of the patch when the compiler
> > is instructed to take care of the overall size. Giving the compiler
> > -O2 and then looking at the overall size of the produced binary is
> > just wrong.
> > 
> > > And without any perf data showing any improvement, this patch does nothing but
> > > enlarge -O2 size...
> > 
> > Even to my surprise, the patch has some noticeable effects on the
> > performance, please see the attachment in [1] for LMBench data or [2]
> > for some excerpts from the data. So, I think the patch has potential
> > to improve the performance.
> > 
> > [1] https://lore.kernel.org/lkml/CAFULd4YBcG45bigHBox2pu+To+Y5BzbRxG+pUr42AVOWSnfKsg@mail.gmail.com/
> > [2] https://lore.kernel.org/lkml/CAFULd4ZsSKwJ4Dz3cCAgaVsa4ypbb0e2savO-3_Ltbs=1wzgKQ@mail.gmail.com/
> 
> If you are measuring micro-costs, please make sure you pin the 
> workload to a single CPU (via 'taskset' for example) and run 'perf 
> stat --null --repeat 5' or so to measure the run-over-run noise of 
> the benchmark.

And if the benchmark is context-switching heavy, you'll want to use 
'perf stat -a' option to not have PMU context switching costs, and the 
-C option to only measure on the pinned CPU.


For example, to measure pipe handling overhead, the naive measurement is:

 starship:~> perf bench sched pipe
 # Running 'sched/pipe' benchmark:
 # Executed 1000000 pipe operations between two processes

     Total time: 6.939 [sec]

       6.939128 usecs/op
         144110 ops/sec
 starship:~> perf bench sched pipe
 # Running 'sched/pipe' benchmark:
 # Executed 1000000 pipe operations between two processes

     Total time: 6.879 [sec]

       6.879282 usecs/op
         145364 ops/sec

See how the run-to-run noise is 0.9%?

If we measure it naively with perf stat, we get:

 starship:~> perf stat perf bench sched pipe
 # Running 'sched/pipe' benchmark:
 # Executed 1000000 pipe operations between two processes

     Total time: 11.870 [sec]

      11.870403 usecs/op
          84243 ops/sec

 Performance counter stats for 'perf bench sched pipe':

         10,722.04 msec task-clock                       #    0.903 CPUs utilized             
         2,000,093      context-switches                 #  186.540 K/sec                     
               499      cpu-migrations                   #   46.540 /sec                      
             1,482      page-faults                      #  138.220 /sec                      
    27,853,380,218      cycles                           #    2.598 GHz                       
    18,434,409,889      stalled-cycles-frontend          #   66.18% frontend cycles idle      
    24,277,227,239      instructions                     #    0.87  insn per cycle            
                                                  #    0.76  stalled cycles per insn   
     5,001,727,980      branches                         #  466.490 M/sec                     
       572,756,283      branch-misses                    #   11.45% of all branches           

      11.875458968 seconds time elapsed

       0.271152000 seconds user
      11.272766000 seconds sys

See how the usecs/op increased by +70% due to PMU switching overhead?

With --null we can reduce the PMU switching overhead by only measuring 
elapsed time:

 starship:~> perf stat --null perf bench sched pipe
 # Running 'sched/pipe' benchmark:
 # Executed 1000000 pipe operations between two processes

     Total time: 6.916 [sec]

       6.916700 usecs/op
         144577 ops/sec

  Performance counter stats for 'perf bench sched pipe':

       6.921547909 seconds time elapsed

       0.341734000 seconds user
       6.215287000 seconds sys

But noise is still high:

 starship:~> perf stat --null --repeat 5 perf bench sched pipe
       6.854731 usecs/op
       7.082047 usecs/op
       7.087193 usecs/op
       6.934439 usecs/op
       7.056695 usecs/op
 ...
 Performance counter stats for 'perf bench sched pipe' (5 runs):

            7.0093 +- 0.0463 seconds time elapsed  ( +-  0.66% )

Likely due to the tasks migrating semi-randomly among cores.

We can pin them down to a single CPU (CPU2 in this case) via taskset:

 starship:~> taskset 4 perf stat --null --repeat 5 perf bench sched pipe
       5.575906 usecs/op
       5.637112 usecs/op
       5.532060 usecs/op
       5.703270 usecs/op
       5.506517 usecs/op
 
 Performance counter stats for 'perf bench sched pipe' (5 runs):

            5.5929 +- 0.0359 seconds time elapsed  ( +-  0.64% )

Note how performance increased by ~25%, due to lack of migration, but 
noise is still a bit high.

A good way to reduce noise is to measure instructions only:

 starship:~> taskset 0x4 perf stat -e instructions --repeat 5 perf bench sched pipe
       6.962279 usecs/op
       6.917374 usecs/op
       6.928672 usecs/op
       6.939555 usecs/op
       6.942980 usecs/op

 Performance counter stats for 'perf bench sched pipe' (5 runs):

    32,561,773,780      instructions                                                            ( +-  0.27% )

           6.93977 +- 0.00735 seconds time elapsed  ( +-  0.11% )

'Number of instructions executed' is an imperfect proxy for overhead. 
(Not every instruction has the same overhead - but for compiler code 
generation it's a useful proxy in most cases.)

But the best measurement is to avoid the PMU switching overhead via the 
'-a' option, and limiting the measurement to the saturated CPU#2:

 starship:~> taskset 0x4 perf stat -a -C 2 -e instructions --repeat 5 perf bench sched pipe
       5.808068 usecs/op
       5.843716 usecs/op
       5.826543 usecs/op
       5.801616 usecs/op
       5.793129 usecs/op

 Performance counter stats for 'system wide' (5 runs):

    32,244,691,275      instructions                                                            ( +-  0.21% )

           5.81624 +- 0.00912 seconds time elapsed  ( +-  0.16% )

Note how this measurement provides the highest performance for the 
workload, almost as good as --null.

 - Beware the difference in CPU mask parameters between taskset and perf 
   stat. (I tried to convince the perf tooling people to integrate 
   CPU-pinning into perf stat, but I digress.)

 - Beware of cpufreq considerations: changing CPU frequencies will skew 
   your workload's performance by a lot more than the 0.1% kind of 
   noise we are trying to gun for. Setting your CPU governor to 
   'performance' will eliminate some (but not all) cpufreq artifacts.

 - On modern systems there's also boot-to-boot variance of key data 
   structure alignment and cache access patterns, that can sometimes 
   rise beyond the noise of the measurement. These can send you on a 
   wild goose chase ...

Finally, you can use something like 'nice -n -10' to increase the 
priority of your benchmark and reduce the impact of other workloads 
running on your system.

Anyway, I think noise levels of around 0.1%-0.2% are about the best you 
can expect in context-switch heavy workloads. (A bit better in 
CPU-bound workloads with low context switching.)

Thanks,

	Ingo
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Dirk Gouders 9 months, 2 weeks ago
Hi Ingo,

my interest comes, because I just started to try to better understand
PCL and am reading the perf manual pages.  Perhaps I should therefore
keep my RO-bit permanent for some more months, but:

> And if the benchmark is context-switching heavy, you'll want to use 
> 'perf stat -a' option to not have PMU context switching costs, and the 

I'm sure you know what you are talking about so I don't doubt the above
is correct but perhaps, the manual page should also clarify -a:

-a::
--all-cpus::
        system-wide collection from all CPUs (default if no target is specified)

In the last example -a is combined with -C 2 which is even more irritating when
you just started with the manual pages.


But the main reason why I thought it might be OK to once toggle my
RO-bit is that I tried your examples and with the first one I have way
higher numbers than yours and I thought that must be, because you just
own the faster machine (as I would have expected):

>  starship:~> perf bench sched pipe
>  # Running 'sched/pipe' benchmark:
>  # Executed 1000000 pipe operations between two processes
>
>      Total time: 6.939 [sec]
>
>        6.939128 usecs/op
>          144110 ops/sec

lena:~> perf bench sched pipe
# Running 'sched/pipe' benchmark:
# Executed 1000000 pipe operations between two processes

     Total time: 11.129 [sec]

      11.129952 usecs/op
          89847 ops/sec

And I expected this to continue throughout the examples.

But -- to keep this short -- with the last example, my numbers are
suddenly significantly lower than yours:

>  starship:~> taskset 0x4 perf stat -a -C 2 -e instructions --repeat 5 perf bench sched pipe
>        5.808068 usecs/op
>        5.843716 usecs/op
>        5.826543 usecs/op
>        5.801616 usecs/op
>        5.793129 usecs/op
>
>  Performance counter stats for 'system wide' (5 runs):
>
>     32,244,691,275      instructions                                                            ( +-  0.21% )
>
>            5.81624 +- 0.00912 seconds time elapsed  ( +-  0.16% )

lena:~> taskset 0x4 perf stat -a -C 2 -e instructions --repeat 5 perf bench sched pipe
       4.204444 usecs/op
       4.169279 usecs/op
       4.186812 usecs/op
       4.217039 usecs/op
       4.208538 usecs/op

 Performance counter stats for 'system wide' (5 runs):

    14,196,762,588      instructions                                                            ( +-  0.04% )

           4.20203 +- 0.00854 seconds time elapsed  ( +-  0.20% )



Of course, I don't want to waste anyone's time if this is such an obvious
thing that only newbies don't understand.  So, feel free to just ignore this.

Regards

Dirk
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Ingo Molnar 9 months, 2 weeks ago
( I've Cc:-ed some perf gents as to the measurement artifacts observed 
  below. Full report quoted below. )

* Dirk Gouders <dirk@gouders.net> wrote:

> Hi Ingo,
> 
> my interest comes, because I just started to try to better understand
> PCL and am reading the perf manual pages.  Perhaps I should therefore
> keep my RO-bit permanent for some more months, but:
> 
> > And if the benchmark is context-switching heavy, you'll want to use 
> > 'perf stat -a' option to not have PMU context switching costs, and the 
> 
> I'm sure you know what you are talking about so I don't doubt the above
> is correct but perhaps, the manual page should also clarify -a:
> 
> -a::
> --all-cpus::
>         system-wide collection from all CPUs (default if no target is specified)
> 
> In the last example -a is combined with -C 2 which is even more irritating when
> you just started with the manual pages.
> 
> 
> But the main reason why I thought it might be OK to once toggle my
> RO-bit is that I tried your examples and with the first one I have way
> higher numbers than yours and I thought that must be, because you just
> own the faster machine (as I would have expected):
> 
> >  starship:~> perf bench sched pipe
> >  # Running 'sched/pipe' benchmark:
> >  # Executed 1000000 pipe operations between two processes
> >
> >      Total time: 6.939 [sec]
> >
> >        6.939128 usecs/op
> >          144110 ops/sec
> 
> lena:~> perf bench sched pipe
> # Running 'sched/pipe' benchmark:
> # Executed 1000000 pipe operations between two processes
> 
>      Total time: 11.129 [sec]
> 
>       11.129952 usecs/op
>           89847 ops/sec
> 
> And I expected this to continue throughout the examples.
> 
> But -- to keep this short -- with the last example, my numbers are
> suddenly significantly lower than yours:
> 
> >  starship:~> taskset 0x4 perf stat -a -C 2 -e instructions --repeat 5 perf bench sched pipe
> >        5.808068 usecs/op
> >        5.843716 usecs/op
> >        5.826543 usecs/op
> >        5.801616 usecs/op
> >        5.793129 usecs/op
> >
> >  Performance counter stats for 'system wide' (5 runs):
> >
> >     32,244,691,275      instructions                                                            ( +-  0.21% )
> >
> >            5.81624 +- 0.00912 seconds time elapsed  ( +-  0.16% )
> 
> lena:~> taskset 0x4 perf stat -a -C 2 -e instructions --repeat 5 perf bench sched pipe
>        4.204444 usecs/op
>        4.169279 usecs/op
>        4.186812 usecs/op
>        4.217039 usecs/op
>        4.208538 usecs/op
> 
>  Performance counter stats for 'system wide' (5 runs):
> 
>     14,196,762,588      instructions                                                            ( +-  0.04% )
> 
>            4.20203 +- 0.00854 seconds time elapsed  ( +-  0.20% )
> 
> 
> 
> Of course, I don't want to waste anyone's time if this is such an obvious
> thing that only newbies don't understand.  So, feel free to just ignore this.
> 
> Regards
> 
> Dirk
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Wed, Mar 5, 2025 at 8:55 PM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > On Sat, Mar 1, 2025 at 1:38 PM Borislav Petkov <bp@alien8.de> wrote:
> > >
> > > On Sat, Mar 01, 2025 at 10:05:56AM +0100, Uros Bizjak wrote:
> > > > OTOH, -Os, where different code size/performance heuristics are used, now
> > > > performs better w.r.t code size.
> > >
> > > Did anything change since:
> > >
> > > 281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
> > > 3a55fb0d9fe8 ("Tell the world we gave up on pushing CC_OPTIMIZE_FOR_SIZE")
> > >
> > > wrt -Os?
> > >
> > > Because if not, we still don't love -Os and you can drop the -Os argument.
> >
> > The -Os argument was to show the effect of the patch when the compiler
> > is instructed to take care of the overall size. Giving the compiler
> > -O2 and then looking at the overall size of the produced binary is
> > just wrong.
> >
> > > And without any perf data showing any improvement, this patch does nothing but
> > > enlarge -O2 size...
> >
> > Even to my surprise, the patch has some noticeable effects on the
> > performance, please see the attachment in [1] for LMBench data or [2]
> > for some excerpts from the data. So, I think the patch has potential
> > to improve the performance.
> >
> > [1] https://lore.kernel.org/lkml/CAFULd4YBcG45bigHBox2pu+To+Y5BzbRxG+pUr42AVOWSnfKsg@mail.gmail.com/
> > [2] https://lore.kernel.org/lkml/CAFULd4ZsSKwJ4Dz3cCAgaVsa4ypbb0e2savO-3_Ltbs=1wzgKQ@mail.gmail.com/
>
> If you are measuring micro-costs, please make sure you pin the workload
> to a single CPU (via 'taskset' for example) and run 'perf stat --null
> --repeat 5' or so to measure the run-over-run noise of the benchmark.

I simply ran the lmbench command; the benchmark was obtained as an
.rpm for Fedora 41 [1], assuming that the benchmark itself sets the
benchmarked system to the correct state and does enough repetitions to
obtain a meaningful result [2].

[1] https://fedora.pkgs.org/41/rpm-sphere-x86_64/lmbench-3.0-0.a9.3.x86_64.rpm.html
[2] https://www.usenix.org/legacy/publications/library/proceedings/sd96/full_papers/mcvoy.pdf

Thanks,
Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Ingo Molnar 9 months, 2 weeks ago
* Uros Bizjak <ubizjak@gmail.com> wrote:

> I simply run the lmbench command, where the benchmark was obtained as 
> .rpm for Fedora 41 [1], assuming that the benchmark itself sets the 
> benchmarked system to the correct state and does enough repetitions 
> to obtain a meaningful result [2].

These assumptions are not valid in a lot of cases - see my other email 
with an example.

Thanks,

	Ingo
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Wed, Mar 5, 2025 at 9:21 PM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > I simply run the lmbench command, where the benchmark was obtained as
> > .rpm for Fedora 41 [1], assuming that the benchmark itself sets the
> > benchmarked system to the correct state and does enough repetitions
> > to obtain a meaningful result [2].
>
> These assumptions are not valid in a lot of cases - see my other email
> with an example.

Thanks for the post, it was very thorough and informative, an
interesting read indeed.

OTOH, the proposed approach requires deep specialist knowledge,
which IMO is unreasonable to expect from a prospective patch
submitter. I totally agree with Dave's request for some performance
numbers [partial quote from his post earlier in the thread]:

"I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop."

that would indicate if the patch holds its promise. However, "Run it
on your laptop" already implies the ability to compile, install and
run a distribution kernel, not exactly a trivial task, compared to
e.g. running a patched kernel under QEMU. Some kind of easy-to-run
benchmark scripts, or documented easy-to-follow steps, would be of
immense help to the patch submitter to assess the performance aspects
of the patch. As said elsewhere, code size is a nice metric, but it is
not exactly appropriate for an -O2 compiled kernel.

As probably everybody reading LKML noticed, there is quite some CI
infrastructure available that tests the posted patches. That
infrastructure is set up and maintained by experts who *can* precisely
measure the performance impact of a patch. My proposal would be to ease
the requirements for the patches to something that Dave requested in
order to allow patches to be integrated and further tested by the CI
infrastructure.

Disclaimer: Please read the above as feedback from a wannabe
contributor, not as a critique of the established development process.

Thanks,
Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Linus Torvalds 9 months, 2 weeks ago
On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
>
> Even to my surprise, the patch has some noticeable effects on the
> performance, please see the attachment in [1] for LMBench data or [2]
> for some excerpts from the data. So, I think the patch has potential
> to improve the performance.

I suspect some of the performance difference - which looks
unexpectedly large - is due to having run them on a CPU with the
horrendous indirect return costs, and then inlining can make a huge
difference.
Regardless, I absolutely think that using asm_inline here is the right
thing for the locked instructions.

That said, I do want to bring up another issue: maybe it's time to
just retire the LOCK_PREFIX thing entirely?

It harkens back to Ye Olde Days when UP was the norm, and we didn't
want to pay the cost of lock prefixes when the kernel was built for
SMP but was run on an UP machine.

And honestly, none of that makes sense any more. You can't buy a UP
machine any more, and the only UP case would be some silly minimal
virtual environment, and if people really care about that minimal
case, they should just compile the kernel without SMP support.
Because UP has gone from being the default to being irrelevant. At
least for x86-64.

So I think we should just get rid of LOCK_PREFIX_HERE and the
smp_locks section entirely.
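
For illustration, a hedged sketch of what that retirement might look
like (keeping the !CONFIG_SMP elision; this is a sketch, not a proposed
patch, and assumes nothing else still needs the smp_locks records):

/*
 * With LOCK_PREFIX_HERE and the .smp_locks section gone, the prefix
 * degenerates to a plain "lock; " (still elided on !CONFIG_SMP builds),
 * and the asm template really is one short instruction even under the
 * old size heuristic.
 */
#ifdef CONFIG_SMP
# define LOCK_PREFIX "lock; "
#else
# define LOCK_PREFIX ""
#endif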

Which would probably obviate the need for your patch, since then the
compiler wouldn't see it as some big instruction. But your patch isn't
wrong, so this is absolutely not a NAK, more of a "we should go
further".

Hmm?

                  Linus
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by H. Peter Anvin 9 months, 1 week ago
On 3/5/25 09:04, Linus Torvalds wrote:
> 
> That said, I do want to bring up another issue: maybe it's time to
> just retire the LOCK_PREFIX thing entirely?
> 
> It harkens back to Ye Olde Days when UP was the norm, and we didn't
> want to pay the cost of lock prefixes when the kernel was built for
> SMP but was run on an UP machine.
> 
> And honestly, none of that makes sense any more. You can't buy a UP
> machine any more, and the only UP case would be some silly minimal
> virtual environment, and if people really care about that minimal
> case, they should just compile the kernel without SMP support.
> Because UP has gone from being the default to being irrelevant. At
> least for x86-64.
> 

I think the key there is that "they should just compile the kernel 
without SMP support" (which may very well make sense for some oddball 
embedded uses) *does* still mean that the remaining cases of locks 
can/should be elided, so demacroizing it is still not an option.

But yes... would get rid of a lot of crap machinery which is basically 
dead code, and that would be a Very Good Thing[TM].

	-hpa
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by David Laight 9 months, 2 weeks ago
On Wed, 5 Mar 2025 07:04:08 -1000
Linus Torvalds <torvalds@linuxfoundation.org> wrote:

> On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > Even to my surprise, the patch has some noticeable effects on the
> > performance, please see the attachment in [1] for LMBench data or [2]
> > for some excerpts from the data. So, I think the patch has potential
> > to improve the performance.  
> 
> I suspect some of the performance difference - which looks
> unexpectedly large - is due to having run them on a CPU with the
> horrendous indirect return costs, and then inlining can make a huge
> difference.
...

Another possibility is that the processes are getting bounced around
cpu in a slightly different way.
An idle cpu might be running at 800MHz, run something that spins on it
and the clock speed will soon jump to 4GHz.
But if your 'spinning' process is migrated to a different cpu it starts
again at 800MHz.

(I had something where an FPGA compile went from 12 mins to over 20 because
the kernel RSB stuffing caused the scheduler to behave differently even
though nothing was doing a lot of system calls.)

All sorts of things can affect that - possibly even making some code faster!

The (IIRC) 30k increase in code size will be a few functions being inlined.
The bloat-o-meter might show which, and forcing a few inlines the same way
should reduce that difference.
OTOH I'm surprised that a single (or two) instruction makes that much
difference - unless gcc is managing to discard the size of the entire
function rather than just the asm block itself.

Benchmarking on modern cpu is hard.
You really do need to lock the cpu frequencies - and that may not be supported.

	David
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Wed, Mar 5, 2025 at 9:14 PM David Laight
<david.laight.linux@gmail.com> wrote:
>
> On Wed, 5 Mar 2025 07:04:08 -1000
> Linus Torvalds <torvalds@linuxfoundation.org> wrote:
>
> > On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > Even to my surprise, the patch has some noticeable effects on the
> > > performance, please see the attachment in [1] for LMBench data or [2]
> > > for some excerpts from the data. So, I think the patch has potential
> > > to improve the performance.
> >
> > I suspect some of the performance difference - which looks
> > unexpectedly large - is due to having run them on a CPU with the
> > horrendous indirect return costs, and then inlining can make a huge
> > difference.
> ...
>
> Another possibility is that the processes are getting bounced around
> cpu in a slightly different way.
> An idle cpu might be running at 800MHz, run something that spins on it
> and the clock speed will soon jump to 4GHz.
> But if your 'spinning' process is migrated to a different cpu it starts
> again at 800MHz.
>
> (I had something where a fpga compile when from 12 mins to over 20 because
> the kernel RSB stuffing caused the scheduler to behave differently even
> though nothing was doing a lot of system calls.)
>
> All sorts of things can affect that - possibly even making some code faster!
>
> The (IIRC) 30k increase in code size will be a few functions being inlined.
> The bloat-o-meter might show which, and forcing a few inlines the same way
> should reduce that difference.

bloat-o-meter is an excellent idea, I'll analyse binaries some more
and report my findings.

> OTOH I'm surprised that a single (or two) instruction makes that much
> difference - unless gcc is managing to discard the size of the entire
> function rather than just the asm block itself.

Actually, the compiler uses the estimated function code size as one of
the criteria for deciding whether to fully (or partially, via hot/cold
splitting) inline a function. The estimated code size of functions that
use the (patched) locking primitives is now lower, so they fall below
the inlining threshold, causing the compiler to do more inlining. The
compiler knows the performance/size tradeoff of setting up a function
call and of creating the function frame in the callee, and decides
accordingly. Please note that inlining is multi-level, so it doesn't
stop at the first function.
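
As a hedged illustration (hypothetical helper, not real kernel code) of
what changes at a call site once the estimate is fixed:

/*
 * With plain asm, GCC estimated the LOCK_PREFIX template at ~6 insns,
 * so this wrapper looked "big" and tended to stay out of line.  With
 * asm_inline it is estimated as one insn, falls below the inlining
 * threshold, and the caller gets the single locked instruction instead
 * of a call/ret pair (plus return thunk, where mitigations are on).
 */
struct counter { int refs; };

static inline void counter_get(struct counter *c)
{
	asm_inline volatile(LOCK_PREFIX "incl %0"
			    : "+m" (c->refs) : : "memory");
}

void take_ref(struct counter *c)
{
	counter_get(c);		/* now inlined: a single "lock incl" */
}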

Thanks,
Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Thu, Mar 6, 2025 at 11:45 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Mar 5, 2025 at 9:14 PM David Laight
> <david.laight.linux@gmail.com> wrote:
> >
> > On Wed, 5 Mar 2025 07:04:08 -1000
> > Linus Torvalds <torvalds@linuxfoundation.org> wrote:
> >
> > > On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > Even to my surprise, the patch has some noticeable effects on the
> > > > performance, please see the attachment in [1] for LMBench data or [2]
> > > > for some excerpts from the data. So, I think the patch has potential
> > > > to improve the performance.
> > >
> > > I suspect some of the performance difference - which looks
> > > unexpectedly large - is due to having run them on a CPU with the
> > > horrendous indirect return costs, and then inlining can make a huge
> > > difference.
> > ...
> >
> > Another possibility is that the processes are getting bounced around
> > cpu in a slightly different way.
> > An idle cpu might be running at 800MHz, run something that spins on it
> > and the clock speed will soon jump to 4GHz.
> > But if your 'spinning' process is migrated to a different cpu it starts
> > again at 800MHz.
> >
> > (I had something where a fpga compile when from 12 mins to over 20 because
> > the kernel RSB stuffing caused the scheduler to behave differently even
> > though nothing was doing a lot of system calls.)
> >
> > All sorts of things can affect that - possibly even making some code faster!
> >
> > The (IIRC) 30k increase in code size will be a few functions being inlined.
> > The bloat-o-meter might show which, and forcing a few inlines the same way
> > should reduce that difference.
>
> bloat-o-meter is an excellent idea, I'll analyse binaries some more
> and report my findings.

Please find attached bloat.txt where:

a) some functions now include once-called functions. These are:

copy_process                                6465   10191   +3726
balance_dirty_pages_ratelimited_flags        237    2949   +2712
icl_plane_update_noarm                      5800    7969   +2169
samsung_input_mapping                       3375    5170   +1795
ext4_do_update_inode.isra                      -    1526   +1526

that now include:

ext4_mark_iloc_dirty                        1735     106   -1629
samsung_gamepad_input_mapping.isra          2046       -   -2046
icl_program_input_csc                       2203       -   -2203
copy_mm                                     2242       -   -2242
balance_dirty_pages                         2657       -   -2657

b) ISRA [interprocedural scalar replacement of aggregates,
interprocedural pass that removes unused function return values
(turning functions returning a value which is never used into void
functions) and removes unused function parameters.  It can also
replace an aggregate parameter by a set of other parameters
representing part of the original, turning those passed by reference
into new ones which pass the value directly.]

ext4_do_update_inode.isra                      -    1526   +1526
nfs4_begin_drain_session.isra                  -     249    +249
nfs4_end_drain_session.isra                    -     168    +168
__guc_action_register_multi_lrc_v70.isra     335     500    +165
__i915_gem_free_objects.isra                   -     144    +144
...
membarrier_register_private_expedited.isra     108       -    -108
syncobj_eventfd_entry_func.isra              445     314    -131
__ext4_sb_bread_gfp.isra                     140       -    -140
class_preempt_notrace_destructor.isra        145       -    -145
p9_fid_put.isra                              151       -    -151
__mm_cid_try_get.isra                        238       -    -238
membarrier_global_expedited.isra             294       -    -294
mm_cid_get.isra                              295       -    -295
samsung_gamepad_input_mapping.isra.cold      604       -    -604
samsung_gamepad_input_mapping.isra          2046       -   -2046

c) different split points of hot/cold split that just move code around:

samsung_input_mapping.cold                   900    1500    +600
__i915_request_reset.cold                    311     389     +78
nfs_update_inode.cold                         77     153     +76
__do_sys_swapon.cold                         404     455     +51
copy_process.cold                              -      45     +45
tg3_get_invariants.cold                       73     115     +42
...
hibernate.cold                               671     643     -28
copy_mm.cold                                  31       -     -31
software_resume.cold                         249     207     -42
io_poll_wake.cold                            106      54     -52
samsung_gamepad_input_mapping.isra.cold      604       -    -604

d) full inlining of small functions with locking insns (~150 cases).
These bring in most of the performance increase because there is no
call setup. E.g.:

0000000000a50e10 <release_devnum>:
  a50e10:    48 63 07                 movslq (%rdi),%rax
  a50e13:    85 c0                    test   %eax,%eax
  a50e15:    7e 10                    jle    a50e27 <release_devnum+0x17>
  a50e17:    48 8b 4f 50              mov    0x50(%rdi),%rcx
  a50e1b:    f0 48 0f b3 41 50        lock btr %rax,0x50(%rcx)
  a50e21:    c7 07 ff ff ff ff        movl   $0xffffffff,(%rdi)
  a50e27:    e9 00 00 00 00           jmp    a50e2c <release_devnum+0x1c>
            a50e28: R_X86_64_PLT32    __x86_return_thunk-0x4
  a50e2c:    0f 1f 40 00              nopl   0x0(%rax)

IMO, for 0.14% code increase, these changes are desirable.

Thanks,
Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Ingo Molnar 9 months, 2 weeks ago
* Uros Bizjak <ubizjak@gmail.com> wrote:

> On Thu, Mar 6, 2025 at 11:45 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Wed, Mar 5, 2025 at 9:14 PM David Laight
> > <david.laight.linux@gmail.com> wrote:
> > >
> > > On Wed, 5 Mar 2025 07:04:08 -1000
> > > Linus Torvalds <torvalds@linuxfoundation.org> wrote:
> > >
> > > > On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > Even to my surprise, the patch has some noticeable effects on the
> > > > > performance, please see the attachment in [1] for LMBench data or [2]
> > > > > for some excerpts from the data. So, I think the patch has potential
> > > > > to improve the performance.
> > > >
> > > > I suspect some of the performance difference - which looks
> > > > unexpectedly large - is due to having run them on a CPU with the
> > > > horrendous indirect return costs, and then inlining can make a huge
> > > > difference.
> > > ...
> > >
> > > Another possibility is that the processes are getting bounced around
> > > cpu in a slightly different way.
> > > An idle cpu might be running at 800MHz, run something that spins on it
> > > and the clock speed will soon jump to 4GHz.
> > > But if your 'spinning' process is migrated to a different cpu it starts
> > > again at 800MHz.
> > >
> > > (I had something where a fpga compile when from 12 mins to over 20 because
> > > the kernel RSB stuffing caused the scheduler to behave differently even
> > > though nothing was doing a lot of system calls.)
> > >
> > > All sorts of things can affect that - possibly even making some code faster!
> > >
> > > The (IIRC) 30k increase in code size will be a few functions being inlined.
> > > The bloat-o-meter might show which, and forcing a few inlines the same way
> > > should reduce that difference.
> >
> > bloat-o-meter is an excellent idea, I'll analyse binaries some more
> > and report my findings.
> 
> Please find attached bloat.txt where:
> 
> a) some functions now include once-called functions. These are:
> 
> copy_process                                6465   10191   +3726
> balance_dirty_pages_ratelimited_flags        237    2949   +2712
> icl_plane_update_noarm                      5800    7969   +2169
> samsung_input_mapping                       3375    5170   +1795
> ext4_do_update_inode.isra                      -    1526   +1526
> 
> that now include:
> 
> ext4_mark_iloc_dirty                        1735     106   -1629
> samsung_gamepad_input_mapping.isra          2046       -   -2046
> icl_program_input_csc                       2203       -   -2203
> copy_mm                                     2242       -   -2242
> balance_dirty_pages                         2657       -   -2657
> 
> b) ISRA [interprocedural scalar replacement of aggregates,
> interprocedural pass that removes unused function return values
> (turning functions returning a value which is never used into void
> functions) and removes unused function parameters.  It can also
> replace an aggregate parameter by a set of other parameters
> representing part of the original, turning those passed by reference
> into new ones which pass the value directly.]
> 
> ext4_do_update_inode.isra                      -    1526   +1526
> nfs4_begin_drain_session.isra                  -     249    +249
> nfs4_end_drain_session.isra                    -     168    +168
> __guc_action_register_multi_lrc_v70.isra     335     500    +165
> __i915_gem_free_objects.isra                   -     144    +144
> ...
> membarrier_register_private_expedited.isra     108       -    -108
> syncobj_eventfd_entry_func.isra              445     314    -131
> __ext4_sb_bread_gfp.isra                     140       -    -140
> class_preempt_notrace_destructor.isra        145       -    -145
> p9_fid_put.isra                              151       -    -151
> __mm_cid_try_get.isra                        238       -    -238
> membarrier_global_expedited.isra             294       -    -294
> mm_cid_get.isra                              295       -    -295
> samsung_gamepad_input_mapping.isra.cold      604       -    -604
> samsung_gamepad_input_mapping.isra          2046       -   -2046
> 
> c) different split points of hot/cold split that just move code around:
> 
> samsung_input_mapping.cold                   900    1500    +600
> __i915_request_reset.cold                    311     389     +78
> nfs_update_inode.cold                         77     153     +76
> __do_sys_swapon.cold                         404     455     +51
> copy_process.cold                              -      45     +45
> tg3_get_invariants.cold                       73     115     +42
> ...
> hibernate.cold                               671     643     -28
> copy_mm.cold                                  31       -     -31
> software_resume.cold                         249     207     -42
> io_poll_wake.cold                            106      54     -52
> samsung_gamepad_input_mapping.isra.cold      604       -    -604
> 
> d) full inlining of small functions with locking insns (~150 cases).
> These bring in most of the performance increase because there is no
> call setup. E.g.:
> 
> 0000000000a50e10 <release_devnum>:
>   a50e10:    48 63 07                 movslq (%rdi),%rax
>   a50e13:    85 c0                    test   %eax,%eax
>   a50e15:    7e 10                    jle    a50e27 <release_devnum+0x17>
>   a50e17:    48 8b 4f 50              mov    0x50(%rdi),%rcx
>   a50e1b:    f0 48 0f b3 41 50        lock btr %rax,0x50(%rcx)
>   a50e21:    c7 07 ff ff ff ff        movl   $0xffffffff,(%rdi)
>   a50e27:    e9 00 00 00 00           jmp    a50e2c <release_devnum+0x1c>
>             a50e28: R_X86_64_PLT32    __x86_return_thunk-0x4
>   a50e2c:    0f 1f 40 00              nopl   0x0(%rax)
> 
> IMO, for 0.14% code increase, these changes are desirable.

I concur, and it's extra desirable IMHO due to the per function 
overhead of CPU bug mitigations like retpolines.

The number of function calls executed in a workload can be measured via 
perf on most modern x86 CPUs as well. For example on Zen5 CPUs the 
number of RET instructions can be counted:

  {
    EventName: ex_ret_near_ret,
    EventCode: 0xc8,
    BriefDescription: Retired near returns (RET or RET Iw).
  },

Which ought to be a good proxy for function calls (modulo 
tail-optimized jumps).

Thanks,

	Ingo
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 1 week ago
On Thu, Mar 6, 2025 at 11:19 PM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > On Thu, Mar 6, 2025 at 11:45 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Wed, Mar 5, 2025 at 9:14 PM David Laight
> > > <david.laight.linux@gmail.com> wrote:
> > > >
> > > > On Wed, 5 Mar 2025 07:04:08 -1000
> > > > Linus Torvalds <torvalds@linuxfoundation.org> wrote:
> > > >
> > > > > On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > Even to my surprise, the patch has some noticeable effects on the
> > > > > > performance, please see the attachment in [1] for LMBench data or [2]
> > > > > > for some excerpts from the data. So, I think the patch has potential
> > > > > > to improve the performance.
> > > > >
> > > > > I suspect some of the performance difference - which looks
> > > > > unexpectedly large - is due to having run them on a CPU with the
> > > > > horrendous indirect return costs, and then inlining can make a huge
> > > > > difference.
> > > > ...
> > > >
> > > > Another possibility is that the processes are getting bounced around
> > > > cpu in a slightly different way.
> > > > An idle cpu might be running at 800MHz, run something that spins on it
> > > > and the clock speed will soon jump to 4GHz.
> > > > But if your 'spinning' process is migrated to a different cpu it starts
> > > > again at 800MHz.
> > > >
> > > > (I had something where a fpga compile when from 12 mins to over 20 because
> > > > the kernel RSB stuffing caused the scheduler to behave differently even
> > > > though nothing was doing a lot of system calls.)
> > > >
> > > > All sorts of things can affect that - possibly even making some code faster!
> > > >
> > > > The (IIRC) 30k increase in code size will be a few functions being inlined.
> > > > The bloat-o-meter might show which, and forcing a few inlines the same way
> > > > should reduce that difference.
> > >
> > > bloat-o-meter is an excellent idea, I'll analyse binaries some more
> > > and report my findings.
> >
> > Please find attached bloat.txt where:
> >
> > a) some functions now include once-called functions. These are:
> >
> > copy_process                                6465   10191   +3726
> > balance_dirty_pages_ratelimited_flags        237    2949   +2712
> > icl_plane_update_noarm                      5800    7969   +2169
> > samsung_input_mapping                       3375    5170   +1795
> > ext4_do_update_inode.isra                      -    1526   +1526
> >
> > that now include:
> >
> > ext4_mark_iloc_dirty                        1735     106   -1629
> > samsung_gamepad_input_mapping.isra          2046       -   -2046
> > icl_program_input_csc                       2203       -   -2203
> > copy_mm                                     2242       -   -2242
> > balance_dirty_pages                         2657       -   -2657
> >
> > b) ISRA [interprocedural scalar replacement of aggregates,
> > interprocedural pass that removes unused function return values
> > (turning functions returning a value which is never used into void
> > functions) and removes unused function parameters.  It can also
> > replace an aggregate parameter by a set of other parameters
> > representing part of the original, turning those passed by reference
> > into new ones which pass the value directly.]
> >
> > ext4_do_update_inode.isra                      -    1526   +1526
> > nfs4_begin_drain_session.isra                  -     249    +249
> > nfs4_end_drain_session.isra                    -     168    +168
> > __guc_action_register_multi_lrc_v70.isra     335     500    +165
> > __i915_gem_free_objects.isra                   -     144    +144
> > ...
> > membarrier_register_private_expedited.isra     108       -    -108
> > syncobj_eventfd_entry_func.isra              445     314    -131
> > __ext4_sb_bread_gfp.isra                     140       -    -140
> > class_preempt_notrace_destructor.isra        145       -    -145
> > p9_fid_put.isra                              151       -    -151
> > __mm_cid_try_get.isra                        238       -    -238
> > membarrier_global_expedited.isra             294       -    -294
> > mm_cid_get.isra                              295       -    -295
> > samsung_gamepad_input_mapping.isra.cold      604       -    -604
> > samsung_gamepad_input_mapping.isra          2046       -   -2046
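> >
> > To make those .isra entries concrete, here is a purely hypothetical
> > illustration (not kernel code) of the transformation:
> >
> >   struct buf { char *data; int len; };
> >
> >   /* Original: takes the whole aggregate by reference. */
> >   static int used_len(const struct buf *b)
> >   {
> >           return b->len;
> >   }
> >
> >   /* ISRA may emit a clone, shown in symbol listings as used_len.isra,
> >    * that takes only the scalar it actually needs (b->len) as a plain
> >    * int argument, and rewrites all callers to pass the value directly
> >    * instead of passing the pointer. */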
> >
> > c) different hot/cold split points that just move code around:
> >
> > samsung_input_mapping.cold                   900    1500    +600
> > __i915_request_reset.cold                    311     389     +78
> > nfs_update_inode.cold                         77     153     +76
> > __do_sys_swapon.cold                         404     455     +51
> > copy_process.cold                              -      45     +45
> > tg3_get_invariants.cold                       73     115     +42
> > ...
> > hibernate.cold                               671     643     -28
> > copy_mm.cold                                  31       -     -31
> > software_resume.cold                         249     207     -42
> > io_poll_wake.cold                            106      54     -52
> > samsung_gamepad_input_mapping.isra.cold      604       -    -604
> >
> > d) full inlining of small functions with locking insns (~150 cases).
> > These bring most of the performance increase, because there is no
> > call setup. E.g.:
> >
> > 0000000000a50e10 <release_devnum>:
> >   a50e10:    48 63 07                 movslq (%rdi),%rax
> >   a50e13:    85 c0                    test   %eax,%eax
> >   a50e15:    7e 10                    jle    a50e27 <release_devnum+0x17>
> >   a50e17:    48 8b 4f 50              mov    0x50(%rdi),%rcx
> >   a50e1b:    f0 48 0f b3 41 50        lock btr %rax,0x50(%rcx)
> >   a50e21:    c7 07 ff ff ff ff        movl   $0xffffffff,(%rdi)
> >   a50e27:    e9 00 00 00 00           jmp    a50e2c <release_devnum+0x1c>
> >             a50e28: R_X86_64_PLT32    __x86_return_thunk-0x4
> >   a50e2c:    0f 1f 40 00              nopl   0x0(%rax)
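> >
> > For reference, the C source behind that disassembly is roughly the
> > following (quoting drivers/usb/core/hub.c from memory, so the exact
> > field names may differ):
> >
> >   static void release_devnum(struct usb_device *udev)
> >   {
> >           if (udev->devnum > 0) {
> >                   clear_bit(udev->devnum, udev->bus->devmap.devicemap);
> >                   udev->devnum = -1;
> >           }
> >   }
> >
> > i.e. a tiny helper whose only real work is the locked bit-clear; once
> > the compiler stops treating the lock btr as a huge asm, functions of
> > this shape can be inlined into their callers with no call setup at all.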
> >
> > IMO, for a 0.14% code size increase, these changes are desirable.
>
> I concur, and it's extra desirable IMHO due to the per-function
> overhead of CPU bug mitigations like retpolines.

Thanks!

I will submit a v2 patch next week that will summarize all the
measurements in its commit message.

BR,
Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Wed, Mar 5, 2025 at 6:04 PM Linus Torvalds
<torvalds@linuxfoundation.org> wrote:
>
> On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > Even to my surprise, the patch has some noticeable effects on the
> > performance; please see the attachment in [1] for LMBench data or [2]
> > for some excerpts from the data. So I think the patch has the potential
> > to improve performance.
>
> I suspect some of the performance difference - which looks
> unexpectedly large - is due to having run them on a CPU with the
> horrendous indirect return costs, and then inlining can make a huge
> difference.
>
> Regardless, I absolutely think that using asm_inline here is the right
> thing for the locked instructions.

It is "Intel(R) Core(TM) i7-10710U"

> That said, I do want to bring up another issue: maybe it's time to
> just retire the LOCK_PREFIX thing entirely?
>
> It harkens back to Ye Olde Days when UP was the norm, and we didn't
> want to pay the cost of lock prefixes when the kernel was built for
> SMP but was run on a UP machine.
>
> And honestly, none of that makes sense any more. You can't buy a UP
> machine any more, and the only UP case would be some silly minimal
> virtual environment, and if people really care about that minimal
> case, they should just compile the kernel without SMP support.
> Because UP has gone from being the default to being irrelevant. At
> least for x86-64.
>
> So I think we should just get rid of LOCK_PREFIX_HERE and the
> smp_locks section entirely.

Please note that this functionality is shared with the i386 target, so
the removal proposed above would somewhat penalize 32-bit targets. The
UP vs SMP situation is not that clear-cut there; some distros may still
provide i386 SMP kernels that would then run unoptimized on UP systems.

From the compiler's POV, now that the "lock; " prefix has lost its
semicolon, removing LOCK_PREFIX_HERE or using asm_inline results in
exactly the same generated code. The problematic 31k code size increase
(1.1%) with -O2 is inevitable either way if we want to move forward.
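
For context, LOCK_PREFIX currently expands to roughly the following
(paraphrasing arch/x86/include/asm/alternative.h from memory, so the
exact directives may differ):

  #define LOCK_PREFIX_HERE \
          ".pushsection .smp_locks,\"a\"\n" \
          ".balign 4\n" \
          ".long 671f - .\n" \
          ".popsection\n" \
          "671:"

  #define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock "

It is these pseudo directives, not the single lock-prefixed instruction,
that the compiler counts when it estimates the size of the asm template.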

My proposal would be to use what a modern compiler offers. By using
asm_inline, we can keep the status quo (mostly for i386's sake) for some
more time and still move forward. And we get that -Os code size
*decrease* as a bonus for those who want to shave the last byte off the
kernel.

Thanks,
Uros.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by David Laight 9 months, 2 weeks ago
On Wed, 5 Mar 2025 20:47:46 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:

...
> From the compiler POV, now that "lock; " prefix lost its semicolon,
> removing LOCK_PREFIX_HERE or using asm_inline would result in exactly
> the same code. The problematic 31k code size increase (1.1%) with -O2
> is inevitable either way, if we want to move forward.

Have you looked to see where that 31k comes from?
The compiler must be very near to inlining something and removing a couple
of instructions is making a big difference.
It is quite likely that some other change will have the same (or the reverse)
effect.

	David
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Peter Zijlstra 9 months, 2 weeks ago
On Wed, Mar 05, 2025 at 07:04:08AM -1000, Linus Torvalds wrote:
> And honestly, none of that makes sense any more. You can't buy a UP
> machine any more, and the only UP case would be some silly minimal
> virtual environment, and if people really care about that minimal
> case, they should just compile the kernel without SMP support.
> Because UP has gone from being the default to being irrelevant. At
> least for x86-64.
> 
> So I think we should just get rid of LOCK_PREFIX_HERE and the
> smp_locks section entirely.
> 
> Which would probably obviate the need for your patch, since then the
> compiler wouldn't see it as some big instruction. But your patch isn't
> wrong, so this is absolutely not a NAK, more of a "we should go
> further".
> 
> Hmm?

I'm all for removing that.

On that note, a number of architectures have already taken the next step,
which is to mandate SMP=y.

The down-side of that is that it would make a definite dent in the
compile coverage for SMP=n code -- but perhaps we have enough build
robots to get away with that.
Re: [PATCH -tip] x86/locking/atomic: Use asm_inline for atomic locking insns
Posted by Uros Bizjak 9 months, 2 weeks ago
On Fri, Feb 28, 2025 at 1:38 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> According to:
>
>   https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html
>
> the usage of asm pseudo directives in the asm template can confuse
> the compiler to wrongly estimate the size of the generated
> code.
>
> The LOCK_PREFIX macro expands to several asm pseudo directives, so
> its usage in atomic locking insns causes instruction length estimate
> to fail significantly (the specially instrumented compiler reports
> the estimated length of these asm templates to be 6 instructions long).
>
> This incorrect estimate further causes unoptimal inlining decisions,
> unoptimal instruction scheduling and unoptimal code block alignments
> for functions that use these locking primitives.
>
> Use asm_inline instead:
>
>   https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html
>
> which is a feature that makes GCC pretend some inline assembler code
> is tiny (while it would think it is huge), instead of just asm.
>
> For code size estimation, the size of the asm is then taken as
> the minimum size of one instruction, ignoring how many instructions
> compiler thinks it is.
>
> The code size of the resulting x86_64 defconfig object file increases
> for 33.264 kbytes, representing 1.2% code size increase:
>
>    text    data     bss     dec     hex filename
> 27450107        4633332  814148 32897587        1f5fa33 vmlinux-old.o
> 27483371        4633784  814148 32931303        1f67de7 vmlinux-new.o
>
> mainly due to different inlining decisions of -O2 build.

FTR, with -Os (where generated code size really matters) the x86_64
defconfig object file *decreases* by 24.388 kbytes, representing a 1.0%
code size *decrease*:

  text    data     bss     dec     hex filename
23883860        4617284  814212 29315356        1bf511c vmlinux-old.o
23859472        4615404  814212 29289088        1beea80 vmlinux-new.o

again mainly due to different inlining decisions in the -Os build.

Uros.