According to:
https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html
the use of asm pseudo directives in an asm template can confuse the
compiler into wrongly estimating the size of the generated code.
The LOCK_PREFIX macro expands to several asm pseudo directives, so its
use in atomic locking instructions makes the instruction length estimate
significantly wrong (a specially instrumented compiler reports the
estimated length of these asm templates as 6 instructions).
This incorrect estimate in turn leads to suboptimal inlining decisions,
suboptimal instruction scheduling and suboptimal code block alignment
for functions that use these locking primitives.
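For reference, on SMP builds LOCK_PREFIX is more than the lock byte; it is
defined in arch/x86/include/asm/alternative.h roughly as follows (abridged
from memory, so the exact spelling may differ between kernel versions):

  #define LOCK_PREFIX_HERE \
  	".pushsection .smp_locks,\"a\"\n"	\
  	".balign 4\n"				\
  	".long 671f - .\n" /* offset */		\
  	".popsection\n"				\
  	"671:"

  #define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "

Each of those directive lines counts toward GCC's line-based size estimate,
even though only the single locked instruction ends up in .text; the
.smp_locks entry goes to a separate section.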
Use asm_inline instead:
https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html
which makes GCC treat the inline assembly as tiny even when it would
otherwise estimate it to be large, instead of plain asm.
For code size estimation, the asm is then counted as a single
instruction of minimum size, regardless of how many instructions the
compiler thinks the template contains.
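Within the kernel, asm_inline degrades gracefully to plain asm on compilers
that lack the extension; its definition in include/linux/compiler_types.h is
approximately the following (quoted from memory, the exact form may vary by
kernel version):

  #ifdef CONFIG_CC_HAS_ASM_INLINE
  #define asm_inline asm __inline
  #else
  #define asm_inline asm
  #endif

so the change is a no-op for toolchains without asm inline support.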
The code size of the resulting x86_64 defconfig object file increases
by 33.264 kbytes, representing a 1.2% code size increase:
   text    data     bss     dec     hex filename
27450107 4633332  814148 32897587 1f5fa33 vmlinux-old.o
27483371 4633784  814148 32931303 1f67de7 vmlinux-new.o
mainly due to different inlining decisions of the -O2 build.
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
arch/x86/include/asm/atomic.h | 14 +++++++-------
arch/x86/include/asm/atomic64_64.h | 14 +++++++-------
arch/x86/include/asm/bitops.h | 14 +++++++-------
arch/x86/include/asm/cmpxchg.h | 24 ++++++++++++------------
arch/x86/include/asm/cmpxchg_32.h | 4 ++--
arch/x86/include/asm/cmpxchg_64.h | 4 ++--
arch/x86/include/asm/rmwcc.h | 2 +-
7 files changed, 38 insertions(+), 38 deletions(-)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 55b4d24356ea..75743f1dfd4e 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -30,14 +30,14 @@ static __always_inline void arch_atomic_set(atomic_t *v, int i)
static __always_inline void arch_atomic_add(int i, atomic_t *v)
{
- asm volatile(LOCK_PREFIX "addl %1,%0"
+ asm_inline volatile(LOCK_PREFIX "addl %1, %0"
: "+m" (v->counter)
: "ir" (i) : "memory");
}
static __always_inline void arch_atomic_sub(int i, atomic_t *v)
{
- asm volatile(LOCK_PREFIX "subl %1,%0"
+ asm_inline volatile(LOCK_PREFIX "subl %1, %0"
: "+m" (v->counter)
: "ir" (i) : "memory");
}
@@ -50,14 +50,14 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
static __always_inline void arch_atomic_inc(atomic_t *v)
{
- asm volatile(LOCK_PREFIX "incl %0"
+ asm_inline volatile(LOCK_PREFIX "incl %0"
: "+m" (v->counter) :: "memory");
}
#define arch_atomic_inc arch_atomic_inc
static __always_inline void arch_atomic_dec(atomic_t *v)
{
- asm volatile(LOCK_PREFIX "decl %0"
+ asm_inline volatile(LOCK_PREFIX "decl %0"
: "+m" (v->counter) :: "memory");
}
#define arch_atomic_dec arch_atomic_dec
@@ -116,7 +116,7 @@ static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
static __always_inline void arch_atomic_and(int i, atomic_t *v)
{
- asm volatile(LOCK_PREFIX "andl %1,%0"
+ asm_inline volatile(LOCK_PREFIX "andl %1, %0"
: "+m" (v->counter)
: "ir" (i)
: "memory");
@@ -134,7 +134,7 @@ static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v)
static __always_inline void arch_atomic_or(int i, atomic_t *v)
{
- asm volatile(LOCK_PREFIX "orl %1,%0"
+ asm_inline volatile(LOCK_PREFIX "orl %1, %0"
: "+m" (v->counter)
: "ir" (i)
: "memory");
@@ -152,7 +152,7 @@ static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v)
static __always_inline void arch_atomic_xor(int i, atomic_t *v)
{
- asm volatile(LOCK_PREFIX "xorl %1,%0"
+ asm_inline volatile(LOCK_PREFIX "xorl %1, %0"
: "+m" (v->counter)
: "ir" (i)
: "memory");
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index ae12acae5b06..87b496325b5b 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -22,14 +22,14 @@ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "addq %1,%0"
+ asm_inline volatile(LOCK_PREFIX "addq %1, %0"
: "=m" (v->counter)
: "er" (i), "m" (v->counter) : "memory");
}
static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "subq %1,%0"
+ asm_inline volatile(LOCK_PREFIX "subq %1, %0"
: "=m" (v->counter)
: "er" (i), "m" (v->counter) : "memory");
}
@@ -42,7 +42,7 @@ static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "incq %0"
+ asm_inline volatile(LOCK_PREFIX "incq %0"
: "=m" (v->counter)
: "m" (v->counter) : "memory");
}
@@ -50,7 +50,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v)
static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "decq %0"
+ asm_inline volatile(LOCK_PREFIX "decq %0"
: "=m" (v->counter)
: "m" (v->counter) : "memory");
}
@@ -110,7 +110,7 @@ static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "andq %1,%0"
+ asm_inline volatile(LOCK_PREFIX "andq %1, %0"
: "+m" (v->counter)
: "er" (i)
: "memory");
@@ -128,7 +128,7 @@ static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "orq %1,%0"
+ asm_inline volatile(LOCK_PREFIX "orq %1, %0"
: "+m" (v->counter)
: "er" (i)
: "memory");
@@ -146,7 +146,7 @@ static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
- asm volatile(LOCK_PREFIX "xorq %1,%0"
+ asm_inline volatile(LOCK_PREFIX "xorq %1, %0"
: "+m" (v->counter)
: "er" (i)
: "memory");
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index b96d45944c59..100413aff640 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -52,12 +52,12 @@ static __always_inline void
arch_set_bit(long nr, volatile unsigned long *addr)
{
if (__builtin_constant_p(nr)) {
- asm volatile(LOCK_PREFIX "orb %b1,%0"
+ asm_inline volatile(LOCK_PREFIX "orb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
: "iq" (CONST_MASK(nr))
: "memory");
} else {
- asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
+ asm_inline volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
@@ -72,11 +72,11 @@ static __always_inline void
arch_clear_bit(long nr, volatile unsigned long *addr)
{
if (__builtin_constant_p(nr)) {
- asm volatile(LOCK_PREFIX "andb %b1,%0"
+ asm_inline volatile(LOCK_PREFIX "andb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
: "iq" (~CONST_MASK(nr)));
} else {
- asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
+ asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
@@ -98,7 +98,7 @@ static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
volatile unsigned long *addr)
{
bool negative;
- asm volatile(LOCK_PREFIX "xorb %2,%1"
+ asm_inline volatile(LOCK_PREFIX "xorb %2,%1"
CC_SET(s)
: CC_OUT(s) (negative), WBYTE_ADDR(addr)
: "iq" ((char)mask) : "memory");
@@ -122,11 +122,11 @@ static __always_inline void
arch_change_bit(long nr, volatile unsigned long *addr)
{
if (__builtin_constant_p(nr)) {
- asm volatile(LOCK_PREFIX "xorb %b1,%0"
+ asm_inline volatile(LOCK_PREFIX "xorb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
: "iq" (CONST_MASK(nr)));
} else {
- asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
+ asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index fd8afc1f5f6b..b61f32c3459f 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -44,22 +44,22 @@ extern void __add_wrong_size(void)
__typeof__ (*(ptr)) __ret = (arg); \
switch (sizeof(*(ptr))) { \
case __X86_CASE_B: \
- asm volatile (lock #op "b %b0, %1\n" \
+ asm_inline volatile (lock #op "b %b0, %1" \
: "+q" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
case __X86_CASE_W: \
- asm volatile (lock #op "w %w0, %1\n" \
+ asm_inline volatile (lock #op "w %w0, %1" \
: "+r" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
case __X86_CASE_L: \
- asm volatile (lock #op "l %0, %1\n" \
+ asm_inline volatile (lock #op "l %0, %1" \
: "+r" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
case __X86_CASE_Q: \
- asm volatile (lock #op "q %q0, %1\n" \
+ asm_inline volatile (lock #op "q %q0, %1" \
: "+r" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
@@ -91,7 +91,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_B: \
{ \
volatile u8 *__ptr = (volatile u8 *)(ptr); \
- asm volatile(lock "cmpxchgb %2,%1" \
+ asm_inline volatile(lock "cmpxchgb %2, %1" \
: "=a" (__ret), "+m" (*__ptr) \
: "q" (__new), "0" (__old) \
: "memory"); \
@@ -100,7 +100,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_W: \
{ \
volatile u16 *__ptr = (volatile u16 *)(ptr); \
- asm volatile(lock "cmpxchgw %2,%1" \
+ asm_inline volatile(lock "cmpxchgw %2, %1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
@@ -109,7 +109,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_L: \
{ \
volatile u32 *__ptr = (volatile u32 *)(ptr); \
- asm volatile(lock "cmpxchgl %2,%1" \
+ asm_inline volatile(lock "cmpxchgl %2, %1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
@@ -118,7 +118,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_Q: \
{ \
volatile u64 *__ptr = (volatile u64 *)(ptr); \
- asm volatile(lock "cmpxchgq %2,%1" \
+ asm_inline volatile(lock "cmpxchgq %2, %1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
@@ -165,7 +165,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_B: \
{ \
volatile u8 *__ptr = (volatile u8 *)(_ptr); \
- asm volatile(lock "cmpxchgb %[new], %[ptr]" \
+ asm_inline volatile(lock "cmpxchgb %[new], %[ptr]" \
CC_SET(z) \
: CC_OUT(z) (success), \
[ptr] "+m" (*__ptr), \
@@ -177,7 +177,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_W: \
{ \
volatile u16 *__ptr = (volatile u16 *)(_ptr); \
- asm volatile(lock "cmpxchgw %[new], %[ptr]" \
+ asm_inline volatile(lock "cmpxchgw %[new], %[ptr]" \
CC_SET(z) \
: CC_OUT(z) (success), \
[ptr] "+m" (*__ptr), \
@@ -189,7 +189,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_L: \
{ \
volatile u32 *__ptr = (volatile u32 *)(_ptr); \
- asm volatile(lock "cmpxchgl %[new], %[ptr]" \
+ asm_inline volatile(lock "cmpxchgl %[new], %[ptr]" \
CC_SET(z) \
: CC_OUT(z) (success), \
[ptr] "+m" (*__ptr), \
@@ -201,7 +201,7 @@ extern void __add_wrong_size(void)
case __X86_CASE_Q: \
{ \
volatile u64 *__ptr = (volatile u64 *)(_ptr); \
- asm volatile(lock "cmpxchgq %[new], %[ptr]" \
+ asm_inline volatile(lock "cmpxchgq %[new], %[ptr]" \
CC_SET(z) \
: CC_OUT(z) (success), \
[ptr] "+m" (*__ptr), \
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 8806c646d452..3dd278593960 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -19,7 +19,7 @@ union __u64_halves {
union __u64_halves o = { .full = (_old), }, \
n = { .full = (_new), }; \
\
- asm volatile(_lock "cmpxchg8b %[ptr]" \
+ asm_inline volatile(_lock "cmpxchg8b %[ptr]" \
: [ptr] "+m" (*(_ptr)), \
"+a" (o.low), "+d" (o.high) \
: "b" (n.low), "c" (n.high) \
@@ -45,7 +45,7 @@ static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new
n = { .full = (_new), }; \
bool ret; \
\
- asm volatile(_lock "cmpxchg8b %[ptr]" \
+ asm_inline volatile(_lock "cmpxchg8b %[ptr]" \
CC_SET(e) \
: CC_OUT(e) (ret), \
[ptr] "+m" (*(_ptr)), \
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 5e241306db26..71d1e72ed879 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -38,7 +38,7 @@ union __u128_halves {
union __u128_halves o = { .full = (_old), }, \
n = { .full = (_new), }; \
\
- asm volatile(_lock "cmpxchg16b %[ptr]" \
+ asm_inline volatile(_lock "cmpxchg16b %[ptr]" \
: [ptr] "+m" (*(_ptr)), \
"+a" (o.low), "+d" (o.high) \
: "b" (n.low), "c" (n.high) \
@@ -65,7 +65,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old,
n = { .full = (_new), }; \
bool ret; \
\
- asm volatile(_lock "cmpxchg16b %[ptr]" \
+ asm_inline volatile(_lock "cmpxchg16b %[ptr]" \
CC_SET(e) \
: CC_OUT(e) (ret), \
[ptr] "+m" (*(_ptr)), \
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 363266cbcada..3821ee3fae35 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -29,7 +29,7 @@ cc_label: c = true; \
#define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \
({ \
bool c; \
- asm volatile (fullop CC_SET(cc) \
+ asm_inline volatile (fullop CC_SET(cc) \
: [var] "+m" (_var), CC_OUT(cc) (c) \
: __VA_ARGS__ : clobbers); \
c; \
--
2.48.1
* Uros Bizjak <ubizjak@gmail.com> wrote:
> According to:
>
> https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html
>
> the usage of asm pseudo directives in the asm template can confuse
> the compiler to wrongly estimate the size of the generated
> code.
>
> The LOCK_PREFIX macro expands to several asm pseudo directives, so
> its usage in atomic locking insns causes instruction length estimate
> to fail significantly (the specially instrumented compiler reports
> the estimated length of these asm templates to be 6 instructions long).
>
> This incorrect estimate further causes unoptimal inlining decisions,
> unoptimal instruction scheduling and unoptimal code block alignments
> for functions that use these locking primitives.
>
> Use asm_inline instead:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html
>
> which is a feature that makes GCC pretend some inline assembler code
> is tiny (while it would think it is huge), instead of just asm.
>
> For code size estimation, the size of the asm is then taken as
> the minimum size of one instruction, ignoring how many instructions
> compiler thinks it is.
>
> The code size of the resulting x86_64 defconfig object file increases
> for 33.264 kbytes, representing 1.2% code size increase:
>
> text data bss dec hex filename
> 27450107 4633332 814148 32897587 1f5fa33 vmlinux-old.o
> 27483371 4633784 814148 32931303 1f67de7 vmlinux-new.o
>
> mainly due to different inlining decisions of -O2 build.
So my request here would be not more benchmark figures (I don't think
it's a realistic expectation for contributors to be able to measure
much of an effect with such a type of change, let alone be certain
what a macro or micro-benchmark measures is causally connected with
the patch), but I'd like to ask for some qualitative analysis on the
code generation side:
- +1.2% code size increase is a lot, especially if it's under the
default build flags of the kernel. Where does the extra code come
from?
- Is there any effect on Clang? Are its inlining decisions around
these asm() statements comparable, worse/better?
A couple of concrete examples would go a long way:
- "Function XXX was inlined 3 times before the patch, and it was
inlined 30 times after the patch. I have reviewed two such inlining
locations, and they have added more code to unlikely or
failure-handling branches collected near the function epilogue,
while the fast-path of the function was more optimal."
Or you might end up finding:
- "Function YYY was inlined 3x more frequently after the patch, but
the inlining decision increased register pressure and created less
optimal code in the fast-path, increasing both code size and likely
decreasing fast-path performance."
Obviously we'd be sad about the second case, but it's well within the
spectrum of possibilities when we look at "+1.2% object code size
increase".
What we cannot do is to throw up our hands and claim "-O2 trades
performance for size, and thus this patch improves performance".
We don't know that for sure and 30 years of kernel development
created a love-and-hate relationship and a fair level of distrust
between kernel developers and compiler inlining decisions,
especially around x86 asm() statements ...
So these are roughly the high level requirements around such patches.
Does this make sense?
Thanks,
Ingo
On Thu, Mar 6, 2025 at 10:57 AM Ingo Molnar <mingo@kernel.org> wrote:
> - Is there any effect on Clang? Are its inlining decisions around
>   these asm() statements comparable, worse/better?
FTR, clang recognizes "asm inline", but there was no difference in code sizes:
   text    data     bss     dec     hex filename
27577163 4503078  807732 32887973 1f5d4a5 vmlinux-clang-patched.o
27577181 4503078  807732 32887991 1f5d4b7 vmlinux-clang-unpatched.o
Uros.
On Thu, Mar 6, 2025 at 10:57 AM Ingo Molnar <mingo@kernel.org> wrote:
> - +1.2% code size increase is a lot, especially if it's under the
>   default build flags of the kernel. Where does the extra code come
>   from?
>
> - Is there any effect on Clang? Are its inlining decisions around
>   these asm() statements comparable, worse/better?
Bah, apparently I can't calculate... the code increase is in fact 0.14%,
but still 33k.
bloat-o-meter shows top grows (> 500 bytes):
add/remove: 82/283 grow/shrink: 870/372 up/down: 76272/-43618 (32654)
Function                                     old     new   delta
copy_process                                6465   10191   +3726
balance_dirty_pages_ratelimited_flags        237    2949   +2712
icl_plane_update_noarm                      5800    7969   +2169
samsung_input_mapping                       3375    5170   +1795
ext4_do_update_inode.isra                      -    1526   +1526
__schedule                                  2416    3472   +1056
__i915_vma_resource_unhold                     -     946    +946
sched_mm_cid_after_execve                    175    1097    +922
__do_sys_membarrier                            -     862    +862
filemap_fault                               2666    3462    +796
nl80211_send_wiphy                         11185   11874    +689
samsung_input_mapping.cold                   900    1500    +600
virtio_gpu_queue_fenced_ctrl_buffer          839    1410    +571
ilk_update_pipe_csc                         1201    1735    +534
enable_step                                    -     525    +525
icl_color_commit_noarm                      1334    1847    +513
tg3_read_bc_ver                                -     501    +501
and top shrinks ( < 500 bytes):
nl80211_send_iftype_data                     580       -    -580
samsung_gamepad_input_mapping.isra.cold      604       -    -604
virtio_gpu_queue_ctrl_sgs                    724       -    -724
tg3_get_invariants                          9218    8376    -842
__i915_vma_resource_unhold.part              899       -    -899
ext4_mark_iloc_dirty                        1735     106   -1629
samsung_gamepad_input_mapping.isra          2046       -   -2046
icl_program_input_csc                       2203       -   -2203
copy_mm                                     2242       -   -2242
balance_dirty_pages                         2657       -   -2657
Total: Before=22770320, After=22802974, chg +0.14%
> So these are roughly the high level requirements around such patches.
> Does this make sense?
In my opinion, the writing above makes perfect sense. As far as I'm
concerned, I'm able and can do the above code analysis, the
problematic part would be precise performance measurements. Although
with your instructions, I can also try that.
Thanks,
Uros.
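For reference, the per-symbol deltas above come from the in-tree script
scripts/bloat-o-meter; a minimal invocation, assuming the two object files
named in the commit message, would be:

  $ ./scripts/bloat-o-meter vmlinux-old.o vmlinux-new.o | head -40

which prints the add/remove summary followed by the per-symbol size deltas.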
* Uros Bizjak <ubizjak@gmail.com> wrote:
> > So these are roughly the high level requirements around such patches.
> > Does this make sense?
>
> In my opinion, the writing above makes perfect sense. As far as I'm
> concerned, I'm able and can do the above code analysis, the
> problematic part would be precise performance measurements. Although
> with your instructions, I can also try that.
Yeah, so *personally* I find the kind of code generation analysis you
routinely perform for your micro-optimization patches far more useful
and persuasive, because it's basically a first principles argument:
instructions removed are an inarguable positive in the overwhelming
majority of cases, all other things being equal (as long as it doesn't
come at the expense of more function calls or worse instructions, etc.).
For inlining decisions code generation analysis is arguably more
complicated - but that's the nature of inlining related patches.
Performance measurements can back up such arguments, and being more
proficient in perf tooling is a useful toolbox to have anyway, but it's
fundamentally a stochastic argument for something as comparatively small
as a +0.12% code size increase.
But if code generation analysis is inconclusive or even negative, then
performance measurements can trump all of that, but it's a substantial
barrier of entry as you noted - and I'm somewhat sceptical whether a
0.12% code generation effect *can* even be measured reliably even with
the best of expertise and infrastructure...
Also, to shorten build & test times you can use the x86-64 defconfig.
It's a config more or less representative of what major distros enable,
and it's even bootable on some systems and in VMs, but it builds in far
less time.
Thanks,
Ingo
* Ingo Molnar <mingo@kernel.org> wrote:
> Also, to shorten build & test times you can use the x86-64 defconfig.
> It's a config more or less representative of what major distros
> enable, and it's even bootable on some systems and in VMs, but it
> builds in far less time.
And if your primary test method is KVM+Qemu, then the following build
method will give you a representative core kernel bzImage that will
boot most cloud VM images as-is:
$ make -j128 ARCH=x86 defconfig kvm_guest.config bzImage
$ ll arch/x86/boot/bzImage
-rw-rw-r-- 1 mingo mingo 13788160 Mar 6 11:42 arch/x86/boot/bzImage
And you can boot up a .raw distro image in an xterm:
$ RAW=your_cloud_image.raw
$ PARTUUID=your_target_root_partition_UUID
$ qemu-system-x86_64 \
-enable-kvm \
-smp cpus=4 \
-m 2048 \
-machine q35 \
-cpu host \
-global ICH9-LPC.disable_s3=1 \
-net nic,model=virtio \
-net user,hostfwd=tcp::8022-:22,hostfwd=tcp::8090-:80 \
-drive "file=$RAW",if=none,format=raw,id=disk1 \
-device virtio-blk-pci,drive=disk1,bootindex=1 \
-serial mon:stdio \
-nographic \
-append "root=PARTUUID=$ID ro console=tty0 console=ttyS0,115200 earlyprintk=ttyS0,115200 consoleblank=0 ignore_loglevel" \
-kernel arch/x86/boot/bzImage
This way you don't need any initrd build or modules nonsense -
everything necessary is built in.
Bootable raw distro images can be found in numerous places, for example
at:
https://cloud.debian.org/images/cloud/bookworm-backports/
( And since I'm lazy to figure it out the 'cloud way', I usually read the
root UUID from the bootlog of the first unsuccessful attempt. )
Thanks,
Ingo
On 2/28/25 04:35, Uros Bizjak wrote:
> The code size of the resulting x86_64 defconfig object file increases
> for 33.264 kbytes, representing 1.2% code size increase:
>
>   text    data     bss     dec     hex filename
> 27450107 4633332  814148 32897587 1f5fa33 vmlinux-old.o
> 27483371 4633784  814148 32931303 1f67de7 vmlinux-new.o
So, first of all, thank you for including some objective measurement of
the impact of your patches. It's much appreciated.
But I think the patches need to come with a solid theory of why they're
good. The minimum bar for that, I think, is *some* kind of actual
real-world performance test. I'm not picky. Just *something* that spends
a lot of time in the kernel and ideally where a profile points at some
of the code you're poking here.
I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
But performance patches need to come with performance *numbers*.
On 2/28/25 08:48, Dave Hansen wrote:
> I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
> compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
>
> But performance patches need to come with performance *numbers*.
Incidentally, this is exactly the reason why gcc added "asm inline" *at
our request*. We just haven't caught up with it everywhere yet.
In fact, I would wonder if we shouldn't simply do:
#define asm __asm__ __inline__
#define asm_noinline __asm__
... in other words, to make asm inline an opt-out instead of an opt-in.
It is comparatively unusual that we do complex things in inline assembly
that we would want gcc to treat as something that should be avoided.
-hpa
On Sat, Mar 8, 2025 at 8:08 PM H. Peter Anvin <hpa@zytor.com> wrote:
> In fact, I would wonder if we shouldn't simply do:
>
> #define asm __asm__ __inline__
> #define asm_noinline __asm__
>
> ... in other words, to make asm inline an opt-out instead of an opt-in.
> It is comparatively unusual that we do complex things in inline assembly
> that we would want gcc to treat as something that should be avoided.
I don't think we need such radical changes. There are only a few
groups of instructions, nicely hidden behind macros, that need asm
noinline. Alternatives (gcc counted them as 20 - 23 instructions) are
already using asm inline (please see
arch/x86/include/asm/alternative.h) in their high-level macros, and my
proposed patch converts all asm using LOCK_PREFIX by amending macros
in 7 header files.
Uros.
On Sun, 9 Mar 2025 08:50:08 +0100 Uros Bizjak <ubizjak@gmail.com> wrote:
> On Sat, Mar 8, 2025 at 8:08 PM H. Peter Anvin <hpa@zytor.com> wrote:
...
> > In fact, I would wonder if we shouldn't simply do:
> >
> > #define asm __asm__ __inline__
> > #define asm_noinline __asm__
> >
> > ... in other words, to make asm inline an opt-out instead of an opt-in.
The asm statements themselves get inlined (typically they are in an
always_inline wrapper), the size affects whether the calling code is inlined.
You are now in the 'games' of I$ fetches, overall code size and TLB lookups
(not helped by the speculative execution mitigations for 'ret').
> > It is comparatively unusual that we do complex things in inline assembly
> > that we would want gcc to treat as something that should be avoided.
>
> I don't think we need such radical changes. There are only a few
> groups of instructions, nicely hidden behind macros, that need asm
> noinline. Alternatives (gcc counted them as 20 - 23 instructions) are
> already using asm inline (please see
> arch/x86/include/asm/alternative.h) in their high-level macros, and my
> proposed patch converts all asm using LOCK_PREFIX by amending macros
> in 7 header files.
The other ones that are likely to get mis-sized are the ones that change
the section to add annotations.
The size overestimate may be better in order to reduce the number of
annotations?
David
On Sun, Mar 9, 2025 at 10:46 AM David Laight <david.laight.linux@gmail.com> wrote:
> The other ones that are likely to get mis-sized are the ones that change
> the section to add annotations.
> The size overestimate may be better in order to reduce the number of
> annotations?
Yes, this is why I think it is better to use asm inline on a per-case
basis. While ALTERNATIVE and LOCK_HERE instructions result in one insn
(and can be trivially amended with asm inline to instruct the compiler
about this fact), this may not always be the case.
Uros.
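To make the per-case distinction concrete, here is a sketch of the two
situations being discussed; the helper names, section name in the second
example and the simplified macros are illustrative, not the kernel's exact
code:

  /* Sketch only: illustrative names, simplified macros. */
  #ifndef asm_inline
  # define asm_inline asm inline		/* GCC >= 9 extension */
  #endif

  /* One real instruction in .text; the directives only emit an entry
   * into the .smp_locks side section, so the template is marked as
   * tiny with asm inline. */
  static inline void lock_inc(int *p)
  {
  	asm_inline volatile(".pushsection .smp_locks,\"a\"\n"
  			    ".balign 4\n"
  			    ".long 671f - .\n"
  			    ".popsection\n"
  			    "671:\n"
  			    "\tlock incl %0"
  			    : "+m" (*p) : : "memory");
  }

  /* Also a single instruction, but every inlined copy adds another
   * entry to a (hypothetical) annotation section; here the plain-asm
   * overestimate can be the safer default, since it discourages
   * inlining and so limits the number of duplicated annotations. */
  static inline void annotated_nop(void)
  {
  	asm volatile("722:\n\t"
  		     "nop\n\t"
  		     ".pushsection .discard.example_notes, \"a\"\n\t"
  		     ".long 722b - .\n\t"
  		     ".popsection"
  		     : : : "memory");
  }

In the first case asm inline drops the estimate from roughly six
instructions to one; in the second, keeping the larger default estimate is
a deliberate choice, which is why the marking is best decided per macro.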
On Fri, Feb 28, 2025 at 5:48 PM Dave Hansen <dave.hansen@intel.com> wrote:
> I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
> compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop.
>
> But performance patches need to come with performance *numbers*.
Please find lmbench results from unpatched (fedora.0) and patched
(fedora.1) fedora-41 6.13.5 kernels.
lmbench is from [1]
[1] https://fedora.pkgs.org/41/rpm-sphere-x86_64/lmbench-3.0-0.a9.3.x86_64.rpm.html
Some tests show quite different results, but I'd appreciate some help
in interpreting the results. Maybe they show that the price of 33
kbytes is worth the improvement, or they will motivate someone
experienced in kernel benchmarks to benchmark the patch in a more
scientific way.
Thanks,
Uros.
On Sun, Mar 2, 2025 at 9:56 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> Please find lmbench results from unpatched (fedora.0) and patched
> (fedora.1) fedora-41 6.13.5 kernels.
These go from:
Process fork+exit: 270.0952 microseconds
Process fork+execve: 2620.3333 microseconds
Process fork+/bin/sh -c: 6781.0000 microseconds
File /usr/tmp/XXX write bandwidth: 1780350 KB/sec
Pagefaults on /usr/tmp/XXX: 0.3875 microseconds
to:
Process fork+exit: 298.6842 microseconds
Process fork+execve: 1662.7500 microseconds
Process fork+/bin/sh -c: 2127.6667 microseconds
File /usr/tmp/XXX write bandwidth: 1950077 KB/sec
Pagefaults on /usr/tmp/XXX: 0.1958 microseconds
and from:
Socket bandwidth using localhost
0.000001 2.52 MB/sec
0.000064 163.02 MB/sec
0.000128 321.70 MB/sec
0.000256 630.06 MB/sec
0.000512 1207.07 MB/sec
0.001024 2004.06 MB/sec
0.001437 2475.43 MB/sec
10.000000 5817.34 MB/sec
Avg xfer: 3.2KB, 41.8KB in 1.2230 millisecs, 34.15 MB/sec
AF_UNIX sock stream bandwidth: 9850.01 MB/sec
Pipe bandwidth: 4631.28 MB/sec
to:
Socket bandwidth using localhost
0.000001 3.13 MB/sec
0.000064 187.08 MB/sec
0.000128 324.12 MB/sec
0.000256 618.51 MB/sec
0.000512 1137.13 MB/sec
0.001024 1962.95 MB/sec
0.001437 2458.27 MB/sec
10.000000 6168.08 MB/sec
Avg xfer: 3.2KB, 41.8KB in 1.0060 millisecs, 41.52 MB/sec
AF_UNIX sock stream bandwidth: 9921.68 MB/sec
Pipe bandwidth: 4649.96 MB/sec
Uros.
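For context, the figures above correspond to individual lmbench-3
micro-benchmarks; assuming the packaged binaries are on $PATH (option
spellings may differ slightly between lmbench builds), they can be rerun
one at a time roughly like this:

  $ lat_proc fork                 # Process fork+exit
  $ lat_proc exec                 # Process fork+execve
  $ lat_proc shell                # Process fork+/bin/sh -c
  $ lat_pagefault /usr/tmp/XXX    # Pagefaults on /usr/tmp/XXX
  $ bw_unix                       # AF_UNIX sock stream bandwidth
  $ bw_pipe                       # Pipe bandwidth

running each several times on both kernels to get a feel for the
run-to-run noise.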
On Fri, Feb 28, 2025 at 5:48 PM Dave Hansen <dave.hansen@intel.com> wrote:
> But performance patches need to come with performance *numbers*.
I don't consider this patch a performance patch, it is more a patch
that fixes a correctness issue. The compiler estimates the number of
instructions in the asm template wrong, so the patch instructs the
compiler that everything in the template in fact results in a single
instruction, no matter the pseudos there. The correct estimation then
allows the compiler to do its job better (e.g. better scheduling,
better inlining decisions, etc...).
The metric of code size is excellent for an -Os compile, but not so good
for an -O2 compile, and the measured results mirror that. In the -O2
case, we actually requested that the compiler prioritize performance,
not code size, so the code size measurements are only of limited
relevance. The purpose of these measurements is to show that the effect
of the patch is limited to the expected 1% of code size difference.
I don't expect some noticeable performance changes from the
non-algorithmic patch like this. TBH, I would be surprised if they
were outside the measurement noise. Nevertheless, I'll try to provide
some performance numbers.
Thanks,
Uros.
On 2/28/25 14:31, Uros Bizjak wrote:
> I don't consider this patch a performance patch, it is more a patch
> that fixes a correctness issue. The compiler estimates the number of
> instructions in the asm template wrong, so the patch instructs the
> compiler that everything in the template in fact results in a single
> instruction, no matter the pseudos there. The correct estimation then
> allows the compiler to do its job better (e.g. better scheduling,
> better inlining decisions, etc...).
Why does it matter if the compiler does its job better?
I'll let the other folks who maintain this code chime in if they think
I'm off my rocker. But, *I* consider this -- and all of these, frankly
-- performance patches.
> I don't expect some noticeable performance changes from the
> non-algorithmic patch like this. TBH, I would be surprised if they
> were outside the measurement noise. Nevertheless, I'll try to provide
> some performance numbers.
That would be appreciated, thanks!
On Fri, 28 Feb 2025 14:58:47 -0800 Dave Hansen <dave.hansen@intel.com> wrote:
> On 2/28/25 14:31, Uros Bizjak wrote:
> > The correct estimation then
> > allows the compiler to do its job better (e.g. better scheduling,
> > better inlining decisions, etc...).
>
> Why does it matter if the compiler does its job better?
>
> I'll let the other folks who maintain this code chime in if they think
> I'm off my rocker. But, *I* consider this -- and all of these, frankly
> -- performance patches.
I was looking at some size changes related to a different 'trivial' code
change. It caused gcc to make apparently unrelated inlining decisions
that caused some functions to grow/shrink by +/-100+ bytes even though
the actual change would mostly only add/remove a single instruction.
I've lost the patch for this one, but if the asm block does expand to a
single instruction it is likely to make gcc decide to inline one of the
functions that uses it - so increasing overall code size.
Whether that helps or hinders performance is difficult to say.
David
On Fri, Feb 28, 2025 at 11:58 PM Dave Hansen <dave.hansen@intel.com> wrote:
> > I don't consider this patch a performance patch, it is more a patch
> > that fixes a correctness issue. The compiler estimates the number of
> > instructions in the asm template wrong, so the patch instructs the
> > compiler that everything in the template in fact results in a single
> > instruction, no matter the pseudos there. The correct estimation then
> > allows the compiler to do its job better (e.g. better scheduling,
> > better inlining decisions, etc...).
>
> Why does it matter if the compiler does its job better?
Please read the long thread [1], especially part [1.1], that was the
reason for gcc to implement asm inline [2].
[1] https://lore.kernel.org/lkml/20181003213100.189959-1-namit@vmware.com/
[1.1] https://lore.kernel.org/lkml/20181007091805.GA30687@zn.tnic/
[2] https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html
Accurate inline decisions are just one of the compiler optimizations
that depend on the code growth factor; tail duplication [3] is another
one, and there are also code hoisting, function cloning, block
reordering and basic block copying, to name a few from the top of my
head.
[3] https://gcc.gnu.org/projects/sched-treegion.html
These all work better with accurate input data. These optimizations are
also the reason for the 1% code growth with -O2: additional code blocks
now fall under the code size threshold that enables the mentioned
optimizations, under the assumption of -O2 code size/performance
tradeoffs. OTOH, -Os, where different code size/performance heuristics
are used, now performs better w.r.t. code size.
Uros.
On Sat, Mar 01, 2025 at 10:05:56AM +0100, Uros Bizjak wrote:
> OTOH, -Os, where different code size/performance heuristics are used, now
> performs better w.r.t code size.
Did anything change since:
281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
3a55fb0d9fe8 ("Tell the world we gave up on pushing CC_OPTIMIZE_FOR_SIZE")
wrt -Os?
Because if not, we still don't love -Os and you can drop the -Os argument.
And without any perf data showing any improvement, this patch does nothing but
enlarge -O2 size...
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Sat, Mar 1, 2025 at 1:38 PM Borislav Petkov <bp@alien8.de> wrote:
>
> On Sat, Mar 01, 2025 at 10:05:56AM +0100, Uros Bizjak wrote:
> > OTOH, -Os, where different code size/performance heuristics are used, now
> > performs better w.r.t code size.
>
> Did anything change since:
>
> 281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
> 3a55fb0d9fe8 ("Tell the world we gave up on pushing CC_OPTIMIZE_FOR_SIZE")
>
> wrt -Os?
>
> Because if not, we still don't love -Os and you can drop the -Os argument.
The -Os argument was to show the effect of the patch when the compiler
is instructed to take care of the overall size. Giving the compiler
-O2 and then looking at the overall size of the produced binary is
just wrong.
> And without any perf data showing any improvement, this patch does nothing but
> enlarge -O2 size...
Even to my surprise, the patch has some noticeable effects on the
performance, please see the attachment in [1] for LMBench data or [2]
for some excerpts from the data. So, I think the patch has potential
to improve the performance.
[1] https://lore.kernel.org/lkml/CAFULd4YBcG45bigHBox2pu+To+Y5BzbRxG+pUr42AVOWSnfKsg@mail.gmail.com/
[2] https://lore.kernel.org/lkml/CAFULd4ZsSKwJ4Dz3cCAgaVsa4ypbb0e2savO-3_Ltbs=1wzgKQ@mail.gmail.com/
Thanks,
Uros.
On Wed, Mar 05, 2025 at 09:54:11AM +0100, Uros Bizjak wrote:
> The -Os argument was to show the effect of the patch when the compiler
> is instructed to take care of the overall size. Giving the compiler
> -O2 and then looking at the overall size of the produced binary is
> just wrong.
No one cares about -Os AFAICT. It might as well be non-existent. So the effect
doesn't matter.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, Mar 05, 2025 at 09:36:33PM +0100, Borislav Petkov wrote:
> On Wed, Mar 05, 2025 at 09:54:11AM +0100, Uros Bizjak wrote:
> > The -Os argument was to show the effect of the patch when the compiler
> > is instructed to take care of the overall size. Giving the compiler
> > -O2 and then looking at the overall size of the produced binary is
> > just wrong.
>
> No one cares about -Os AFAICT. It might as well be non-existent. So the
> effect doesn't matter.
Well, more people would care if it didn't stand for -Ostupid I suppose.
That is, traditionally GCC made some very questionable choices with -Os,
quite horrendous code-gen.
On Wed, Mar 5, 2025 at 10:26 PM Peter Zijlstra <peterz@infradead.org> wrote:
> Well, more people would care if it didn't stand for -Ostupid I suppose.
> That is, traditionally GCC made some very questionable choices with -Os,
> quite horrendous code-gen.
Size optimizations result in a 15% code size reduction (x86_64
defconfig, gcc-14.2), so they reflect what the user wanted:
   text    data     bss     dec     hex filename
27478996 4635807  814660 32929463 1f676b7 vmlinux-O2.o
23859143 4617419  814724 29291286 1bef316 vmlinux-Os.o
The compiler heuristics depend on tradeoffs, and -Os uses different
tradeoffs than -O2. Unfortunately, there is no
-Os-but-I-really-want-performance switch, but OTOH, tradeoffs can be
adjusted. The compiler is open-source, and these adjustments can be
discussed in public spaces (mailing lists and bugzilla) and eventually
re-tuned. We are aware that the world around us changes, so tunings
are not set in stone, but we also depend on user feedback.
Thanks,
Uros.
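As one illustration of such re-tuning without leaving -O2, GCC exposes its
inlining heuristics as --param knobs that can be passed through kbuild; the
parameter names below are GCC's, while the values are arbitrary examples
rather than recommendations:

  $ make KCFLAGS='--param max-inline-insns-single=100 --param inline-unit-growth=15' bzImage

Comparing the resulting text sizes (and bloat-o-meter output) against a
stock -O2 build is one cheap way to see how sensitive the kernel is to
these thresholds.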
* Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Mar 5, 2025 at 10:26 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Mar 05, 2025 at 09:36:33PM +0100, Borislav Petkov wrote:
> > > On Wed, Mar 05, 2025 at 09:54:11AM +0100, Uros Bizjak wrote:
> > > > The -Os argument was to show the effect of the patch when the compiler
> > > > is instructed to take care of the overall size. Giving the compiler
> > > > -O2 and then looking at the overall size of the produced binary is
> > > > just wrong.
> > >
> > > No one cares about -Os AFAICT. It might as well be non-existent. So the effect
> > > doesn't matter.
> >
> > Well, more people would care if it didn't stand for -Ostupid I suppose.
> > That is, traditionally GCC made some very questionable choices with -Os,
> > quite horrendous code-gen.
>
> Size optimizations result in 15% code size reduction (x86_64
> defconfig, gcc-14.2), so they reflect what user wanted:
>
> text data bss dec hex filename
> 27478996 4635807 814660 32929463 1f676b7 vmlinux-O2.o
> 23859143 4617419 814724 29291286 1bef316 vmlinux-Os.o
>
> The compiler heuristics depend on tradeoffs, and -Os uses different
> tradeoffs than -O2. Unfortunately, there is no
> -Os-but-I-really-want-performace switch, but OTOH, tradeoffs can be
> adjusted. The compiler is open-source, and these adjustments can be
> discussed in public spaces (mailing lists and bugzilla) and eventually
> re-tuned. We are aware that the world around us changes, so tunings
> are not set in stone, but we also depend on user feedback.
So the best way to drive -Os forward is not to insist that it's good
(it might still be crap), and not to insist that it's crap (it might
have become better), but to dig out old problems and to look at what
kind of code current compilers generate in the kernel with -Os.
There's been a few pathological GCC optimizations in the past, but also
other problems, such as this one 9 years ago that hid useful warnings:
=================>
877417e6ffb9 Kbuild: change CC_OPTIMIZE_FOR_SIZE definition
=================>
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 25 Apr 2016 17:35:27 +0200
Subject: [PATCH] Kbuild: change CC_OPTIMIZE_FOR_SIZE definition
CC_OPTIMIZE_FOR_SIZE disables the often useful -Wmaybe-uninitialized warning,
because that causes a ridiculous amount of false positives when combined
with -Os.
This means a lot of warnings don't show up in testing by the developers
that should see them with an 'allmodconfig' kernel that has
CC_OPTIMIZE_FOR_SIZE enabled, but only later in randconfig builds
that don't.
And this one by Linus, 14 years ago:
=================>
281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
=================>
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 22 May 2011 14:30:36 -0700
Subject: [PATCH] Give up on pushing CC_OPTIMIZE_FOR_SIZE
I still happen to believe that I$ miss costs are a major thing, but
sadly, -Os doesn't seem to be the solution. With or without it, gcc
will miss some obvious code size improvements, and with it enabled gcc
will sometimes make choices that aren't good even with high I$ miss
ratios.
For example, with -Os, gcc on x86 will turn a 20-byte constant memcpy
into a "rep movsl". While I sincerely hope that x86 CPU's will some day
do a good job at that, they certainly don't do it yet, and the cost is
higher than a L1 I$ miss would be.
Some day I hope we can re-enable this.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
I'm quite sure there were more, but these were the ones that popped up
in a quick round of Git archeology.
And yeah, it was me who pushed for -Os originally 17 years ago, due to
the positive I$ impact, in theory:
=================>
96fffeb4b413 ("make CC_OPTIMIZE_FOR_SIZE non-experimental")
=================>
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 28 Apr 2008 01:39:43 +0200
Subject: [PATCH] make CC_OPTIMIZE_FOR_SIZE non-experimental
this option has been the default on a wide range of distributions
for a long time - time to make it non-experimental.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
But practice disagreed with theory, and obviously in the kernel
practice has supremacy.
But yes, I'd cautiously agree that reduced kernel size with a -Os build
is a stochastic proxy metric for better code and better performance -
but it comes with caveats and needs to be backed by other data or
robust first principles arguments too.
Thanks,
Ingo
On Thu, 6 Mar 2025 10:43:26 +0100
Ingo Molnar <mingo@kernel.org> wrote:
> * Uros Bizjak <ubizjak@gmail.com> wrote:
...
> And this one by Linus, 14 years ago:
>
> =================>
> 281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
> =================>
>
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Date: Sun, 22 May 2011 14:30:36 -0700
> Subject: [PATCH] Give up on pushing CC_OPTIMIZE_FOR_SIZE
>
> I still happen to believe that I$ miss costs are a major thing, but
> sadly, -Os doesn't seem to be the solution. With or without it, gcc
> will miss some obvious code size improvements, and with it enabled gcc
> will sometimes make choices that aren't good even with high I$ miss
> ratios.
>
> For example, with -Os, gcc on x86 will turn a 20-byte constant memcpy
> into a "rep movsl". While I sincerely hope that x86 CPU's will some day
> do a good job at that, they certainly don't do it yet, and the cost is
> higher than a L1 I$ miss would be.
Well 'rep movsb' is a lot better than it was then.
Even on Sandy bridge (IIRC) it is ~20 clocks for short transfers (of any length).
Unlike the P4 with a 140 clock overhead!
Still slower for short fixed sizes, but probably good for anything variable
because of the costs of the function call and the conditionals to select the
'best' algorithm.
OTOH if you know it is only a few bytes a code loop may be best - and gcc will
convert it to a memcpy() call for you!
The really silly one was 'push immd_byte; pop reg' to get a sign extended value.
But I do remember -O2 being smaller than -Oz !
Just changing the inlining thresholds and code replication on loops
(and never unrolling loops) would probably be a good start.
David
On Thu, Mar 6, 2025, at 10:43, Ingo Molnar wrote:
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>> On Wed, Mar 5, 2025 at 10:26 PM Peter Zijlstra <peterz@infradead.org> wrote:
> So the best way to drive -Os forward is not to insist that it's good
> (it might still be crap), and not to insist that it's crap (it might
> have become better), but to dig out old problems and to look at what
> kind of code current compilers generate in the kernel with -Os.
>
> There's been a few pathological GCC optimizations in the past, but also
> other problems, such as this one 9 years ago that hid useful warnings:
>
> =================>
> 877417e6ffb9 Kbuild: change CC_OPTIMIZE_FOR_SIZE definition
> =================>
>
> From: Arnd Bergmann <arnd@arndb.de>
> Date: Mon, 25 Apr 2016 17:35:27 +0200
> Subject: [PATCH] Kbuild: change CC_OPTIMIZE_FOR_SIZE definition
>
> CC_OPTIMIZE_FOR_SIZE disables the often useful -Wmaybe-unused warning,
> because that causes a ridiculous amount of false positives when combined
> with -Os.
I think that should have said '-Wmaybe-uninitialized', which is a thing
of the past, since (IIRC) gcc-9 changed the default inlining rules and
Linus turned it off globally after the false positives started happening
with -O2 as well.
This is rather annoying of course, since we miss a lot of warnings
for real bugs. At least clang's -Wsometimes-uninitialized and smatch
still catch the important ones in the CI systems, but we keep seeing
new ones in linux-next and have to patch them.
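For readers unfamiliar with this warning class, the pattern that tends
to produce the false positives is an out-parameter that is only written
when a helper reports success. A reduced, hypothetical example follows;
whether it actually warns depends on the GCC version, optimization
level and inlining decisions:

static int maybe_get(int cond, int *val)
{
	if (cond)
		*val = 42;	/* 'val' is written only on success */
	return cond;
}

int f(int cond)
{
	int val;

	/*
	 * Correct: 'val' is read only when maybe_get() reported success,
	 * but after inlining the compiler may still warn that 'val' may
	 * be used uninitialized.
	 */
	if (maybe_get(cond, &val))
		return val;

	return 0;
}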
> But yes, I'd cautiously agree that reduced kernel size with a -Os build
> is a stochastic proxy metric for better code and better performance -
> but it comes with caveats and needs to be backed by other data or
> robust first principles arguments too.
My impression is that for the most part, the -Os and -O2 options do
exactly what they promise, picking either a smaller or faster object
code. Clearly there is no single ideal choice because if some
optimization step is a win for both performance and size it would
be turned on for both.
The crazy options (-Oz and -O3) on the other hand are intentionally
not offered by Kconfig.
Arnd
* Uros Bizjak <ubizjak@gmail.com> wrote:
> On Sat, Mar 1, 2025 at 1:38 PM Borislav Petkov <bp@alien8.de> wrote:
> >
> > On Sat, Mar 01, 2025 at 10:05:56AM +0100, Uros Bizjak wrote:
> > > OTOH, -Os, where different code size/performance heuristics are used, now
> > > performs better w.r.t code size.
> >
> > Did anything change since:
> >
> > 281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
> > 3a55fb0d9fe8 ("Tell the world we gave up on pushing CC_OPTIMIZE_FOR_SIZE")
> >
> > wrt -Os?
> >
> > Because if not, we still don't love -Os and you can drop the -Os argument.
>
> The -Os argument was to show the effect of the patch when the compiler
> is instructed to take care of the overall size. Giving the compiler
> -O2 and then looking at the overall size of the produced binary is
> just wrong.
>
> > And without any perf data showing any improvement, this patch does nothing but
> > enlarge -O2 size...
>
> Even to my surprise, the patch has some noticeable effects on the
> performance, please see the attachment in [1] for LMBench data or [2]
> for some excerpts from the data. So, I think the patch has potential
> to improve the performance.
>
> [1] https://lore.kernel.org/lkml/CAFULd4YBcG45bigHBox2pu+To+Y5BzbRxG+pUr42AVOWSnfKsg@mail.gmail.com/
> [2] https://lore.kernel.org/lkml/CAFULd4ZsSKwJ4Dz3cCAgaVsa4ypbb0e2savO-3_Ltbs=1wzgKQ@mail.gmail.com/
If you are measuring micro-costs, please make sure you pin the workload
to a single CPU (via 'taskset' for example) and run 'perf stat --null
--repeat 5' or so to measure the run-over-run noise of the benchmark.
Thanks,
Ingo
* Ingo Molnar <mingo@kernel.org> wrote:
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > On Sat, Mar 1, 2025 at 1:38 PM Borislav Petkov <bp@alien8.de> wrote:
> > >
> > > On Sat, Mar 01, 2025 at 10:05:56AM +0100, Uros Bizjak wrote:
> > > > OTOH, -Os, where different code size/performance heuristics are used, now
> > > > performs better w.r.t code size.
> > >
> > > Did anything change since:
> > >
> > > 281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
> > > 3a55fb0d9fe8 ("Tell the world we gave up on pushing CC_OPTIMIZE_FOR_SIZE")
> > >
> > > wrt -Os?
> > >
> > > Because if not, we still don't love -Os and you can drop the -Os argument.
> >
> > The -Os argument was to show the effect of the patch when the compiler
> > is instructed to take care of the overall size. Giving the compiler
> > -O2 and then looking at the overall size of the produced binary is
> > just wrong.
> >
> > > And without any perf data showing any improvement, this patch does nothing but
> > > enlarge -O2 size...
> >
> > Even to my surprise, the patch has some noticeable effects on the
> > performance, please see the attachment in [1] for LMBench data or [2]
> > for some excerpts from the data. So, I think the patch has potential
> > to improve the performance.
> >
> > [1] https://lore.kernel.org/lkml/CAFULd4YBcG45bigHBox2pu+To+Y5BzbRxG+pUr42AVOWSnfKsg@mail.gmail.com/
> > [2] https://lore.kernel.org/lkml/CAFULd4ZsSKwJ4Dz3cCAgaVsa4ypbb0e2savO-3_Ltbs=1wzgKQ@mail.gmail.com/
>
> If you are measuring micro-costs, please make sure you pin the
> workload to a single CPU (via 'taskset' for example) and run 'perf
> stat --null --repeat 5' or so to measure the run-over-run noise of
> the benchmark.
And if the benchmark is context-switching heavy, you'll want to use
'perf stat -a' option to not have PMU context switching costs, and the
-C option to only measure on the pinned CPU.
For example, to measure pipe handling overhead, the naive measurement is:
starship:~> perf bench sched pipe
# Running 'sched/pipe' benchmark:
# Executed 1000000 pipe operations between two processes
Total time: 6.939 [sec]
6.939128 usecs/op
144110 ops/sec
starship:~> perf bench sched pipe
# Running 'sched/pipe' benchmark:
# Executed 1000000 pipe operations between two processes
Total time: 6.879 [sec]
6.879282 usecs/op
145364 ops/sec
See how the run-to-run noise is 0.9%?
If we measure it naively with perf stat, we get:
starship:~> perf stat perf bench sched pipe
# Running 'sched/pipe' benchmark:
# Executed 1000000 pipe operations between two processes
Total time: 11.870 [sec]
11.870403 usecs/op
84243 ops/sec
Performance counter stats for 'perf bench sched pipe':
10,722.04 msec task-clock # 0.903 CPUs utilized
2,000,093 context-switches # 186.540 K/sec
499 cpu-migrations # 46.540 /sec
1,482 page-faults # 138.220 /sec
27,853,380,218 cycles # 2.598 GHz
18,434,409,889 stalled-cycles-frontend # 66.18% frontend cycles idle
24,277,227,239 instructions # 0.87 insn per cycle
# 0.76 stalled cycles per insn
5,001,727,980 branches # 466.490 M/sec
572,756,283 branch-misses # 11.45% of all branches
11.875458968 seconds time elapsed
0.271152000 seconds user
11.272766000 seconds sys
See how the usecs/op increased by +70% due to PMU switching overhead?
With --null we can reduce the PMU switching overhead by only measuring
elapsed time:
starship:~> perf stat --null perf bench sched pipe
# Running 'sched/pipe' benchmark:
# Executed 1000000 pipe operations between two processes
Total time: 6.916 [sec]
6.916700 usecs/op
144577 ops/sec
Performance counter stats for 'perf bench sched pipe':
6.921547909 seconds time elapsed
0.341734000 seconds user
6.215287000 seconds sys
But noise is still high:
starship:~> perf stat --null --repeat 5 perf bench sched pipe
6.854731 usecs/op
7.082047 usecs/op
7.087193 usecs/op
6.934439 usecs/op
7.056695 usecs/op
...
Performance counter stats for 'perf bench sched pipe' (5 runs):
7.0093 +- 0.0463 seconds time elapsed ( +- 0.66% )
Likely due to the tasks migrating semi-randomly among cores.
We can pin them down to a single CPU (CPU2 in this case) via taskset:
starship:~> taskset 4 perf stat --null --repeat 5 perf bench sched pipe
5.575906 usecs/op
5.637112 usecs/op
5.532060 usecs/op
5.703270 usecs/op
5.506517 usecs/op
Performance counter stats for 'perf bench sched pipe' (5 runs):
5.5929 +- 0.0359 seconds time elapsed ( +- 0.64% )
Note how performance increased by ~25%, due to lack of migration, but
noise is still a bit high.
A good way to reduce noise is to measure instructions only:
starship:~> taskset 0x4 perf stat -e instructions --repeat 5 perf bench sched pipe
6.962279 usecs/op
6.917374 usecs/op
6.928672 usecs/op
6.939555 usecs/op
6.942980 usecs/op
Performance counter stats for 'perf bench sched pipe' (5 runs):
32,561,773,780 instructions ( +- 0.27% )
6.93977 +- 0.00735 seconds time elapsed ( +- 0.11% )
'Number of instructions executed' is an imperfect proxy for overhead.
(Not every instruction has the same overhead - but for compiler code
generation it's a useful proxy in most cases.)
But the best measurement is to avoid the PMU switching overhead via the
'-a' option, and limiting the measurement to the saturated CPU#2:
starship:~> taskset 0x4 perf stat -a -C 2 -e instructions --repeat 5 perf bench sched pipe
5.808068 usecs/op
5.843716 usecs/op
5.826543 usecs/op
5.801616 usecs/op
5.793129 usecs/op
Performance counter stats for 'system wide' (5 runs):
32,244,691,275 instructions ( +- 0.21% )
5.81624 +- 0.00912 seconds time elapsed ( +- 0.16% )
Note how this measurement provides the highest performance for the
workload, almost as good as --null.
- Beware the difference in CPU mask parameters between taskset and perf
stat. (I tried to convince the perf tooling people to integrate
CPU-pinning into perf stat, but I digress.)
- Beware of cpufreq considerations: changing CPU frequencies will skew
your workload's performance by a lot more than the 0.1% kind of
noise we are trying to gun for. Setting your CPU governor to
'performance' will eliminate some (but not all) cpufreq artifacts.
- On modern systems there's also boot-to-boot variance of key data
structure alignment and cache access patterns, that can sometimes
rise beyond the noise of the measurement. These can send you on a
wild goose chase ...
Finally, you can use something like 'nice -n -10' to increase the
priority of your benchmark and reduce the impact of other workloads
running on your system.
Anyway, I think noise levels of around 0.1%-0.2% are about the best you
can expect in context-switch heavy workloads. (A bit better in
CPU-bound workloads with low context switching.)
Thanks,
Ingo
Hi Ingo,
my interest comes because I have just started trying to better
understand PCL and am reading the perf manual pages. Perhaps I should
therefore keep my RO-bit permanent for some more months, but:
> And if the benchmark is context-switching heavy, you'll want to use
> 'perf stat -a' option to not have PMU context switching costs, and the
I'm sure you know what you are talking about so I don't doubt the above
is correct but perhaps, the manual page should also clarify -a:
-a::
--all-cpus::
system-wide collection from all CPUs (default if no target is specified)
In the last example -a is combined with -C 2 which is even more irritating when
you just started with the manual pages.
But the main reason why I thought it might be OK to once toggle my
RO-bit is that I tried your examples and with the first one I have way
higher numbers than yours and I thought that must be, because you just
own the faster machine (as I would have expected):
> starship:~> perf bench sched pipe
> # Running 'sched/pipe' benchmark:
> # Executed 1000000 pipe operations between two processes
>
> Total time: 6.939 [sec]
>
> 6.939128 usecs/op
> 144110 ops/sec
lena:~> perf bench sched pipe
# Running 'sched/pipe' benchmark:
# Executed 1000000 pipe operations between two processes
Total time: 11.129 [sec]
11.129952 usecs/op
89847 ops/sec
And I expected this to continue throughout the examples.
But -- to keep this short -- with the last example, my numbers are
suddenly significantly lower than yours:
> starship:~> taskset 0x4 perf stat -a -C 2 -e instructions --repeat 5 perf bench sched pipe
> 5.808068 usecs/op
> 5.843716 usecs/op
> 5.826543 usecs/op
> 5.801616 usecs/op
> 5.793129 usecs/op
>
> Performance counter stats for 'system wide' (5 runs):
>
> 32,244,691,275 instructions ( +- 0.21% )
>
> 5.81624 +- 0.00912 seconds time elapsed ( +- 0.16% )
lena:~> taskset 0x4 perf stat -a -C 2 -e instructions --repeat 5 perf bench sched pipe
4.204444 usecs/op
4.169279 usecs/op
4.186812 usecs/op
4.217039 usecs/op
4.208538 usecs/op
Performance counter stats for 'system wide' (5 runs):
14,196,762,588 instructions ( +- 0.04% )
4.20203 +- 0.00854 seconds time elapsed ( +- 0.20% )
Of course, I don't want to waste anyone's time if this is such an
obvious thing that only newbies don't understand. So, feel free to just ignore this.
Regards
Dirk
( I've Cc:-ed some perf gents as to the measurement artifacts observed
below. Full report quoted below. )

* Dirk Gouders <dirk@gouders.net> wrote:

> Hi Ingo,
>
> my interest comes because I have just started trying to better
> understand PCL and am reading the perf manual pages. Perhaps I should
> therefore keep my RO-bit permanent for some more months, but:
>
...
>
> But -- to keep this short -- with the last example, my numbers are
> suddenly significantly lower than yours:
>
> lena:~> taskset 0x4 perf stat -a -C 2 -e instructions --repeat 5 perf bench sched pipe
> 4.204444 usecs/op
> 4.169279 usecs/op
> 4.186812 usecs/op
> 4.217039 usecs/op
> 4.208538 usecs/op
>
> Performance counter stats for 'system wide' (5 runs):
>
> 14,196,762,588 instructions ( +- 0.04% )
>
> 4.20203 +- 0.00854 seconds time elapsed ( +- 0.20% )
>
> Of course, I don't want to waste anyone's time if this is such an
> obvious thing that only newbies don't understand. So, feel free to
> just ignore this.
>
> Regards
>
> Dirk
On Wed, Mar 5, 2025 at 8:55 PM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > On Sat, Mar 1, 2025 at 1:38 PM Borislav Petkov <bp@alien8.de> wrote:
> > >
> > > On Sat, Mar 01, 2025 at 10:05:56AM +0100, Uros Bizjak wrote:
> > > > OTOH, -Os, where different code size/performance heuristics are used, now
> > > > performs better w.r.t code size.
> > >
> > > Did anything change since:
> > >
> > > 281dc5c5ec0f ("Give up on pushing CC_OPTIMIZE_FOR_SIZE")
> > > 3a55fb0d9fe8 ("Tell the world we gave up on pushing CC_OPTIMIZE_FOR_SIZE")
> > >
> > > wrt -Os?
> > >
> > > Because if not, we still don't love -Os and you can drop the -Os argument.
> >
> > The -Os argument was to show the effect of the patch when the compiler
> > is instructed to take care of the overall size. Giving the compiler
> > -O2 and then looking at the overall size of the produced binary is
> > just wrong.
> >
> > > And without any perf data showing any improvement, this patch does nothing but
> > > enlarge -O2 size...
> >
> > Even to my surprise, the patch has some noticeable effects on the
> > performance, please see the attachment in [1] for LMBench data or [2]
> > for some excerpts from the data. So, I think the patch has potential
> > to improve the performance.
> >
> > [1] https://lore.kernel.org/lkml/CAFULd4YBcG45bigHBox2pu+To+Y5BzbRxG+pUr42AVOWSnfKsg@mail.gmail.com/
> > [2] https://lore.kernel.org/lkml/CAFULd4ZsSKwJ4Dz3cCAgaVsa4ypbb0e2savO-3_Ltbs=1wzgKQ@mail.gmail.com/
>
> If you are measuring micro-costs, please make sure you pin the workload
> to a single CPU (via 'taskset' for example) and run 'perf stat --null
> --repeat 5' or so to measure the run-over-run noise of the benchmark.
I simply run the lmbench command, where the benchmark was obtained as
.rpm for Fedora 41 [1], assuming that the benchmark itself sets the
benchmarked system to the correct state and does enough repetitions to
obtain a meaningful result [2].
[1] https://fedora.pkgs.org/41/rpm-sphere-x86_64/lmbench-3.0-0.a9.3.x86_64.rpm.html
[2] https://www.usenix.org/legacy/publications/library/proceedings/sd96/full_papers/mcvoy.pdf
Thanks,
Uros.
* Uros Bizjak <ubizjak@gmail.com> wrote:

> I simply run the lmbench command, where the benchmark was obtained as
> .rpm for Fedora 41 [1], assuming that the benchmark itself sets the
> benchmarked system to the correct state and does enough repetitions
> to obtain a meaningful result [2].

These assumptions are not valid in a lot of cases - see my other email
with an example.

Thanks,
Ingo
On Wed, Mar 5, 2025 at 9:21 PM Ingo Molnar <mingo@kernel.org> wrote:
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > I simply run the lmbench command, where the benchmark was obtained as
> > .rpm for Fedora 41 [1], assuming that the benchmark itself sets the
> > benchmarked system to the correct state and does enough repetitions
> > to obtain a meaningful result [2].
>
> These assumptions are not valid in a lot of cases - see my other email
> with an example.

Thanks for the post; it was very thorough and informative, an
interesting read indeed.

OTOH, the proposed approach requires deep specialist knowledge, which
IMO is unreasonable to expect from a prospective patch submitter. I
totally agree with Dave's request for some performance numbers
[partial quote from his post earlier in the thread]:

"I'm seriously not picky: will-it-scale, lmbench, dbench, kernel
compiles. *ANYTHING*. *ANY* hardware. Run it on your laptop."

that would indicate if the patch holds its promise. However, "Run it
on your laptop" already implies the ability to compile, install and
run a distribution kernel, not exactly a trivial task, compared to
e.g. running a patched kernel under QEMU. Some kind of easy-to-run
benchmark scripts, or documented easy-to-follow steps, would be of
immense help to the patch submitter to assess the performance aspects
of the patch. As said elsewhere, code size is a nice metric, but it is
not exactly appropriate for an -O2 compiled kernel.

As probably everybody reading LKML has noticed, there is quite some CI
infrastructure available that tests the posted patches. That
infrastructure is set up and maintained by experts who *can* precisely
measure the performance impact of a patch. My proposal would be to
ease the requirements for the patches to something like what Dave
requested, in order to allow patches to be integrated and further
tested by the CI infrastructure.

Disclaimer: Please read the above as feedback from a wannabe
contributor, not as a critique of the established development process.

Thanks,
Uros.
On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
>
> Even to my surprise, the patch has some noticeable effects on the
> performance, please see the attachment in [1] for LMBench data or [2]
> for some excerpts from the data. So, I think the patch has potential
> to improve the performance.
I suspect some of the performance difference - which looks
unexpectedly large - is due to having run them on a CPU with the
horrendous indirect return costs, and then inlining can make a huge
difference.
kvm
Regardless, I absolutely think that using asm_inline here is the right
thing for the locked instructions.
That said, I do want to bring up another issue: maybe it's time to
just retire the LOCK_PREFIX thing entirely?
It harkens back to Ye Olde Days when UP was the norm, and we didn't
want to pay the cost of lock prefixes when the kernel was built for
SMP but was run on an UP machine.
And honestly, none of that makes sense any more. You can't buy a UP
machine any more, and the only UP case would be some silly minimal
virtual environment, and if people really care about that minimal
case, they should just compile the kernel without SMP support.
Because UP has gone from being the default to being irrelevant. At
least for x86-64.
So I think we should just get rid of LOCK_PREFIX_HERE and the
smp_locks section entirely.
Which would probably obviate the need for your patch, since then the
compiler wouldn't see it as some big instruction. But your patch isn't
wrong, so this is absolutely not a NAK, more of a "we should go
further".
Hmm?
Linus
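For reference, the LOCK_PREFIX_HERE / smp_locks machinery being
discussed looks roughly like the following simplified sketch (not the
exact definition from the x86 alternative.h header): the address of
every lock prefix is recorded in a special section so that an SMP
kernel booted on a UP machine can patch the prefixes out at boot, and
the asm pseudo-directives here are what the compiler over-counts when
estimating the size of each locked instruction.

#define LOCK_PREFIX_HERE					\
	".pushsection .smp_locks, \"a\"\n"			\
	".balign 4\n"						\
	".long 671f - .\n" /* record the lock prefix address */ \
	".popsection\n"						\
	"671:"

#define LOCK_PREFIX	LOCK_PREFIX_HERE "\n\tlock "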
On 3/5/25 09:04, Linus Torvalds wrote:
>
> That said, I do want to bring up another issue: maybe it's time to
> just retire the LOCK_PREFIX thing entirely?
>
> It harkens back to Ye Olde Days when UP was the norm, and we didn't
> want to pay the cost of lock prefixes when the kernel was built for
> SMP but was run on an UP machine.
>
> And honestly, none of that makes sense any more. You can't buy a UP
> machine any more, and the only UP case would be some silly minimal
> virtual environment, and if people really care about that minimal
> case, they should just compile the kernel without SMP support.
> Because UP has gone from being the default to being irrelevant. At
> least for x86-64.
>

I think the key there is that "they should just compile the kernel
without SMP support" (which may very well make sense for some oddball
embedded uses) *does* still mean that the remaining cases of locks
can/should be elided, so demacroizing it is still not an option.

But yes... it would get rid of a lot of crap machinery which is
basically dead code, and that would be a Very Good Thing[TM].

-hpa
On Wed, 5 Mar 2025 07:04:08 -1000
Linus Torvalds <torvalds@linuxfoundation.org> wrote:

> On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > Even to my surprise, the patch has some noticeable effects on the
> > performance, please see the attachment in [1] for LMBench data or [2]
> > for some excerpts from the data. So, I think the patch has potential
> > to improve the performance.
>
> I suspect some of the performance difference - which looks
> unexpectedly large - is due to having run them on a CPU with the
> horrendous indirect return costs, and then inlining can make a huge
> difference.
...

Another possibility is that the processes are getting bounced around
CPUs in a slightly different way.
An idle CPU might be running at 800MHz; run something that spins on it
and the clock speed will soon jump to 4GHz.
But if your 'spinning' process is migrated to a different CPU it starts
again at 800MHz.

(I had something where an FPGA compile went from 12 mins to over 20
because the kernel RSB stuffing caused the scheduler to behave
differently, even though nothing was doing a lot of system calls.)

All sorts of things can affect that - possibly even making some code faster!

The (IIRC) 30k increase in code size will be a few functions being inlined.
The bloat-o-meter might show which, and forcing a few inlines the same way
should reduce that difference.

OTOH I'm surprised that a single (or two) instruction makes that much
difference - unless gcc is managing to discard the size of the entire
function rather than just the asm block itself.

Benchmarking on modern CPUs is hard. You really do need to lock the CPU
frequencies - and that may not be supported.

David
On Wed, Mar 5, 2025 at 9:14 PM David Laight
<david.laight.linux@gmail.com> wrote:
>
...
> The (IIRC) 30k increase in code size will be a few functions being inlined.
> The bloat-o-meter might show which, and forcing a few inlines the same way
> should reduce that difference.

bloat-o-meter is an excellent idea, I'll analyse binaries some more
and report my findings.

> OTOH I'm surprised that a single (or two) instruction makes that much
> difference - unless gcc is managing to discard the size of the entire
> function rather than just the asm block itself.

Actually, the compiler uses the estimated function code size as one of
the conditions when deciding whether to fully (or partially, via a
hot/cold split) inline a function. The estimated code size of functions
that use the (patched) locking primitives is now lower, so they now
fall below the inlining threshold, causing the compiler to do more
inlining. The compiler knows the performance/size tradeoff of setting
up a function call and the perf/size tradeoff of creating the function
frame in the called function, and decides accordingly. Please note
that the inlining is multi-level, so it doesn't stop at the first
function.

Thanks,
Uros.
On Thu, Mar 6, 2025 at 11:45 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Mar 5, 2025 at 9:14 PM David Laight
> <david.laight.linux@gmail.com> wrote:
> >
> > On Wed, 5 Mar 2025 07:04:08 -1000
> > Linus Torvalds <torvalds@linuxfoundation.org> wrote:
> >
> > > On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > Even to my surprise, the patch has some noticeable effects on the
> > > > performance, please see the attachment in [1] for LMBench data or [2]
> > > > for some excerpts from the data. So, I think the patch has potential
> > > > to improve the performance.
> > >
> > > I suspect some of the performance difference - which looks
> > > unexpectedly large - is due to having run them on a CPU with the
> > > horrendous indirect return costs, and then inlining can make a huge
> > > difference.
> > ...
> >
> > Another possibility is that the processes are getting bounced around
> > cpu in a slightly different way.
> > An idle cpu might be running at 800MHz, run something that spins on it
> > and the clock speed will soon jump to 4GHz.
> > But if your 'spinning' process is migrated to a different cpu it starts
> > again at 800MHz.
> >
> > (I had something where a fpga compile when from 12 mins to over 20 because
> > the kernel RSB stuffing caused the scheduler to behave differently even
> > though nothing was doing a lot of system calls.)
> >
> > All sorts of things can affect that - possibly even making some code faster!
> >
> > The (IIRC) 30k increase in code size will be a few functions being inlined.
> > The bloat-o-meter might show which, and forcing a few inlines the same way
> > should reduce that difference.
>
> bloat-o-meter is an excellent idea, I'll analyse binaries some more
> and report my findings.
Please find attached bloat.txt where:
a) some functions now include once-called functions. These are:
copy_process 6465 10191 +3726
balance_dirty_pages_ratelimited_flags 237 2949 +2712
icl_plane_update_noarm 5800 7969 +2169
samsung_input_mapping 3375 5170 +1795
ext4_do_update_inode.isra - 1526 +1526
that now include:
ext4_mark_iloc_dirty 1735 106 -1629
samsung_gamepad_input_mapping.isra 2046 - -2046
icl_program_input_csc 2203 - -2203
copy_mm 2242 - -2242
balance_dirty_pages 2657 - -2657
b) ISRA [interprocedural scalar replacement of aggregates, an
interprocedural pass that removes unused function return values
(turning functions returning a value which is never used into void
functions) and removes unused function parameters. It can also
replace an aggregate parameter by a set of other parameters
representing part of the original, turning those passed by reference
into new ones which pass the value directly.] A small illustrative
sketch of this transformation follows the list below.
ext4_do_update_inode.isra - 1526 +1526
nfs4_begin_drain_session.isra - 249 +249
nfs4_end_drain_session.isra - 168 +168
__guc_action_register_multi_lrc_v70.isra 335 500 +165
__i915_gem_free_objects.isra - 144 +144
...
membarrier_register_private_expedited.isra 108 - -108
syncobj_eventfd_entry_func.isra 445 314 -131
__ext4_sb_bread_gfp.isra 140 - -140
class_preempt_notrace_destructor.isra 145 - -145
p9_fid_put.isra 151 - -151
__mm_cid_try_get.isra 238 - -238
membarrier_global_expedited.isra 294 - -294
mm_cid_get.isra 295 - -295
samsung_gamepad_input_mapping.isra.cold 604 - -604
samsung_gamepad_input_mapping.isra 2046 - -2046
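As a tiny, hypothetical illustration of the transformation described
in b) (in practice such a small helper would simply be inlined; the
point is only to show where the .isra suffix comes from):

struct cfg {
	int count;
	int flags;
};

/* Before: the return value is never used, and only cfg->count is read. */
static int set_count(struct cfg *cfg, int *out)
{
	*out = cfg->count;
	return cfg->flags;
}

int consumer(struct cfg *cfg)
{
	int val;

	set_count(cfg, &val);	/* return value ignored by all callers */
	return val;
}

/*
 * After ISRA, the compiler emits a clone roughly equivalent to
 *
 *	static void set_count.isra(int count, int *out);
 *
 * with the unused return value dropped and the aggregate parameter
 * replaced by the scalar that is actually used - hence the .isra
 * suffixes in the symbol names above.
 */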
c) different split points of hot/cold split that just move code around:
samsung_input_mapping.cold 900 1500 +600
__i915_request_reset.cold 311 389 +78
nfs_update_inode.cold 77 153 +76
__do_sys_swapon.cold 404 455 +51
copy_process.cold - 45 +45
tg3_get_invariants.cold 73 115 +42
...
hibernate.cold 671 643 -28
copy_mm.cold 31 - -31
software_resume.cold 249 207 -42
io_poll_wake.cold 106 54 -52
samsung_gamepad_input_mapping.isra.cold 604 - -604
d) full inline of small functions with locking insn (~150 cases).
These bring in most of the performance increase because there is no
call setup. E.g.:
0000000000a50e10 <release_devnum>:
a50e10: 48 63 07 movslq (%rdi),%rax
a50e13: 85 c0 test %eax,%eax
a50e15: 7e 10 jle a50e27 <release_devnum+0x17>
a50e17: 48 8b 4f 50 mov 0x50(%rdi),%rcx
a50e1b: f0 48 0f b3 41 50 lock btr %rax,0x50(%rcx)
a50e21: c7 07 ff ff ff ff movl $0xffffffff,(%rdi)
a50e27: e9 00 00 00 00 jmp a50e2c <release_devnum+0x1c>
a50e28: R_X86_64_PLT32 __x86_return_thunk-0x4
a50e2c: 0f 1f 40 00 nopl 0x0(%rax)
IMO, for 0.14% code increase, these changes are desirable.
Thanks,
Uros.
* Uros Bizjak <ubizjak@gmail.com> wrote:
> On Thu, Mar 6, 2025 at 11:45 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Wed, Mar 5, 2025 at 9:14 PM David Laight
> > <david.laight.linux@gmail.com> wrote:
> > >
> > > On Wed, 5 Mar 2025 07:04:08 -1000
> > > Linus Torvalds <torvalds@linuxfoundation.org> wrote:
> > >
> > > > On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > Even to my surprise, the patch has some noticeable effects on the
> > > > > performance, please see the attachment in [1] for LMBench data or [2]
> > > > > for some excerpts from the data. So, I think the patch has potential
> > > > > to improve the performance.
> > > >
> > > > I suspect some of the performance difference - which looks
> > > > unexpectedly large - is due to having run them on a CPU with the
> > > > horrendous indirect return costs, and then inlining can make a huge
> > > > difference.
> > > ...
> > >
> > > Another possibility is that the processes are getting bounced around
> > > cpu in a slightly different way.
> > > An idle cpu might be running at 800MHz, run something that spins on it
> > > and the clock speed will soon jump to 4GHz.
> > > But if your 'spinning' process is migrated to a different cpu it starts
> > > again at 800MHz.
> > >
> > > (I had something where a fpga compile when from 12 mins to over 20 because
> > > the kernel RSB stuffing caused the scheduler to behave differently even
> > > though nothing was doing a lot of system calls.)
> > >
> > > All sorts of things can affect that - possibly even making some code faster!
> > >
> > > The (IIRC) 30k increase in code size will be a few functions being inlined.
> > > The bloat-o-meter might show which, and forcing a few inlines the same way
> > > should reduce that difference.
> >
> > bloat-o-meter is an excellent idea, I'll analyse binaries some more
> > and report my findings.
>
> Please find attached bloat.txt where:
>
> a) some functions now include once-called functions. These are:
>
> copy_process 6465 10191 +3726
> balance_dirty_pages_ratelimited_flags 237 2949 +2712
> icl_plane_update_noarm 5800 7969 +2169
> samsung_input_mapping 3375 5170 +1795
> ext4_do_update_inode.isra - 1526 +1526
>
> that now include:
>
> ext4_mark_iloc_dirty 1735 106 -1629
> samsung_gamepad_input_mapping.isra 2046 - -2046
> icl_program_input_csc 2203 - -2203
> copy_mm 2242 - -2242
> balance_dirty_pages 2657 - -2657
>
> b) ISRA [interprocedural scalar replacement of aggregates,
> interprocedural pass that removes unused function return values
> (turning functions returning a value which is never used into void
> functions) and removes unused function parameters. It can also
> replace an aggregate parameter by a set of other parameters
> representing part of the original, turning those passed by reference
> into new ones which pass the value directly.]
>
> ext4_do_update_inode.isra - 1526 +1526
> nfs4_begin_drain_session.isra - 249 +249
> nfs4_end_drain_session.isra - 168 +168
> __guc_action_register_multi_lrc_v70.isra 335 500 +165
> __i915_gem_free_objects.isra - 144 +144
> ...
> membarrier_register_private_expedited.isra 108 - -108
> syncobj_eventfd_entry_func.isra 445 314 -131
> __ext4_sb_bread_gfp.isra 140 - -140
> class_preempt_notrace_destructor.isra 145 - -145
> p9_fid_put.isra 151 - -151
> __mm_cid_try_get.isra 238 - -238
> membarrier_global_expedited.isra 294 - -294
> mm_cid_get.isra 295 - -295
> samsung_gamepad_input_mapping.isra.cold 604 - -604
> samsung_gamepad_input_mapping.isra 2046 - -2046
>
> c) different split points of hot/cold split that just move code around:
>
> samsung_input_mapping.cold 900 1500 +600
> __i915_request_reset.cold 311 389 +78
> nfs_update_inode.cold 77 153 +76
> __do_sys_swapon.cold 404 455 +51
> copy_process.cold - 45 +45
> tg3_get_invariants.cold 73 115 +42
> ...
> hibernate.cold 671 643 -28
> copy_mm.cold 31 - -31
> software_resume.cold 249 207 -42
> io_poll_wake.cold 106 54 -52
> samsung_gamepad_input_mapping.isra.cold 604 - -604
>
> d) full inline of small functions with locking insn (~150 cases).
> These bring in most of the performance increase because there is no
> call setup. E.g.:
>
> 0000000000a50e10 <release_devnum>:
> a50e10: 48 63 07 movslq (%rdi),%rax
> a50e13: 85 c0 test %eax,%eax
> a50e15: 7e 10 jle a50e27 <release_devnum+0x17>
> a50e17: 48 8b 4f 50 mov 0x50(%rdi),%rcx
> a50e1b: f0 48 0f b3 41 50 lock btr %rax,0x50(%rcx)
> a50e21: c7 07 ff ff ff ff movl $0xffffffff,(%rdi)
> a50e27: e9 00 00 00 00 jmp a50e2c <release_devnum+0x1c>
> a50e28: R_X86_64_PLT32 __x86_return_thunk-0x4
> a50e2c: 0f 1f 40 00 nopl 0x0(%rax)
>
> IMO, for 0.14% code increase, these changes are desirable.
I concur, and it's extra desirable IMHO due to the per function
overhead of CPU bug mitigations like retpolines.
The number of function calls executed in a workload can be measured via
perf on most modern x86 CPUs as well. For example on Zen5 CPUs the
number of RET instructions can be counted:
{
EventName: ex_ret_near_ret,
EventCode: 0xc8,
BriefDescription: Retired near returns (RET or RET Iw).
},
Which ought to be a good proxy for function calls (modulo
tail-optimized jumps).
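As a rough sketch, such a raw event can also be read programmatically
via perf_event_open(), privileges permitting; the 0xc8 event code
(umask 0) is taken from the listing above, and the CPU number and
10-second window are arbitrary assumptions:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_RAW;
	attr.size = sizeof(attr);
	attr.config = 0xc8;		/* assumed: ex_ret_near_ret, umask 0 */
	attr.disabled = 1;
	attr.exclude_hv = 1;

	/* pid == -1, cpu == 2: count everything that runs on CPU 2 */
	fd = perf_event_open(&attr, -1, 2, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

	sleep(10);			/* run the pinned workload meanwhile */

	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("near returns: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}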
Thanks,
Ingo
On Thu, Mar 6, 2025 at 11:19 PM Ingo Molnar <mingo@kernel.org> wrote:
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
...
> > IMO, for 0.14% code increase, these changes are desirable.
>
> I concur, and it's extra desirable IMHO due to the per function
> overhead of CPU bug mitigations like retpolines.

Thanks! I will submit a v2 patch next week that will summarize all the
measurements in its commit message.

BR,
Uros.
On Wed, Mar 5, 2025 at 6:04 PM Linus Torvalds
<torvalds@linuxfoundation.org> wrote:
>
> On Tue, 4 Mar 2025 at 22:54, Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > Even to my surprise, the patch has some noticeable effects on the
> > performance, please see the attachment in [1] for LMBench data or [2]
> > for some excerpts from the data. So, I think the patch has potential
> > to improve the performance.
>
> I suspect some of the performance difference - which looks
> unexpectedly large - is due to having run them on a CPU with the
> horrendous indirect return costs, and then inlining can make a huge
> difference.
> kvm
> Regardless, I absolutely think that using asm_inline here is the right
> thing for the locked instructions.

It is "Intel(R) Core(TM) i7-10710U".

> That said, I do want to bring up another issue: maybe it's time to
> just retire the LOCK_PREFIX thing entirely?
>
> It harkens back to Ye Olde Days when UP was the norm, and we didn't
> want to pay the cost of lock prefixes when the kernel was built for
> SMP but was run on an UP machine.
>
> And honestly, none of that makes sense any more. You can't buy a UP
> machine any more, and the only UP case would be some silly minimal
> virtual environment, and if people really care about that minimal
> case, they should just compile the kernel without SMP support.
> Because UP has gone from being the default to being irrelevant. At
> least for x86-64.
>
> So I think we should just get rid of LOCK_PREFIX_HERE and the
> smp_locks section entirely.

Please note that this functionality is shared with the i386 target, so
the removal proposed above would somewhat penalize 32-bit targets. The
situation w.r.t. UP vs SMP is not that clear there; maybe some distro
still provides i386 SMP kernels that would then run unoptimized on UP
systems.

From the compiler's POV, now that the "lock; " prefix has lost its
semicolon, removing LOCK_PREFIX_HERE or using asm_inline would result
in exactly the same code. The problematic 31k code size increase (1.1%)
with -O2 is inevitable either way, if we want to move forward.

My proposal would be to use what a modern compiler offers. By using
asm_inline, we can keep the status quo (mostly for i386) for some more
time, and still move forward. And we get that -Os code size *decrease*
as a bonus for those who want to shave the last byte from the kernel.

Thanks,
Uros.
On Wed, 5 Mar 2025 20:47:46 +0100
Uros Bizjak <ubizjak@gmail.com> wrote:
...
> From the compiler's POV, now that the "lock; " prefix has lost its
> semicolon, removing LOCK_PREFIX_HERE or using asm_inline would result
> in exactly the same code. The problematic 31k code size increase (1.1%)
> with -O2 is inevitable either way, if we want to move forward.

Have you looked to see where that 31k comes from?
The compiler must be very near to inlining something, and removing a
couple of instructions is making a big difference.
It is quite likely that some other change will have the same (or the
reverse) effect.

David
On Wed, Mar 05, 2025 at 07:04:08AM -1000, Linus Torvalds wrote:

> And honestly, none of that makes sense any more. You can't buy a UP
> machine any more, and the only UP case would be some silly minimal
> virtual environment, and if people really care about that minimal
> case, they should just compile the kernel without SMP support.
> Because UP has gone from being the default to being irrelevant. At
> least for x86-64.
>
> So I think we should just get rid of LOCK_PREFIX_HERE and the
> smp_locks section entirely.
>
> Which would probably obviate the need for your patch, since then the
> compiler wouldn't see it as some big instruction. But your patch isn't
> wrong, so this is absolutely not a NAK, more of a "we should go
> further".
>
> Hmm?

I'm all for removing that.

On that note, a number of architectures have already made the next
step, and that is to mandate SMP=y. The downside of that is that it
would make a definite dent in the compile coverage for SMP=n code --
but perhaps we have enough robots to get away with that.
On Fri, Feb 28, 2025 at 1:38 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
...
> The code size of the resulting x86_64 defconfig object file increases
> for 33.264 kbytes, representing 1.2% code size increase:
>
>    text    data     bss     dec     hex filename
> 27450107 4633332  814148 32897587 1f5fa33 vmlinux-old.o
> 27483371 4633784  814148 32931303 1f67de7 vmlinux-new.o
>
> mainly due to different inlining decisions of -O2 build.

FTR, with -Os (where generated code size really matters) the x86_64
defconfig object file *decreases* by 24.388 kbytes, representing a
1.0% code size *decrease*:

   text    data     bss     dec     hex filename
23883860 4617284  814212 29315356 1bf511c vmlinux-old.o
23859472 4615404  814212 29289088 1beea80 vmlinux-new.o

again mainly due to different inlining decisions of the -Os build.

Uros.