The inline keyword is often ignored by compilers.

We need something slightly stronger in networking fast paths,
but __always_inline is too strong.

Instead, generalize the idea Nicolas used in commit d533cb2d2af4
("__arch_xprod64(): make __always_inline when optimizing for performance").

This will help CONFIG_CC_OPTIMIZE_FOR_SIZE=y users keep
their kernels small.
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/netdev/176847720679.3956289.12601442580224129560.git-patchwork-notify@kernel.org/T/#m2d7e201372a8aae1ce62a0b548e55fd4fe804909
Cc: Nicolas Pitre <npitre@baylibre.com>
---
arch/arm/include/asm/div64.h | 6 +-----
include/asm-generic/div64.h | 6 +-----
include/linux/compiler_types.h | 10 ++++++++++
3 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index d3ef8e416b27d22d38bf084e091b0e4795f74bd4..877dfc4c4c7344849eec2109b66c2825561719dc 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -52,11 +52,7 @@ static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
#else
-#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
-static __always_inline
-#else
-static inline
-#endif
+static inline_for_performance
uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
{
unsigned long long res;
diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 25e7b4b58dcf55a395b9db72e01f2cd220da58a0..9893356fff55679304f68833c11c8ae9052b9cea 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -134,11 +134,7 @@
* Hoping for compile-time optimization of conditional code.
* Architectures may provide their own optimized assembly implementation.
*/
-#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
-static __always_inline
-#else
-static inline
-#endif
+static inline_for_performance
uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
{
uint32_t m_lo = m;
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index d3318a3c257775d4f44e8f2eb7911ac52eefecc5..58b3de1f4c2540b6ffabd916948396ac8df9ba8f 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -248,6 +248,16 @@ struct ftrace_likely_data {
*/
#define inline inline __gnu_inline __inline_maybe_unused notrace
+/*
+ * Compilers might decide to ignore inline hint.
+ * Functions that are performance critical can use inline_for_performance.
+ */
+#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
+#define inline_for_performance __always_inline
+#else
+#define inline_for_performance
+#endif
+
/*
* gcc provides both __inline__ and __inline as alternate spellings of
* the inline keyword, though the latter is undocumented. New kernel
base-commit: e84d960149e71e8d5e4db69775ce31305898ed0c
--
2.52.0.457.g6b5491de43-goog
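
For illustration, usage in a fast path would look like the sketch below; the helper is hypothetical and not part of the patch:

/* Hypothetical fast-path helper, for illustration only. */
static inline_for_performance bool port_is_ephemeral(u16 port)
{
	return port >= 32768;
}

With CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y the annotation expands to
__always_inline; under CONFIG_CC_OPTIMIZE_FOR_SIZE=y inlining is left
to the compiler.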
Hi Eric,
kernel test robot noticed the following build warnings:
[auto build test WARNING on e84d960149e71e8d5e4db69775ce31305898ed0c]
url: https://github.com/intel-lab-lkp/linux/commits/Eric-Dumazet/compiler_types-Introduce-inline_for_performance/20260118-232653
base: e84d960149e71e8d5e4db69775ce31305898ed0c
patch link: https://lore.kernel.org/r/20260118152448.2560414-1-edumazet%40google.com
patch subject: [PATCH] compiler_types: Introduce inline_for_performance
config: arm-randconfig-004-20260119 (https://download.01.org/0day-ci/archive/20260119/202601190420.RlBoZSGm-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 14.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260119/202601190420.RlBoZSGm-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601190420.RlBoZSGm-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from include/linux/math.h:6,
from include/linux/kernel.h:27,
from include/linux/random.h:7,
from include/linux/nodemask.h:94,
from include/linux/numa.h:6,
from include/linux/cpumask.h:15,
from include/linux/smp.h:13,
from include/linux/lockdep.h:14,
from include/linux/spinlock.h:63,
from lib/dec_and_lock.c:3:
>> arch/arm/include/asm/div64.h:56:10: warning: '__arch_xprod_64' defined but not used [-Wunused-function]
56 | uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
| ^~~~~~~~~~~~~~~
vim +/__arch_xprod_64 +56 arch/arm/include/asm/div64.h
fa4adc614922c2 include/asm-arm/div64.h Nicolas Pitre 2006-12-06 54
5f712d70e20a46 arch/arm/include/asm/div64.h Eric Dumazet 2026-01-18 55 static inline_for_performance
d533cb2d2af400 arch/arm/include/asm/div64.h Nicolas Pitre 2024-10-03 @56 uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 57 {
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 58 unsigned long long res;
73e592f3bc2cdc arch/arm/include/asm/div64.h Nicolas Pitre 2016-01-27 59 register unsigned int tmp asm("ip") = 0;
06508533d51a1d arch/arm/include/asm/div64.h Nicolas Pitre 2024-10-03 60 bool no_ovf = __builtin_constant_p(m) &&
06508533d51a1d arch/arm/include/asm/div64.h Nicolas Pitre 2024-10-03 61 ((m >> 32) + (m & 0xffffffff) < 0x100000000);
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 62
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 63 if (!bias) {
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 64 asm ( "umull %Q0, %R0, %Q1, %Q2\n\t"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 65 "mov %Q0, #0"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 66 : "=&r" (res)
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 67 : "r" (m), "r" (n)
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 68 : "cc");
06508533d51a1d arch/arm/include/asm/div64.h Nicolas Pitre 2024-10-03 69 } else if (no_ovf) {
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 70 res = m;
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 71 asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 72 "mov %Q0, #0"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 73 : "+&r" (res)
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 74 : "r" (m), "r" (n)
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 75 : "cc");
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 76 } else {
73e592f3bc2cdc arch/arm/include/asm/div64.h Nicolas Pitre 2016-01-27 77 asm ( "umull %Q0, %R0, %Q2, %Q3\n\t"
73e592f3bc2cdc arch/arm/include/asm/div64.h Nicolas Pitre 2016-01-27 78 "cmn %Q0, %Q2\n\t"
73e592f3bc2cdc arch/arm/include/asm/div64.h Nicolas Pitre 2016-01-27 79 "adcs %R0, %R0, %R2\n\t"
73e592f3bc2cdc arch/arm/include/asm/div64.h Nicolas Pitre 2016-01-27 80 "adc %Q0, %1, #0"
73e592f3bc2cdc arch/arm/include/asm/div64.h Nicolas Pitre 2016-01-27 81 : "=&r" (res), "+&r" (tmp)
73e592f3bc2cdc arch/arm/include/asm/div64.h Nicolas Pitre 2016-01-27 82 : "r" (m), "r" (n)
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 83 : "cc");
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 84 }
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 85
06508533d51a1d arch/arm/include/asm/div64.h Nicolas Pitre 2024-10-03 86 if (no_ovf) {
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 87 asm ( "umlal %R0, %Q0, %R1, %Q2\n\t"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 88 "umlal %R0, %Q0, %Q1, %R2\n\t"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 89 "mov %R0, #0\n\t"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 90 "umlal %Q0, %R0, %R1, %R2"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 91 : "+&r" (res)
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 92 : "r" (m), "r" (n)
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 93 : "cc");
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 94 } else {
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 95 asm ( "umlal %R0, %Q0, %R2, %Q3\n\t"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 96 "umlal %R0, %1, %Q2, %R3\n\t"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 97 "mov %R0, #0\n\t"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 98 "adds %Q0, %1, %Q0\n\t"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 99 "adc %R0, %R0, #0\n\t"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 100 "umlal %Q0, %R0, %R2, %R3"
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 101 : "+&r" (res), "+&r" (tmp)
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 102 : "r" (m), "r" (n)
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 103 : "cc");
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 104 }
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 105
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 106 return res;
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 107 }
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 108 #define __arch_xprod_64 __arch_xprod_64
040b323b5012b5 arch/arm/include/asm/div64.h Nicolas Pitre 2015-11-02 109
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Sun, 18 Jan 2026 15:24:48 +0000 Eric Dumazet <edumazet@google.com> wrote:
> inline keyword is often ignored by compilers.
>
> We need something slightly stronger in networking fast paths
> but __always_inline is too strong.
>
> Instead, generalize idea Nicolas used in commit d533cb2d2af4
> ("__arch_xprod64(): make __always_inline when optimizing for performance")
>
> This will help CONFIG_CC_OPTIMIZE_FOR_SIZE=y users keeping
> their kernels small.
This is good. __always_inline is ambiguous and the name lacks
commentary value.
If we take away __always_inline's for-performance role then what
remains? __always_inline is for tricky things where the compiler needs
to be coerced into doing what we want?
IOW, I wonder if we should take your concept further, create more
fine-grained controls over this which have self-explanatory names.
mm/ alone has 74 __always_inlines, none are documented, I don't know
why they're present, many are probably wrong.
Shit, uninlining only __get_user_pages_locked does this:
text data bss dec hex filename
115703 14018 64 129785 1faf9 mm/gup.o
103866 13058 64 116988 1c8fc mm/gup.o-after
On Sun, 18 Jan 2026 11:47:24 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:
> On Sun, 18 Jan 2026 15:24:48 +0000 Eric Dumazet <edumazet@google.com> wrote:
>
> > inline keyword is often ignored by compilers.
> >
> > We need something slightly stronger in networking fast paths
> > but __always_inline is too strong.
> >
> > Instead, generalize idea Nicolas used in commit d533cb2d2af4
> > ("__arch_xprod64(): make __always_inline when optimizing for performance")
> >
> > This will help CONFIG_CC_OPTIMIZE_FOR_SIZE=y users keeping
> > their kernels small.
>
> This is good. __always_inline is ambiguous and the name lacks
> commentary value.
>
> If we take away __always_inline's for-performance role then what
> remains? __always_inline is for tricky things where the compiler needs
> to be coerced into doing what we want?
>
> IOW, I wonder if we should take your concept further, create more
> fine-grained controls over this which have self-explanatory names.
>
>
>
> mm/ alone has 74 __always_inlines, none are documented, I don't know
> why they're present, many are probably wrong.
>
> Shit, uninlining only __get_user_pages_locked does this:
>
> text data bss dec hex filename
> 115703 14018 64 129785 1faf9 mm/gup.o
> 103866 13058 64 116988 1c8fc mm/gup.o-after
The next questions are does anything actually run faster (either way),
and should anything at all be marked 'inline' rather than 'always_inline'.
After all, if you call a function twice (not in a loop) you may
want a real function in order to avoid I-cache misses.
I've had to mark things that are called once 'always_inline', and
also 'big looking' functions that are called with constants and optimise
to almost nothing.
But I'm sure there is a lot of code that is 'inline_for_bloat' :-)
(Don't talk to me about C++ class definitions....)
On 32bit you probably don't want to inline __arch_xprod_64(), but you do
want to pass (bias ? m : 0) and may want separate functions for the
'no overflow' case (if it is common enough to worry about).
David
On Sun, 18 Jan 2026, David Laight wrote:

> On 32bit you probably don't want to inline __arch_xprod_64(), but you do
> want to pass (bias ? m : 0) and may want separate functions for the
> 'no overflow' case (if it is common enough to worry about).

You do want to inline it. Performance quickly degrades otherwise.
Numbers are in the commit log where I introduced that change.

And __arch_xprod_64() exists only for 32bit btw.

Nicolas
On Mon, 19 Jan 2026 10:47:51 -0500 (EST)
Nicolas Pitre <nico@fluxnic.net> wrote:

> On Sun, 18 Jan 2026, David Laight wrote:
>
> > On 32bit you probably don't want to inline __arch_xprod_64(), but you do
> > want to pass (bias ? m : 0) and may want separate functions for the
> > 'no overflow' case (if it is common enough to worry about).
>
> You do want to inline it. Performance quickly degrades otherwise.

If it isn't inlined you want a real C function in div.c (or similar),
not the compiler generating a separate body in the object file of each
file that uses it. That is just the worst of both worlds.

> Numbers are in the commit log where I introduced that change.
>
> And __arch_xprod_64() exists only for 32bit btw.

I wonder how much of a mess gcc makes of that code.
I added asm functions for u64 mul_add(u32 a, u32 b, u32 c) calculating
a * b + c without explicit zero extending any of the 32 bit values.
Without that gcc runs out of registers and starts spilling to stack
instead of just generating 'mul; add; adc $0'.

I could only find the definition in the header file - may not have
looked hard enough.

But 64bit systems without a 64x64=>128 multiply (ie without u128
support) also need the 'multiply in 32bit chunks' code.
And common code is fine with u128 support (ignoring old compilers that
generate a call on 64bit mips even though it has exactly the
instruction you want).

David
On Mon, 19 Jan 2026, David Laight wrote:

> On Mon, 19 Jan 2026 10:47:51 -0500 (EST)
> Nicolas Pitre <nico@fluxnic.net> wrote:
>
> > On Sun, 18 Jan 2026, David Laight wrote:
> >
> > > On 32bit you probably don't want to inline __arch_xprod_64(), but you do
> > > want to pass (bias ? m : 0) and may want separate functions for the
> > > 'no overflow' case (if it is common enough to worry about).
> >
> > You do want to inline it. Performance quickly degrades otherwise.
>
> If it isn't inlined you want a real C function in div.c (or similar),
> not the compiler generating a separate body in the object file of each
> file that uses it.

Yes you absolutely do in this very particular case. This relies on a
long sequence of code that collapses to only a few assembly instructions
due to constant propagation. But most of the time gcc is not smart
enough to realize that (strangely enough it used to be fine more than
10 years ago). The corresponding function is not only slower but
actually creates bigger code from the argument passing handling overhead.

> > And __arch_xprod_64() exists only for 32bit btw.
>
> I wonder how much of a mess gcc makes of that code.
> I added asm functions for u64 mul_add(u32 a, u32 b, u32 c) calculating
> a * b + c without explicit zero extending any of the 32 bit values.
> Without that gcc runs out of registers and starts spilling to stack
> instead of just generating 'mul; add; adc $0'.

Here this is different. Let me copy the definition:

 * Prototype: uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
 * Semantic:  retval = ((bias ? m : 0) + m * n) >> 64
 *
 * The product is a 128-bit value, scaled down to 64 bits.
 * Hoping for compile-time optimization of conditional code.
 * Architectures may provide their own optimized assembly implementation.

ARM32 provides its own definition. Last time I checked, RV32 already
produced optimal code from the default C implementation.

> But 64bit systems without a 64x64=>128 multiply (ie without u128
> support) also need the 'multiply in 32bit chunks' code.

Again this is only for 32-bit systems. 64-bit systems use none of that.

Nicolas
On Sun, 18 Jan 2026 22:58:02 +0000 David Laight <david.laight.linux@gmail.com> wrote:

> > mm/ alone has 74 __always_inlines, none are documented, I don't know
> > why they're present, many are probably wrong.
> >
> > Shit, uninlining only __get_user_pages_locked does this:
> >
> > text data bss dec hex filename
> > 115703 14018 64 129785 1faf9 mm/gup.o
> > 103866 13058 64 116988 1c8fc mm/gup.o-after
>
> The next questions are does anything actually run faster (either way),
> and should anything at all be marked 'inline' rather than 'always_inline'.
>
> After all, if you call a function twice (not in a loop) you may
> want a real function in order to avoid I-cache misses.

yup

> But I'm sure there is a lot of code that is 'inline_for_bloat' :-)

ooh, can we please have that?

I do think that every always_inline should be justified and commented,
but I haven't been energetic about asking for that.

A fun little project would be to go through each one, figure out whether
there were good reasons and if not, just remove them and see if anyone
explains why that was incorrect.
On Sun, 18 Jan 2026 16:01:25 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Sun, 18 Jan 2026 22:58:02 +0000 David Laight <david.laight.linux@gmail.com> wrote:
>
> > > mm/ alone has 74 __always_inlines, none are documented, I don't know
> > > why they're present, many are probably wrong.
> > >
> > > Shit, uninlining only __get_user_pages_locked does this:
> > >
> > > text data bss dec hex filename
> > > 115703 14018 64 129785 1faf9 mm/gup.o
> > > 103866 13058 64 116988 1c8fc mm/gup.o-after
> >
> > The next questions are does anything actually run faster (either way),
> > and should anything at all be marked 'inline' rather than 'always_inline'.
> >
> > After all, if you call a function twice (not in a loop) you may
> > want a real function in order to avoid I-cache misses.
>
> yup

I had two adjacent strlen() calls in a bit of code, the first was an
array (in a structure) and gcc inlined the 'word at a time' code, the
second was a pointer and it called the library function.
That had to be sub-optimal...

> > But I'm sure there is a lot of code that is 'inline_for_bloat' :-)
>
> ooh, can we please have that?

Or 'inline_to_speed_up_benchmark' and the associated 'unroll this loop
because that must make it faster'.

> I do think that every always_inline should be justified and commented,
> but I haven't been energetic about asking for that.

Apart from the 4-line functions where it is clearly obvious.
Especially since the compiler can still decide to not-inline them
if they are only 'inline'.

> A fun little project would be go through each one, figure out whether
> were good reasons and if not, just remove them and see if anyone
> explains why that was incorrect.

It's not just always_inline, a lot of the inline are dubious.
Probably why the networking code doesn't like it.

Maybe persuade Linus to do some of that.
He can use his 'god' bit to just change them.

David
On Mon, Jan 19, 2026 at 10:33 AM David Laight
<david.laight.linux@gmail.com> wrote:
>
> On Sun, 18 Jan 2026 16:01:25 -0800
> Andrew Morton <akpm@linux-foundation.org> wrote:
>
> > On Sun, 18 Jan 2026 22:58:02 +0000 David Laight <david.laight.linux@gmail.com> wrote:
> >
> > > > mm/ alone has 74 __always_inlines, none are documented, I don't know
> > > > why they're present, many are probably wrong.
> > > >
> > > > Shit, uninlining only __get_user_pages_locked does this:
> > > >
> > > > text data bss dec hex filename
> > > > 115703 14018 64 129785 1faf9 mm/gup.o
> > > > 103866 13058 64 116988 1c8fc mm/gup.o-after
> > >
> > > The next questions are does anything actually run faster (either way),
> > > and should anything at all be marked 'inline' rather than 'always_inline'.
> > >
> > > After all, if you call a function twice (not in a loop) you may
> > > want a real function in order to avoid I-cache misses.
> >
> > yup
>
> I had two adjacent strlen() calls in a bit of code, the first was an
> array (in a structure) and gcc inlined the 'word at a time' code, the
> second was a pointer and it called the library function.
> That had to be sub-optimal...
>
> > > But I'm sure there is a lot of code that is 'inline_for_bloat' :-)
> >
> > ooh, can we please have that?
>
> Or 'inline_to_speed_up_benchmark' and the associated 'unroll this loop
> because that must make it faster'.
>
> > I do think that every always_inline should be justified and commented,
> > but I haven't been energetic about asking for that.
>
> Apart from the 4-line functions where it is clearly obvious.
> Especially since the compiler can still decide to not-inline them
> if they are only 'inline'.
>
> > A fun little project would be go through each one, figure out whether
> > were good reasons and if not, just remove them and see if anyone
> > explains why that was incorrect.
>
> It's not just always_inline, a lot of the inline are dubious.
> Probably why the networking code doesn't like it.

Many __always_inline came because of clang's reluctance to inline
small things, even if the resulting code size is bigger and slower.

It is a bit unclear, this seems to happen when callers are 'big
enough'. noinstr (callers) functions are also a problem.

Let's take the list_add() call from dev_gro_receive() : clang does not
inline it, for some reason.

After adding __always_inline to list_add() and __list_add() we have
smaller and more efficient code,
for real workloads, not only benchmarks.
$ scripts/bloat-o-meter -t net/core/gro.o.old net/core/gro.o.new add/remove: 2/4 grow/shrink: 2/1 up/down: 86/-130 (-44) Function old new delta dev_gro_receive 1795 1845 +50 .Ltmp93 - 16 +16 .Ltmp89 - 16 +16 napi_gro_frags 968 972 +4 .Ltmp94 16 - -16 .Ltmp90 16 - -16 .Ltmp83 16 - -16 .Ltmp0 8396 8364 -32 list_add 50 - -50 Over the whole kernel (and also using __always_inline for list_add_tail(), __list_del(), list_del()) we have a similar outcome: $ size vmlinux.old vmlinux.new text data bss dec hex filename 39037635 23688605 4254712 66980952 3fe0c58 vmlinux.old 39035644 23688605 4254712 66978961 3fe0491 vmlinux.new $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 2/6 grow/shrink: 103/52 up/down: 6179/-6473 (-294) Function old new delta __list_del_entry - 672 +672 __do_semtimedop 1819 2204 +385 __pfx___list_del_entry - 256 +256 tracer_alloc_buffers 760 920 +160 ext4_orphan_add 1122 1254 +132 madvise_cold_or_pageout_pte_range 2202 2328 +126 iommu_sva_bind_device 744 865 +121 copy_page_range 12353 12473 +120 psi_trigger_create 767 886 +119 power_supply_register_extension 540 658 +118 srcu_gp_start_if_needed 1440 1548 +108 fanout_add 996 1097 +101 kvm_dev_ioctl 1507 1599 +92 relay_open 681 761 +80 lru_lazyfree 784 863 +79 input_register_device 1602 1680 +78 seccomp_notify_ioctl 1882 1959 +77 __iommu_probe_device 1203 1279 +76 spi_register_controller 1748 1823 +75 copy_process 4112 4186 +74 newseg 825 898 +73 optimize_kprobe 210 282 +72 do_msgsnd 1290 1362 +72 __bmc_get_device_id 3332 3404 +72 iscsi_queuecommand 948 1019 +71 hci_register_dev 667 737 +70 tcf_register_action 518 586 +68 memcg_write_event_control 1083 1147 +64 handle_one_recv_msg 3307 3371 +64 btf_module_notify 1669 1733 +64 tcf_mirred_init 1229 1292 +63 register_stat_tracer 340 403 +63 qdisc_get_stab 596 658 +62 io_submit_one 2056 2117 +61 fuse_dev_do_read 1186 1247 +61 bpf_map_offload_map_alloc 622 683 +61 zswap_setup 584 644 +60 register_acpi_bus_type 109 169 +60 ipv6_add_addr 1007 1067 +60 mraid_mm_register_adp 1379 1438 +59 mlock_folio_batch 3137 3196 +59 register_ife_op 230 288 +58 perf_event_alloc 2591 2649 +58 intel_iommu_domain_alloc_nested 498 556 +58 lru_deactivate_file 1280 1336 +56 fib_create_info 2567 2623 +56 iscsi_register_transport 533 588 +55 ext4_register_li_request 448 503 +55 do_msgrcv 1536 1589 +53 net_devmem_bind_dmabuf 1036 1088 +52 init_one_iommu 309 360 +51 shmem_get_folio_gfp 1379 1429 +50 dev_gro_receive 1795 1845 +50 .Ltmp68 160 208 +48 .Ltmp337 48 96 +48 .Ltmp204 48 96 +48 dm_get_device 501 548 +47 vfs_move_mount 509 555 +46 intel_nested_attach_dev 391 435 +44 fw_devlink_dev_sync_state 212 255 +43 bond_ipsec_add_sa 406 447 +41 bd_link_disk_holder 459 499 +40 bcm_tx_setup 1414 1454 +40 devl_linecard_create 401 440 +39 copy_tree 677 716 +39 fscrypt_setup_encryption_info 1479 1517 +38 region_del 631 668 +37 devlink_nl_rate_new_doit 501 536 +35 .Ltmp57 48 80 +32 .Ltmp126 48 80 +32 netdev_run_todo 1341 1372 +31 ipmi_create_user 411 442 +31 pci_register_host_bridge 1675 1705 +30 p9_read_work 961 990 +29 acpi_device_add 859 888 +29 mntput_no_expire_slowpath 603 631 +28 vmemmap_remap_pte 499 524 +25 move_cluster 160 184 +24 handle_userfault 2035 2059 +24 rdtgroup_mkdir 1500 1522 +22 clk_notifier_register 474 489 +15 bpf_event_notify 311 326 +15 dm_register_path_selector 245 256 +11 bpf_crypto_register_type 209 220 +11 usb_driver_set_configuration 232 242 +10 parse_gate_list 912 921 +9 devl_rate_leaf_create 266 274 +8 bt_accept_enqueue 465 473 +8 __flush_workqueue 1233 1240 +7 dev_forward_change 
772 778 +6 thermal_add_hwmon_sysfs 861 865 +4 set_node_memory_tier 1027 1031 +4 napi_gro_frags 968 972 +4 mei_cl_notify_request 1067 1071 +4 mb_cache_shrink 447 451 +4 kfence_guarded_free 764 768 +4 blk_mq_try_issue_directly 606 610 +4 __neigh_update 2452 2456 +4 __folio_freeze_and_split_unmapped 3198 3202 +4 rwsem_down_write_slowpath 1595 1598 +3 eventfs_create_dir 477 480 +3 alloc_vmap_area 1967 1970 +3 unix_add_edges 618 620 +2 hid_connect 1529 1530 +1 dwc_prep_dma_memcpy 637 638 +1 scsi_eh_test_devices 700 699 -1 acpi_extract_power_resources 559 558 -1 deactivate_slab 758 756 -2 __team_options_change_check 220 218 -2 sock_map_update_common 524 521 -3 rcu_nocb_gp_kthread 2710 2707 -3 memsw_cgroup_usage_register_event 19 16 -3 kthread 564 561 -3 st_add_path 457 453 -4 flow_block_cb_setup_simple 543 539 -4 ep_try_send_events 875 871 -4 elv_register 524 520 -4 __mptcp_move_skbs_from_subflow 1318 1314 -4 __check_limbo 460 456 -4 __bpf_list_add 277 273 -4 trim_marked 375 370 -5 megaraid_mbox_runpendq 345 340 -5 ipmi_timeout_work 1813 1808 -5 handle_new_recv_msgs 419 414 -5 mei_cl_send_disconnect 186 180 -6 mei_cl_send_connect 186 180 -6 mptcp_sendmsg 1776 1769 -7 deferred_split_folio 561 554 -7 pcibios_allocate_resources 879 871 -8 find_css_set 1690 1682 -8 af_alg_sendmsg 2384 2376 -8 isolate_migratepages_block 4113 4104 -9 posixtimer_send_sigqueue 950 939 -11 mei_irq_write_handler 1350 1338 -12 dpm_prepare 1173 1161 -12 dpll_xa_ref_pin_add 679 667 -12 css_set_move_task 513 501 -12 cache_mark 583 571 -12 scsi_queue_rq 3449 3434 -15 mtd_queue_rq 1068 1053 -15 link_css_set 323 308 -15 key_garbage_collector 1119 1104 -15 do_dma_probe 1664 1649 -15 configfs_new_dirent 306 290 -16 .Ltmp69 208 192 -16 xfrm_state_walk 677 660 -17 __dpll_pin_register 761 742 -19 i2c_do_add_adapter 1018 998 -20 complete_io 421 401 -20 fsnotify_insert_event 390 368 -22 scsi_eh_ready_devs 3026 2997 -29 .Ltmp71 128 96 -32 .Ltmp58 128 96 -32 .Ltmp127 80 48 -32 migrate_pages_batch 4623 4584 -39 .Ltmp338 96 48 -48 .Ltmp205 96 48 -48 __pfx_list_del 256 - -256 __pfx_list_add_tail 384 - -384 __pfx_list_add 656 - -656 list_del 944 - -944 list_add_tail 1360 - -1360 list_add 2212 - -2212 Total: Before=25509319, After=25509025, chg -0.00%
On Mon, 19 Jan 2026 11:25:52 +0100
Eric Dumazet <edumazet@google.com> wrote:

> On Mon, Jan 19, 2026 at 10:33 AM David Laight
> <david.laight.linux@gmail.com> wrote:
> >
> > On Sun, 18 Jan 2026 16:01:25 -0800
> > Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > > On Sun, 18 Jan 2026 22:58:02 +0000 David Laight <david.laight.linux@gmail.com> wrote:
> > >
> > > > > mm/ alone has 74 __always_inlines, none are documented, I don't know
> > > > > why they're present, many are probably wrong.
> > > > >
> > > > > Shit, uninlining only __get_user_pages_locked does this:
> > > > >
> > > > > text data bss dec hex filename
> > > > > 115703 14018 64 129785 1faf9 mm/gup.o
> > > > > 103866 13058 64 116988 1c8fc mm/gup.o-after
> > > >
> > > > The next questions are does anything actually run faster (either way),
> > > > and should anything at all be marked 'inline' rather than 'always_inline'.
> > > >
> > > > After all, if you call a function twice (not in a loop) you may
> > > > want a real function in order to avoid I-cache misses.
> > >
> > > yup
> >
> > I had two adjacent strlen() calls in a bit of code, the first was an
> > array (in a structure) and gcc inlined the 'word at a time' code, the
> > second was a pointer and it called the library function.
> > That had to be sub-optimal...
> >
> > > > But I'm sure there is a lot of code that is 'inline_for_bloat' :-)
> > >
> > > ooh, can we please have that?
> >
> > Or 'inline_to_speed_up_benchmark' and the associated 'unroll this loop
> > because that must make it faster'.
> >
> > > I do think that every always_inline should be justified and commented,
> > > but I haven't been energetic about asking for that.
> >
> > Apart from the 4-line functions where it is clearly obvious.
> > Especially since the compiler can still decide to not-inline them
> > if they are only 'inline'.
> >
> > > A fun little project would be go through each one, figure out whether
> > > were good reasons and if not, just remove them and see if anyone
> > > explains why that was incorrect.
> >
> > It's not just always_inline, a lot of the inline are dubious.
> > Probably why the networking code doesn't like it.
>
> Many __always_inline came because of clang's reluctance to inline
> small things, even if the resulting code size is bigger and slower.
>
> It is a bit unclear, this seems to happen when callers are 'big
> enough'. noinstr (callers) functions are also a problem.
>
> Let's take the list_add() call from dev_gro_receive() : clang does not
> inline it, for some reason.
>
> After adding __always_inline to list_add() and __list_add() we have
> smaller and more efficient code,
> for real workloads, not only benchmarks.

That falls into the '4-line function' category.
Where s/inline/always_inline/ makes sense.

> list_add 2212 - -2212

How many copies of list_add() is that... clearly a few.
Generating a real function for a 'static inline' in a header is stupid.
Pretty much the intent for those is to get them inlined.

I'm sure there was a suggestion to make inline mean 'always inline',
except there are places where it would just be bloat.

David
On Mon, Jan 19, 2026 at 11:25 AM Eric Dumazet <edumazet@google.com> wrote: > > On Mon, Jan 19, 2026 at 10:33 AM David Laight > <david.laight.linux@gmail.com> wrote: > > > > On Sun, 18 Jan 2026 16:01:25 -0800 > > Andrew Morton <akpm@linux-foundation.org> wrote: > > > > > On Sun, 18 Jan 2026 22:58:02 +0000 David Laight <david.laight.linux@gmail.com> wrote: > > > > > > > > mm/ alone has 74 __always_inlines, none are documented, I don't know > > > > > why they're present, many are probably wrong. > > > > > > > > > > Shit, uninlining only __get_user_pages_locked does this: > > > > > > > > > > text data bss dec hex filename > > > > > 115703 14018 64 129785 1faf9 mm/gup.o > > > > > 103866 13058 64 116988 1c8fc mm/gup.o-after > > > > > > > > The next questions are does anything actually run faster (either way), > > > > and should anything at all be marked 'inline' rather than 'always_inline'. > > > > > > > > After all, if you call a function twice (not in a loop) you may > > > > want a real function in order to avoid I-cache misses. > > > > > > yup > > > > I had two adjacent strlen() calls in a bit of code, the first was an > > array (in a structure) and gcc inlined the 'word at a time' code, the > > second was a pointer and it called the library function. > > That had to be sub-optimal... > > > > > > But I'm sure there is a lot of code that is 'inline_for_bloat' :-) > > > > > > ooh, can we please have that? > > > > Or 'inline_to_speed_up_benchmark' and the associated 'unroll this loop > > because that must make it faster'. > > > > > I do think that every always_inline should be justified and commented, > > > but I haven't been energetic about asking for that. > > > > Apart from the 4-line functions where it is clearly obvious. > > Especially since the compiler can still decide to not-inline them > > if they are only 'inline'. > > > > > A fun little project would be go through each one, figure out whether > > > were good reasons and if not, just remove them and see if anyone > > > explains why that was incorrect. > > > > It's not just always_inline, a lot of the inline are dubious. > > Probably why the networking code doesn't like it. > > Many __always_inline came because of clang's reluctance to inline > small things, even if the resulting code size is bigger and slower. > > It is a bit unclear, this seems to happen when callers are 'big > enough'. noinstr (callers) functions are also a problem. > > Let's take the list_add() call from dev_gro_receive() : clang does not > inline it, for some reason. > > After adding __always_inline to list_add() and __list_add() we have > smaller and more efficient code, > for real workloads, not only benchmarks. 
> > $ scripts/bloat-o-meter -t net/core/gro.o.old net/core/gro.o.new > add/remove: 2/4 grow/shrink: 2/1 up/down: 86/-130 (-44) > Function old new delta > dev_gro_receive 1795 1845 +50 > .Ltmp93 - 16 +16 > .Ltmp89 - 16 +16 > napi_gro_frags 968 972 +4 > .Ltmp94 16 - -16 > .Ltmp90 16 - -16 > .Ltmp83 16 - -16 > .Ltmp0 8396 8364 -32 > list_add 50 - -50 > > Over the whole kernel (and also using __always_inline for > list_add_tail(), __list_del(), list_del()) we have a similar outcome: > > $ size vmlinux.old vmlinux.new > text data bss dec hex filename > 39037635 23688605 4254712 66980952 3fe0c58 vmlinux.old > 39035644 23688605 4254712 66978961 3fe0491 vmlinux.new > $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new > add/remove: 2/6 grow/shrink: 103/52 up/down: 6179/-6473 (-294) > Function old new delta > __list_del_entry - 672 +672 Ah, and of course after adding __always_inline to __list_del_entry () as well we have something even better. $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 1/6 grow/shrink: 105/51 up/down: 5838/-6464 (-626) Function old new delta __do_semtimedop 1819 2204 +385 tracer_alloc_buffers 760 920 +160 __bmc_get_device_id 3332 3486 +154 power_supply_register_extension 540 692 +152 ext4_orphan_add 1122 1254 +132 madvise_cold_or_pageout_pte_range 2202 2328 +126 lru_lazyfree 784 906 +122 iommu_sva_bind_device 744 865 +121 copy_page_range 12353 12473 +120 psi_trigger_create 767 886 +119 srcu_gp_start_if_needed 1440 1548 +108 fanout_add 996 1101 +105 seccomp_notify_ioctl 1882 1986 +104 mlock_folio_batch 3137 3234 +97 kvm_dev_ioctl 1507 1599 +92 handle_userfault 2035 2118 +83 relay_open 681 761 +80 input_register_device 1602 1680 +78 __iommu_probe_device 1203 1279 +76 spi_register_controller 1748 1823 +75 copy_process 4112 4186 +74 newseg 825 898 +73 optimize_kprobe 210 282 +72 do_msgsnd 1290 1362 +72 iscsi_queuecommand 948 1019 +71 hci_register_dev 667 737 +70 rdtgroup_mkdir 1500 1569 +69 p9_read_work 961 1030 +69 tcf_register_action 518 586 +68 mntput_no_expire_slowpath 603 669 +66 netdev_run_todo 1341 1405 +64 memcg_write_event_control 1083 1147 +64 handle_one_recv_msg 3307 3371 +64 btf_module_notify 1669 1733 +64 tcf_mirred_init 1229 1292 +63 register_stat_tracer 340 403 +63 qdisc_get_stab 596 658 +62 io_submit_one 2056 2117 +61 fuse_dev_do_read 1186 1247 +61 bpf_map_offload_map_alloc 622 683 +61 zswap_setup 584 644 +60 register_acpi_bus_type 109 169 +60 ipv6_add_addr 1007 1067 +60 mraid_mm_register_adp 1379 1438 +59 register_ife_op 230 288 +58 perf_event_alloc 2591 2649 +58 intel_iommu_domain_alloc_nested 498 556 +58 do_msgrcv 1536 1593 +57 lru_deactivate_file 1280 1336 +56 fib_create_info 2567 2623 +56 acpi_device_add 859 915 +56 iscsi_register_transport 533 588 +55 ext4_register_li_request 448 503 +55 net_devmem_bind_dmabuf 1036 1088 +52 init_one_iommu 309 360 +51 shmem_get_folio_gfp 1379 1429 +50 dev_gro_receive 1795 1845 +50 .Ltmp68 160 208 +48 .Ltmp337 48 96 +48 .Ltmp204 48 96 +48 vmemmap_remap_pte 499 546 +47 dm_get_device 501 548 +47 vfs_move_mount 509 555 +46 intel_nested_attach_dev 391 435 +44 fw_devlink_dev_sync_state 212 255 +43 bond_ipsec_add_sa 406 447 +41 bd_link_disk_holder 459 499 +40 bcm_tx_setup 1414 1454 +40 devl_linecard_create 401 440 +39 copy_tree 677 716 +39 fscrypt_setup_encryption_info 1479 1517 +38 region_del 631 668 +37 devlink_nl_rate_new_doit 501 536 +35 __down_common 549 583 +34 __softirqentry_text_end - 32 +32 .Ltmp57 48 80 +32 .Ltmp126 48 80 +32 ipmi_create_user 411 442 +31 pci_register_host_bridge 1675 1705 +30 move_cluster 160 
184 +24 isolate_migratepages_block 4113 4136 +23 clk_notifier_register 474 489 +15 bpf_event_notify 311 326 +15 dm_register_path_selector 245 256 +11 bpf_crypto_register_type 209 220 +11 usb_driver_set_configuration 232 242 +10 parse_gate_list 912 921 +9 devl_rate_leaf_create 266 274 +8 bt_accept_enqueue 465 473 +8 __flush_workqueue 1233 1240 +7 dev_forward_change 772 778 +6 thermal_add_hwmon_sysfs 861 865 +4 set_node_memory_tier 1027 1031 +4 napi_gro_frags 968 972 +4 mei_cl_notify_request 1067 1071 +4 mb_cache_shrink 447 451 +4 kfence_guarded_free 764 768 +4 blk_mq_try_issue_directly 606 610 +4 __neigh_update 2452 2456 +4 __folio_freeze_and_split_unmapped 3198 3202 +4 rwsem_down_write_slowpath 1595 1598 +3 eventfs_create_dir 477 480 +3 alloc_vmap_area 1967 1970 +3 unix_add_edges 618 620 +2 hid_connect 1529 1530 +1 dwc_prep_dma_memcpy 637 638 +1 scsi_eh_test_devices 700 699 -1 acpi_extract_power_resources 559 558 -1 deactivate_slab 758 756 -2 __team_options_change_check 220 218 -2 sock_map_update_common 524 521 -3 rcu_nocb_gp_kthread 2710 2707 -3 memsw_cgroup_usage_register_event 19 16 -3 kthread 564 561 -3 st_add_path 457 453 -4 flow_block_cb_setup_simple 543 539 -4 ep_try_send_events 875 871 -4 elv_register 524 520 -4 __mptcp_move_skbs_from_subflow 1318 1314 -4 __check_limbo 460 456 -4 __bpf_list_add 277 273 -4 trim_marked 375 370 -5 megaraid_mbox_runpendq 345 340 -5 ipmi_timeout_work 1813 1808 -5 handle_new_recv_msgs 419 414 -5 mei_cl_send_disconnect 186 180 -6 mei_cl_send_connect 186 180 -6 mptcp_sendmsg 1776 1769 -7 deferred_split_folio 561 554 -7 pcibios_allocate_resources 879 871 -8 find_css_set 1690 1682 -8 af_alg_sendmsg 2384 2376 -8 posixtimer_send_sigqueue 950 939 -11 mei_irq_write_handler 1350 1338 -12 dpm_prepare 1173 1161 -12 dpll_xa_ref_pin_add 679 667 -12 css_set_move_task 513 501 -12 cache_mark 583 571 -12 scsi_queue_rq 3449 3434 -15 mtd_queue_rq 1068 1053 -15 link_css_set 323 308 -15 key_garbage_collector 1119 1104 -15 do_dma_probe 1664 1649 -15 configfs_new_dirent 306 290 -16 .Ltmp69 208 192 -16 xfrm_state_walk 677 660 -17 __dpll_pin_register 761 742 -19 i2c_do_add_adapter 1018 998 -20 complete_io 421 401 -20 fsnotify_insert_event 390 368 -22 scsi_eh_ready_devs 3026 2997 -29 .Ltmp71 128 96 -32 .Ltmp58 128 96 -32 .Ltmp127 80 48 -32 migrate_pages_batch 4623 4584 -39 .Ltmp338 96 48 -48 .Ltmp205 96 48 -48 __pfx_list_del 256 - -256 __pfx_list_add_tail 384 - -384 __pfx_list_add 656 - -656 list_del 944 - -944 list_add_tail 1360 - -1360 list_add 2212 - -2212 Total: Before=25509319, After=25508693, chg -0.00%
On Sun, Jan 18, 2026 at 8:47 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Sun, 18 Jan 2026 15:24:48 +0000 Eric Dumazet <edumazet@google.com> wrote:
>
> > inline keyword is often ignored by compilers.
> >
> > We need something slightly stronger in networking fast paths
> > but __always_inline is too strong.
> >
> > Instead, generalize idea Nicolas used in commit d533cb2d2af4
> > ("__arch_xprod64(): make __always_inline when optimizing for performance")
> >
> > This will help CONFIG_CC_OPTIMIZE_FOR_SIZE=y users keeping
> > their kernels small.
>
> This is good. __always_inline is ambiguous and the name lacks
> commentary value.
>
> If we take away __always_inline's for-performance role then what
> remains? __always_inline is for tricky things where the compiler needs
> to be coerced into doing what we want?
Some functions should not be out-of-line, even if
CONFIG_CC_OPTIMIZE_FOR_SIZE=y.
A case-by-case study would be needed.
>
> IOW, I wonder if we should take your concept further, create more
> fine-grained controls over this which have self-explanatory names.
>
>
>
> mm/ alone has 74 __always_inlines, none are documented, I don't know
> why they're present, many are probably wrong.
>
> Shit, uninlining only __get_user_pages_locked does this:
>
> text data bss dec hex filename
> 115703 14018 64 129785 1faf9 mm/gup.o
> 103866 13058 64 116988 1c8fc mm/gup.o-after
mm/slub.c has __fastpath_inline, depending on CONFIG_SLUB_TINY.
This could probably also depend on CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
->
diff --git a/mm/slub.c b/mm/slub.c
index 861592ac54257b9d148ff921e6d8f62aced607b3..a8ca150a90355dd7a812f390c068ff9a7ccc2562
100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -224,7 +224,7 @@ do { \
#endif
#ifndef CONFIG_SLUB_TINY
-#define __fastpath_inline __always_inline
+#define __fastpath_inline inline_for_performance
#else
#define __fastpath_inline
#endif
Hi Eric,
kernel test robot noticed the following build warnings:
[auto build test WARNING on e84d960149e71e8d5e4db69775ce31305898ed0c]
url: https://github.com/intel-lab-lkp/linux/commits/Eric-Dumazet/compiler_types-Introduce-inline_for_performance/20260118-232653
base: e84d960149e71e8d5e4db69775ce31305898ed0c
patch link: https://lore.kernel.org/r/20260118152448.2560414-1-edumazet%40google.com
patch subject: [PATCH] compiler_types: Introduce inline_for_performance
config: m68k-amcore_defconfig (https://download.01.org/0day-ci/archive/20260119/202601190247.dDAvbbMH-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260119/202601190247.dDAvbbMH-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601190247.dDAvbbMH-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from arch/m68k/include/asm/div64.h:6,
from include/linux/math.h:6,
from include/linux/kernel.h:27,
from arch/m68k/coldfire/cache.c:12:
>> include/asm-generic/div64.h:138:10: warning: '__arch_xprod_64' defined but not used [-Wunused-function]
138 | uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
| ^~~~~~~~~~~~~~~
vim +/__arch_xprod_64 +138 include/asm-generic/div64.h
461a5e51060c93 Nicolas Pitre 2015-10-30 125
f682b27c57aec2 Nicolas Pitre 2015-10-30 126 #ifndef __arch_xprod_64
f682b27c57aec2 Nicolas Pitre 2015-10-30 127 /*
f682b27c57aec2 Nicolas Pitre 2015-10-30 128 * Default C implementation for __arch_xprod_64()
f682b27c57aec2 Nicolas Pitre 2015-10-30 129 *
f682b27c57aec2 Nicolas Pitre 2015-10-30 130 * Prototype: uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
f682b27c57aec2 Nicolas Pitre 2015-10-30 131 * Semantic: retval = ((bias ? m : 0) + m * n) >> 64
f682b27c57aec2 Nicolas Pitre 2015-10-30 132 *
f682b27c57aec2 Nicolas Pitre 2015-10-30 133 * The product is a 128-bit value, scaled down to 64 bits.
00a31dd3acea0f Nicolas Pitre 2024-10-03 134 * Hoping for compile-time optimization of conditional code.
f682b27c57aec2 Nicolas Pitre 2015-10-30 135 * Architectures may provide their own optimized assembly implementation.
f682b27c57aec2 Nicolas Pitre 2015-10-30 136 */
5f712d70e20a46 Eric Dumazet 2026-01-18 137 static inline_for_performance
d533cb2d2af400 Nicolas Pitre 2024-10-03 @138 uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
f682b27c57aec2 Nicolas Pitre 2015-10-30 139 {
f682b27c57aec2 Nicolas Pitre 2015-10-30 140 uint32_t m_lo = m;
f682b27c57aec2 Nicolas Pitre 2015-10-30 141 uint32_t m_hi = m >> 32;
f682b27c57aec2 Nicolas Pitre 2015-10-30 142 uint32_t n_lo = n;
f682b27c57aec2 Nicolas Pitre 2015-10-30 143 uint32_t n_hi = n >> 32;
00a31dd3acea0f Nicolas Pitre 2024-10-03 144 uint64_t x, y;
f682b27c57aec2 Nicolas Pitre 2015-10-30 145
00a31dd3acea0f Nicolas Pitre 2024-10-03 146 /* Determine if overflow handling can be dispensed with. */
00a31dd3acea0f Nicolas Pitre 2024-10-03 147 bool no_ovf = __builtin_constant_p(m) &&
00a31dd3acea0f Nicolas Pitre 2024-10-03 148 ((m >> 32) + (m & 0xffffffff) < 0x100000000);
f682b27c57aec2 Nicolas Pitre 2015-10-30 149
00a31dd3acea0f Nicolas Pitre 2024-10-03 150 if (no_ovf) {
00a31dd3acea0f Nicolas Pitre 2024-10-03 151 x = (uint64_t)m_lo * n_lo + (bias ? m : 0);
00a31dd3acea0f Nicolas Pitre 2024-10-03 152 x >>= 32;
00a31dd3acea0f Nicolas Pitre 2024-10-03 153 x += (uint64_t)m_lo * n_hi;
00a31dd3acea0f Nicolas Pitre 2024-10-03 154 x += (uint64_t)m_hi * n_lo;
00a31dd3acea0f Nicolas Pitre 2024-10-03 155 x >>= 32;
00a31dd3acea0f Nicolas Pitre 2024-10-03 156 x += (uint64_t)m_hi * n_hi;
f682b27c57aec2 Nicolas Pitre 2015-10-30 157 } else {
00a31dd3acea0f Nicolas Pitre 2024-10-03 158 x = (uint64_t)m_lo * n_lo + (bias ? m_lo : 0);
00a31dd3acea0f Nicolas Pitre 2024-10-03 159 y = (uint64_t)m_lo * n_hi + (uint32_t)(x >> 32) + (bias ? m_hi : 0);
00a31dd3acea0f Nicolas Pitre 2024-10-03 160 x = (uint64_t)m_hi * n_hi + (uint32_t)(y >> 32);
00a31dd3acea0f Nicolas Pitre 2024-10-03 161 y = (uint64_t)m_hi * n_lo + (uint32_t)y;
00a31dd3acea0f Nicolas Pitre 2024-10-03 162 x += (uint32_t)(y >> 32);
f682b27c57aec2 Nicolas Pitre 2015-10-30 163 }
f682b27c57aec2 Nicolas Pitre 2015-10-30 164
00a31dd3acea0f Nicolas Pitre 2024-10-03 165 return x;
f682b27c57aec2 Nicolas Pitre 2015-10-30 166 }
f682b27c57aec2 Nicolas Pitre 2015-10-30 167 #endif
f682b27c57aec2 Nicolas Pitre 2015-10-30 168
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Mon, 19 Jan 2026 02:36:18 +0800
kernel test robot <lkp@intel.com> wrote:
> Hi Eric,
...
> vim +/__arch_xprod_64 +138 include/asm-generic/div64.h
>
> 461a5e51060c93 Nicolas Pitre 2015-10-30 125
> f682b27c57aec2 Nicolas Pitre 2015-10-30 126 #ifndef __arch_xprod_64
> f682b27c57aec2 Nicolas Pitre 2015-10-30 127 /*
> f682b27c57aec2 Nicolas Pitre 2015-10-30 128 * Default C implementation for __arch_xprod_64()
> f682b27c57aec2 Nicolas Pitre 2015-10-30 129 *
> f682b27c57aec2 Nicolas Pitre 2015-10-30 130 * Prototype: uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
> f682b27c57aec2 Nicolas Pitre 2015-10-30 131 * Semantic: retval = ((bias ? m : 0) + m * n) >> 64
> f682b27c57aec2 Nicolas Pitre 2015-10-30 132 *
> f682b27c57aec2 Nicolas Pitre 2015-10-30 133 * The product is a 128-bit value, scaled down to 64 bits.
> 00a31dd3acea0f Nicolas Pitre 2024-10-03 134 * Hoping for compile-time optimization of conditional code.
> f682b27c57aec2 Nicolas Pitre 2015-10-30 135 * Architectures may provide their own optimized assembly implementation.
> f682b27c57aec2 Nicolas Pitre 2015-10-30 136 */
> 5f712d70e20a46 Eric Dumazet 2026-01-18 137 static inline_for_performance
> d533cb2d2af400 Nicolas Pitre 2024-10-03 @138 uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
> f682b27c57aec2 Nicolas Pitre 2015-10-30 139 {
> f682b27c57aec2 Nicolas Pitre 2015-10-30 140 uint32_t m_lo = m;
> f682b27c57aec2 Nicolas Pitre 2015-10-30 141 uint32_t m_hi = m >> 32;
> f682b27c57aec2 Nicolas Pitre 2015-10-30 142 uint32_t n_lo = n;
> f682b27c57aec2 Nicolas Pitre 2015-10-30 143 uint32_t n_hi = n >> 32;
> 00a31dd3acea0f Nicolas Pitre 2024-10-03 144 uint64_t x, y;
> f682b27c57aec2 Nicolas Pitre 2015-10-30 145
> 00a31dd3acea0f Nicolas Pitre 2024-10-03 146 /* Determine if overflow handling can be dispensed with. */
> 00a31dd3acea0f Nicolas Pitre 2024-10-03 147 bool no_ovf = __builtin_constant_p(m) &&
> 00a31dd3acea0f Nicolas Pitre 2024-10-03 148 ((m >> 32) + (m & 0xffffffff) < 0x100000000);
Can that ever have got compiled?
Won't the compiler complain about 0x100000000 being out of range?
Lots of alternatives...
If u128 exists this should probably just be:
return ((u128)m * n + (bias ? m : 0)) >> 64;
Which is probably the only alternative an architecture might provide (none do AFAICT).
David
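
For reference, a sketch of that alternative as an architecture override;
this assumes a compiler-provided 128-bit type and is illustrative only,
since the generic helper is currently only instantiated on 32-bit builds:

#ifdef __SIZEOF_INT128__
/* Sketch only: full 128-bit product, scaled down to 64 bits. */
static inline_for_performance
uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
{
	return ((unsigned __int128)m * n + (bias ? m : 0)) >> 64;
}
#define __arch_xprod_64 __arch_xprod_64
#endif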
Eric Dumazet <edumazet@google.com> wrote:
> -#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
> -static __always_inline
> -#else
> -static inline
> -#endif
> +static inline_for_performance

..

> -#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
> -static __always_inline
> -#else
> -static inline
> -#endif
> +static inline_for_performance

..

> +#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
> +#define inline_for_performance __always_inline
> +#else
> +#define inline_for_performance
> +#endif

Should that read

#else
+#define inline_for_performance inline

instead?
On Sun, Jan 18, 2026 at 4:32 PM Florian Westphal <fw@strlen.de> wrote:
>
> Eric Dumazet <edumazet@google.com> wrote:
> > -#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
> > -static __always_inline
> > -#else
> > -static inline
> > -#endif
> > +static inline_for_performance
>
> ..
>
> > -#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
> > -static __always_inline
> > -#else
> > -static inline
> > -#endif
> > +static inline_for_performance
>
> ..
>
> > +#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
> > +#define inline_for_performance __always_inline
> > +#else
> > +#define inline_for_performance
> > +#endif
>
> Should that read
>
> #else
> +#define inline_for_performance inline
>
> instead?

Damn, of course !
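
Per Florian's review, the corrected definition would presumably read:

#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
#define inline_for_performance __always_inline
#else
#define inline_for_performance inline
#endif

Falling back to "inline" rather than dropping the keyword also picks up
the kernel's __inline_maybe_unused redefinition of inline, which is what
avoids the -Wunused-function warnings the kernel test robot reported for
the unused static helpers in div64.h.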