[PATCH] x86_64: inline csum_ipv6_magic()

Eric Dumazet posted 1 patch 2 months, 3 weeks ago
arch/x86/include/asm/checksum_64.h | 45 ++++++++++++++++++++++--------
arch/x86/lib/csum-wrappers_64.c    | 22 ---------------
2 files changed, 33 insertions(+), 34 deletions(-)
[PATCH] x86_64: inline csum_ipv6_magic()
Posted by Eric Dumazet 2 months, 3 weeks ago
Inline this small helper.

This reduces register pressure, as saddr and daddr are often
back to back in memory.

For instance code inlined in tcp6_gro_receive() will look like:

 55a:	48 03 73 28          	add    0x28(%rbx),%rsi
 55e:	8b 43 70             	mov    0x70(%rbx),%eax
 561:	29 f8                	sub    %edi,%eax
 563:	0f c8                	bswap  %eax
 565:	89 c0                	mov    %eax,%eax
 567:	48 05 00 06 00 00    	add    $0x600,%rax
 56d:	48 03 46 08          	add    0x8(%rsi),%rax
 571:	48 13 46 10          	adc    0x10(%rsi),%rax
 575:	48 13 46 18          	adc    0x18(%rsi),%rax
 579:	48 13 46 20          	adc    0x20(%rsi),%rax
 57d:	48 83 d0 00          	adc    $0x0,%rax
 581:	48 89 c6             	mov    %rax,%rsi
 584:	48 c1 ee 20          	shr    $0x20,%rsi
 588:	01 f0                	add    %esi,%eax
 58a:	83 d0 00             	adc    $0x0,%eax
 58d:	89 c6                	mov    %eax,%esi
 58f:	66 31 c0             	xor    %ax,%ax

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 arch/x86/include/asm/checksum_64.h | 45 ++++++++++++++++++++++--------
 arch/x86/lib/csum-wrappers_64.c    | 22 ---------------
 2 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 4d4a47a3a8ab2310d279f7e465032b1463200393..5bdfd2db2b5a573ff8193a4878d372c97b158f47 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -9,6 +9,7 @@
  */
 
 #include <linux/compiler.h>
+#include <linux/in6.h>
 #include <asm/byteorder.h>
 
 /**
@@ -145,6 +146,17 @@ extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len);
  */
 extern __sum16 ip_compute_csum(const void *buff, int len);
 
+static inline unsigned add32_with_carry(unsigned a, unsigned b)
+{
+	asm("addl %2,%0\n\t"
+	    "adcl $0,%0"
+	    : "=r" (a)
+	    : "0" (a), "rm" (b));
+	return a;
+}
+
+#define _HAVE_ARCH_IPV6_CSUM 1
+
 /**
  * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header.
  * @saddr: source address
@@ -158,20 +170,29 @@ extern __sum16 ip_compute_csum(const void *buff, int len);
  * Returns the unfolded 32bit checksum.
  */
 
-struct in6_addr;
+static inline __sum16 csum_ipv6_magic(
+	const struct in6_addr *_saddr, const struct in6_addr *_daddr,
+	__u32 len, __u8 proto, __wsum sum)
+{
+	const unsigned long *saddr = (const unsigned long *)_saddr;
+	const unsigned long *daddr = (const unsigned long *)_daddr;
+	__u64 sum64;
 
-#define _HAVE_ARCH_IPV6_CSUM 1
-extern __sum16
-csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
-		__u32 len, __u8 proto, __wsum sum);
+	sum64 = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
+		(__force __u64)sum;
 
-static inline unsigned add32_with_carry(unsigned a, unsigned b)
-{
-	asm("addl %2,%0\n\t"
-	    "adcl $0,%0"
-	    : "=r" (a)
-	    : "0" (a), "rm" (b));
-	return a;
+	asm("	addq %1,%[sum64]\n"
+	    "	adcq %2,%[sum64]\n"
+	    "	adcq %3,%[sum64]\n"
+	    "	adcq %4,%[sum64]\n"
+	    "	adcq $0,%[sum64]\n"
+
+	    : [sum64] "+r" (sum64)
+	    : "m" (saddr[0]), "m" (saddr[1]),
+	      "m" (daddr[0]), "m" (daddr[1]));
+
+	return csum_fold(
+	       (__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
 }
 
 #define HAVE_ARCH_CSUM_ADD
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index f4df4d241526c64a5ad2eabdcbf5f0d8d56d6fd8..831b7110b041598b9764a6647fa259e1058efef2 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -68,25 +68,3 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len)
 }
 EXPORT_SYMBOL(csum_partial_copy_nocheck);
 
-__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
-			const struct in6_addr *daddr,
-			__u32 len, __u8 proto, __wsum sum)
-{
-	__u64 rest, sum64;
-
-	rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
-		(__force __u64)sum;
-
-	asm("	addq (%[saddr]),%[sum]\n"
-	    "	adcq 8(%[saddr]),%[sum]\n"
-	    "	adcq (%[daddr]),%[sum]\n"
-	    "	adcq 8(%[daddr]),%[sum]\n"
-	    "	adcq $0,%[sum]\n"
-
-	    : [sum] "=r" (sum64)
-	    : "[sum]" (rest), [saddr] "r" (saddr), [daddr] "r" (daddr));
-
-	return csum_fold(
-	       (__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
-}
-EXPORT_SYMBOL(csum_ipv6_magic);
-- 
2.51.2.1041.gc1ab5b90ca-goog
Re: [PATCH] x86_64: inline csum_ipv6_magic()
Posted by Dave Hansen 2 months, 3 weeks ago
On 11/13/25 07:45, Eric Dumazet wrote:
> Inline this small helper.
> 
> This reduces register pressure, as saddr and daddr are often
> back to back in memory.
> 
> For instance code inlined in tcp6_gro_receive() will look like:

Could you please double check what the code growth is for this across
the tree? There are 80-ish users of csum_ipv6_magic().

Or, is there a discrete, measurable performance gain from doing this?
Re: [PATCH] x86_64: inline csum_ipv6_magic()
Posted by Eric Dumazet 2 months, 3 weeks ago
On Thu, Nov 13, 2025 at 8:26 AM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 11/13/25 07:45, Eric Dumazet wrote:
> > Inline this small helper.
> >
> > This reduces register pressure, as saddr and daddr are often
> > back to back in memory.
> >
> > For instance code inlined in tcp6_gro_receive() will look like:
>
> Could you please double check what the code growth is for this across
> the tree? There are 80-ish users of csum_ipv6_magic().

Hi Dave

Sure (allyesconfig build)

Before patch:

size vmlinux
   text    data     bss     dec     hex filename
886947242 245613190 40211540 1172771972 45e71484 vmlinux

After patch:
 size vmlinux
   text    data     bss     dec     hex filename
886947242 245613190 40211540 1172771972 45e71484 vmlinux

I found this a bit surprising, so I did a regular build (our Google
production kernel default config)

Before:

size vmlinux
   text    data     bss     dec     hex filename
34812872 22177397 5685248 62675517 3bc5a3d vmlinux

After:

 size vmlinux
   text    data     bss     dec     hex filename
34812501 22177365 5685248 62675114 3bc58aa vmlinux

So it would seem the patch saves 371 bytes for this config.

>
> Or, is there a discrete, measurable performance gain from doing this?

IPv6 incoming TCP/UDP paths call this function twice per packet, which is sad...
One call per TX packet.

Depending on the cpus I can see csum_ipv6_magic() using up to 0.75 %
of cpu cycles.
Then there is the cost in the callers, harder to measure...

Thank you.
Re: [PATCH] x86_64: inline csum_ipv6_magic()
Posted by David Laight 2 months, 3 weeks ago
On Thu, 13 Nov 2025 10:18:08 -0800
Eric Dumazet <edumazet@google.com> wrote:

> On Thu, Nov 13, 2025 at 8:26 AM Dave Hansen <dave.hansen@intel.com> wrote:
> >
> > On 11/13/25 07:45, Eric Dumazet wrote:  
> > > Inline this small helper.
> > >
> > > This reduces register pressure, as saddr and daddr are often
> > > back to back in memory.
> > >
> > > For instance code inlined in tcp6_gro_receive() will look like:  
> >
> > Could you please double check what the code growth is for this across
> > the tree? There are 80-ish users of csum_ipv6_magic().  
> 
> Hi Dave
> 
> Sure (allyesconfig build)

Doesn't allyesconfig pull in all the KASAN stuff as well?
Which makes it fairly useless for normal build tests.

	David
Re: [PATCH] x86_64: inline csum_ipv6_magic()
Posted by Eric Dumazet 2 months, 3 weeks ago
On Thu, Nov 13, 2025 at 12:03 PM David Laight
<david.laight.linux@gmail.com> wrote:
>
> On Thu, 13 Nov 2025 10:18:08 -0800
> Eric Dumazet <edumazet@google.com> wrote:
>
> > On Thu, Nov 13, 2025 at 8:26 AM Dave Hansen <dave.hansen@intel.com> wrote:
> > >
> > > On 11/13/25 07:45, Eric Dumazet wrote:
> > > > Inline this small helper.
> > > >
> > > > This reduces register pressure, as saddr and daddr are often
> > > > back to back in memory.
> > > >
> > > > For instance code inlined in tcp6_gro_receive() will look like:
> > >
> > > Could you please double check what the code growth is for this across
> > > the tree? There are 80-ish users of csum_ipv6_magic().
> >
> > Hi Dave
> >
> > Sure (allyesconfig build)
>
> Does't allyesconfig pull in all the KASAN stuff as well.
> Which makes it fairly useless for normal build tests.

This is why I added a more standard build.

I do not think we have a "make allyesconfig_but_no_debug_stuff"

BTW, inlining rb_first() also saves 744 bytes.
Re: [PATCH] x86_64: inline csum_ipv6_magic()
Posted by Dave Hansen 2 months, 3 weeks ago
On 11/13/25 10:18, Eric Dumazet wrote:
> So it would seem the patch saves 371 bytes for this config.
> 
>> Or, is there a discrete, measurable performance gain from doing this?
> IPv6 incoming TCP/UDP paths call this function twice per packet, which is sad...
> One call per TX packet.
> 
> Depending on the cpus I can see csum_ipv6_magic() using up to 0.75 %
> of cpu cycles.
> Then there is the cost in the callers, harder to measure...

Oh, wow. That's more than I was expecting. But it does make sense.
Thanks for the info. I'll stick this in the queue to apply in a month or
so after the next -rc1, unless it needs more urgency.

Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Re: [PATCH] x86_64: inline csum_ipv6_magic()
Posted by Eric Dumazet 1 month, 2 weeks ago
On Thu, Nov 13, 2025 at 7:40 PM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 11/13/25 10:18, Eric Dumazet wrote:
> > So it would seem the patch saves 371 bytes for this config.
> >
> >> Or, is there a discrete, measurable performance gain from doing this?
> > IPv6 incoming TCP/UDP paths call this function twice per packet, which is sad...
> > One call per TX packet.
> >
> > Depending on the cpus I can see csum_ipv6_magic() using up to 0.75 %
> > of cpu cycles.
> > Then there is the cost in the callers, harder to measure...
>
> Oh, wow. That's more than I was expecting. But it does make sense.
> Thanks for the info. I'll stick this in the queue to apply in a month or
> so after the next -rc1, unless it needs more urgency.
>
> Acked-by: Dave Hansen <dave.hansen@linux.intel.com>

Gentle ping, I have not seen this patch reaching the tip tree.

Thanks a lot !
[tip: x86/misc] x86/lib: Inline csum_ipv6_magic()
Posted by tip-bot2 for Eric Dumazet 1 month ago
The following commit has been merged into the x86/misc branch of tip:

Commit-ID:     529676cabcf4a5046d217bba2c8f3b94a3f6a10f
Gitweb:        https://git.kernel.org/tip/529676cabcf4a5046d217bba2c8f3b94a3f6a10f
Author:        Eric Dumazet <edumazet@google.com>
AuthorDate:    Thu, 13 Nov 2025 15:45:45 
Committer:     Dave Hansen <dave.hansen@linux.intel.com>
CommitterDate: Mon, 05 Jan 2026 10:14:05 -08:00

x86/lib: Inline csum_ipv6_magic()

Inline this small helper. It has been observed to consume up
to 0.75%, which is significant for such a small function.

This should reduce register pressure, as saddr and daddr are often
back to back in memory.

For instance code inlined in tcp6_gro_receive() will look like:

 55a:	48 03 73 28          	add    0x28(%rbx),%rsi
 55e:	8b 43 70             	mov    0x70(%rbx),%eax
 561:	29 f8                	sub    %edi,%eax
 563:	0f c8                	bswap  %eax
 565:	89 c0                	mov    %eax,%eax
 567:	48 05 00 06 00 00    	add    $0x600,%rax
 56d:	48 03 46 08          	add    0x8(%rsi),%rax
 571:	48 13 46 10          	adc    0x10(%rsi),%rax
 575:	48 13 46 18          	adc    0x18(%rsi),%rax
 579:	48 13 46 20          	adc    0x20(%rsi),%rax
 57d:	48 83 d0 00          	adc    $0x0,%rax
 581:	48 89 c6             	mov    %rax,%rsi
 584:	48 c1 ee 20          	shr    $0x20,%rsi
 588:	01 f0                	add    %esi,%eax
 58a:	83 d0 00             	adc    $0x0,%eax
 58d:	89 c6                	mov    %eax,%esi
 58f:	66 31 c0             	xor    %ax,%ax

Surprisingly, this inlining does not seem to bloat kernel text size.
In at least two cases[1], it either has no effect or results in a
slightly smaller kernel.

1. https://lore.kernel.org/all/CANn89iJzcb_XO9oCApKYfRxsMMmg7BHukRDqWTca3ZLQ8HT0iQ@mail.gmail.com/

[ dhansen: add justification and note about lack of kernel bloat ]

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://patch.msgid.link/20251113154545.594580-1-edumazet@google.com
---
 arch/x86/include/asm/checksum_64.h | 45 +++++++++++++++++++++--------
 arch/x86/lib/csum-wrappers_64.c    | 22 +--------------
 2 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 4d4a47a..5bdfd2d 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -9,6 +9,7 @@
  */
 
 #include <linux/compiler.h>
+#include <linux/in6.h>
 #include <asm/byteorder.h>
 
 /**
@@ -145,6 +146,17 @@ extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len);
  */
 extern __sum16 ip_compute_csum(const void *buff, int len);
 
+static inline unsigned add32_with_carry(unsigned a, unsigned b)
+{
+	asm("addl %2,%0\n\t"
+	    "adcl $0,%0"
+	    : "=r" (a)
+	    : "0" (a), "rm" (b));
+	return a;
+}
+
+#define _HAVE_ARCH_IPV6_CSUM 1
+
 /**
  * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header.
  * @saddr: source address
@@ -158,20 +170,29 @@ extern __sum16 ip_compute_csum(const void *buff, int len);
  * Returns the unfolded 32bit checksum.
  */
 
-struct in6_addr;
+static inline __sum16 csum_ipv6_magic(
+	const struct in6_addr *_saddr, const struct in6_addr *_daddr,
+	__u32 len, __u8 proto, __wsum sum)
+{
+	const unsigned long *saddr = (const unsigned long *)_saddr;
+	const unsigned long *daddr = (const unsigned long *)_daddr;
+	__u64 sum64;
 
-#define _HAVE_ARCH_IPV6_CSUM 1
-extern __sum16
-csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
-		__u32 len, __u8 proto, __wsum sum);
+	sum64 = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
+		(__force __u64)sum;
 
-static inline unsigned add32_with_carry(unsigned a, unsigned b)
-{
-	asm("addl %2,%0\n\t"
-	    "adcl $0,%0"
-	    : "=r" (a)
-	    : "0" (a), "rm" (b));
-	return a;
+	asm("	addq %1,%[sum64]\n"
+	    "	adcq %2,%[sum64]\n"
+	    "	adcq %3,%[sum64]\n"
+	    "	adcq %4,%[sum64]\n"
+	    "	adcq $0,%[sum64]\n"
+
+	    : [sum64] "+r" (sum64)
+	    : "m" (saddr[0]), "m" (saddr[1]),
+	      "m" (daddr[0]), "m" (daddr[1]));
+
+	return csum_fold(
+	       (__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
 }
 
 #define HAVE_ARCH_CSUM_ADD
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index f4df4d2..831b711 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -68,25 +68,3 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len)
 }
 EXPORT_SYMBOL(csum_partial_copy_nocheck);
 
-__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
-			const struct in6_addr *daddr,
-			__u32 len, __u8 proto, __wsum sum)
-{
-	__u64 rest, sum64;
-
-	rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
-		(__force __u64)sum;
-
-	asm("	addq (%[saddr]),%[sum]\n"
-	    "	adcq 8(%[saddr]),%[sum]\n"
-	    "	adcq (%[daddr]),%[sum]\n"
-	    "	adcq 8(%[daddr]),%[sum]\n"
-	    "	adcq $0,%[sum]\n"
-
-	    : [sum] "=r" (sum64)
-	    : "[sum]" (rest), [saddr] "r" (saddr), [daddr] "r" (daddr));
-
-	return csum_fold(
-	       (__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
-}
-EXPORT_SYMBOL(csum_ipv6_magic);