[v3] net: reduce RFS/ARFS flow updates by checking LLC affinity

[PATCH net-next v3] net: reduce RFS/ARFS flow updates by checking LLC affinity

Posted by Chuang Wang 1 month, 2 weeks ago

The current implementation of rps_record_sock_flow() updates the flow
table every time a socket is processed on a different CPU. In high-load
scenarios, especially with Accelerated RFS (ARFS), this triggers
frequent flow steering updates via ndo_rx_flow_steer.

For drivers like mlx5 that implement hardware flow steering, these
constant updates lead to significant contention on internal driver locks
(e.g., arfs_lock). This contention often becomes a performance
bottleneck that outweighs the steering benefits.

This patch introduces a cache-aware update strategy: the flow record is
only updated if the flow migrates across Last Level Cache (LLC)
boundaries. This minimizes expensive hardware reconfigurations while
preserving cache locality for the application. A new sysctl,
net.core.rps_feat_llc_affinity, is added to toggle this feature.

Performance Test Results:
The patch was tested in a K8s environment (AMD CPU 128*2, 16-core Pod
with CPU pinning, mlx5 NIC) using brpc[1] echo_server and rpc_press.

rpc_press Commands:

  for i in {1..8}; do
    ./rpc_press -proto=./echo.proto -method=example.EchoService.Echo
    -server=<IP>:8000 -input='{"message":"hello"}'
    -qps=0 -thread_num=512 -connection_type=pooled &
  done

Monitor mlx5e_rx_flow_steer frequency:

  /usr/share/bcc/tools/funccount -i 1 mlx5e_rx_flow_steer

Frequency of mlx5e_rx_flow_steer (via funccount[2]):

  Before: ~335,000 counts/sec
  After:   ~23,000 counts/sec (reduced by ~93%)

System Metrics (after enabling rps_feat_llc_affinity):

  CPU Utilization: 38% -> 32%
  CPU PSI (Pressure Stall Information): 20% -> 10%

These results demonstrate that filtering updates by LLC affinity
significantly reduces driver lock contention and improves overall
CPU efficiency under heavy network load.

[1] https://github.com/apache/brpc/
[2] https://github.com/iovisor/bcc/blob/master/tools/funccount.py

Signed-off-by: Chuang Wang <nashuiliang@gmail.com>
---
v2 -> v3: patch net -> net-next
v1 -> v2: add rps_feat_llc_affinity; add brpc tests

 include/net/rps.h          | 18 ++--------
 net/core/dev.c             | 72 ++++++++++++++++++++++++++++++++++++++
 net/core/sysctl_net_core.c | 34 ++++++++++++++++++
 3 files changed, 108 insertions(+), 16 deletions(-)

diff --git a/include/net/rps.h b/include/net/rps.h
index e33c6a2fa8bb..37bbb7009c36 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -12,6 +12,7 @@
 
 extern struct static_key_false rps_needed;
 extern struct static_key_false rfs_needed;
+extern struct static_key_false rps_feat_llc_affinity;
 
 /*
  * This structure holds an RPS map which can be of variable length.  The
@@ -55,22 +56,7 @@ struct rps_sock_flow_table {
 
 #define RPS_NO_CPU 0xffff
 
-static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
-{
-	unsigned int index = hash & rps_tag_to_mask(tag_ptr);
-	u32 val = hash & ~net_hotdata.rps_cpu_mask;
-	struct rps_sock_flow_table *table;
-
-	/* We only give a hint, preemption can change CPU under us */
-	val |= raw_smp_processor_id();
-
-	table = rps_tag_to_table(tag_ptr);
-	/* The following WRITE_ONCE() is paired with the READ_ONCE()
-	 * here, and another one in get_rps_cpu().
-	 */
-	if (READ_ONCE(table[index].ent) != val)
-		WRITE_ONCE(table[index].ent, val);
-}
+void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash);
 
 static inline void _sock_rps_record_flow_hash(__u32 hash)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 203dc36aaed5..630a7f21d8de 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4964,6 +4964,8 @@ struct static_key_false rps_needed __read_mostly;
 EXPORT_SYMBOL(rps_needed);
 struct static_key_false rfs_needed __read_mostly;
 EXPORT_SYMBOL(rfs_needed);
+struct static_key_false rps_feat_llc_affinity __read_mostly;
+EXPORT_SYMBOL(rps_feat_llc_affinity);
 
 static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr)
 {
@@ -5175,6 +5177,76 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	return cpu;
 }
 
+/**
+ * rps_record_cond - Determine if RPS flow table should be updated
+ * @old_val: Previous flow record value
+ * @new_val: Target flow record value
+ *
+ * Returns true if the record needs an update.
+ */
+static inline bool rps_record_cond(u32 old_val, u32 new_val)
+{
+	u32 old_cpu = old_val & ~net_hotdata.rps_cpu_mask;
+	u32 new_cpu = new_val & ~net_hotdata.rps_cpu_mask;
+
+	if (old_val == new_val)
+		return false;
+
+	/*
+	 * RPS LLC Affinity Feature:
+	 * Reduce RFS/ARFS flow updates by checking LLC affinity.
+	 *
+	 * Frequent flow table updates can trigger constant hardware steering
+	 * reconfigurations (e.g., ndo_rx_flow_steer), leading to significant
+	 * contention on driver internal locks (like mlx5's arfs_lock).
+	 *
+	 * This strategy only updates the flow record if it migrates across LLC
+	 * boundaries. This minimizes expensive hardware updates while preserving
+	 * cache locality for the application.
+	 */
+	if (static_branch_unlikely(&rps_feat_llc_affinity)) {
+		/* Force update if the recorded CPU is invalid or has gone offline */
+		if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
+			return true;
+
+		/*
+		 * Force an update if the current task is no longer permitted
+		 * to run on the old_cpu.
+		 */
+		if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
+			return true;
+
+		/*
+		 * If CPUs do not share a cache, allow the update to prevent
+		 * expensive remote memory accesses and cache misses.
+		 */
+		if (!cpus_share_cache(old_cpu, new_cpu))
+			return true;
+
+		return false;
+	}
+
+	return true;
+}
+
+void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
+{
+	unsigned int index = hash & rps_tag_to_mask(tag_ptr);
+	u32 val = hash & ~net_hotdata.rps_cpu_mask;
+	struct rps_sock_flow_table *table;
+
+	/* We only give a hint, preemption can change CPU under us */
+	val |= raw_smp_processor_id();
+
+	table = rps_tag_to_table(tag_ptr);
+	/* The following WRITE_ONCE() is paired with the READ_ONCE()
+	 * here, and another one in get_rps_cpu().
+	 */
+	if (rps_record_cond(READ_ONCE(table[index].ent), val))
+		WRITE_ONCE(table[index].ent, val);
+}
+EXPORT_SYMBOL(rps_record_sock_flow);
+
 #ifdef CONFIG_RFS_ACCEL
 
 /**
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 502705e04649..dbc99aea7bb0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -210,6 +210,32 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
 	kvfree_rcu_mightsleep(tofree);
 	return ret;
 }
+
+static int rps_feat_llc_affinity_sysctl(const struct ctl_table *table, int write,
+					void *buffer, size_t *lenp, loff_t *ppos)
+{
+	u8 curr_state;
+	int ret;
+	const struct ctl_table tmp = {
+		.data = &curr_state,
+		.maxlen = sizeof(curr_state),
+		.mode = table->mode,
+		.extra1 = table->extra1,
+		.extra2 = table->extra2
+	};
+
+	curr_state = static_branch_unlikely(&rps_feat_llc_affinity) ? 1 : 0;
+
+	ret = proc_dou8vec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && ret == 0) {
+		if (curr_state && !static_branch_unlikely(&rps_feat_llc_affinity))
+			static_branch_enable(&rps_feat_llc_affinity);
+		else if (!curr_state && static_branch_unlikely(&rps_feat_llc_affinity))
+			static_branch_disable(&rps_feat_llc_affinity);
+	}
+
+	return ret;
+}
 #endif /* CONFIG_RPS */
 
 #ifdef CONFIG_NET_FLOW_LIMIT
@@ -531,6 +557,14 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= rps_sock_flow_sysctl
 	},
+	{
+		.procname	= "rps_feat_llc_affinity",
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler   = rps_feat_llc_affinity_sysctl,
+		.extra1     = SYSCTL_ZERO,
+		.extra2     = SYSCTL_ONE
+	},
 #endif
 #ifdef CONFIG_NET_FLOW_LIMIT
 	{
-- 
2.47.3

Re: [PATCH net-next v3] net: reduce RFS/ARFS flow updates by checking LLC affinity

Posted by Eric Dumazet 1 month, 2 weeks ago

On Mon, Apr 27, 2026 at 7:56 PM Chuang Wang <nashuiliang@gmail.com> wrote:
>
> The current implementation of rps_record_sock_flow() updates the flow
> table every time a socket is processed on a different CPU. In high-load
> scenarios, especially with Accelerated RFS (ARFS), this triggers
> frequent flow steering updates via ndo_rx_flow_steer.
>
> For drivers like mlx5 that implement hardware flow steering, these
> constant updates lead to significant contention on internal driver locks
> (e.g., arfs_lock). This contention often becomes a performance
> bottleneck that outweighs the steering benefits.
>
> This patch introduces a cache-aware update strategy: the flow record is
> only updated if the flow migrates across Last Level Cache (LLC)
> boundaries. This minimizes expensive hardware reconfigurations while
> preserving cache locality for the application. A new sysctl,
> net.core.rps_feat_llc_affinity, is added to toggle this feature.
>
> Performance Test Results:
> The patch was tested in a K8s environment (AMD CPU 128*2, 16-core Pod
> with CPU pinning, mlx5 NIC) using brpc[1] echo_server and rpc_press.
>
> rpc_press Commands:
>
>   for i in {1..8}; do
>     ./rpc_press -proto=./echo.proto -method=example.EchoService.Echo
>     -server=<IP>:8000 -input='{"message":"hello"}'
>     -qps=0 -thread_num=512 -connection_type=pooled &
>   done
>
> Monitor mlx5e_rx_flow_steer frequency:
>
>   /usr/share/bcc/tools/funccount -i 1 mlx5e_rx_flow_steer
>
> Frequency of mlx5e_rx_flow_steer (via funccount[2]):
>
>   Before: ~335,000 counts/sec
>   After:   ~23,000 counts/sec (reduced by ~93%)
>
> System Metrics (after enabling rps_feat_llc_affinity):
>
>   CPU Utilization: 38% -> 32%
>   CPU PSI (Pressure Stall Information): 20% -> 10%
>
> These results demonstrate that filtering updates by LLC affinity
> significantly reduces driver lock contention and improves overall
> CPU efficiency under heavy network load.
>
> [1] https://github.com/apache/brpc/
> [2] https://github.com/iovisor/bcc/blob/master/tools/funccount.py
>
> Signed-off-by: Chuang Wang <nashuiliang@gmail.com>
> ---
> v2 -> v3: patch net -> net-next
> v1 -> v2: add rps_feat_llc_affinity; add brpc tests
>
>  include/net/rps.h          | 18 ++--------
>  net/core/dev.c             | 72 ++++++++++++++++++++++++++++++++++++++
>  net/core/sysctl_net_core.c | 34 ++++++++++++++++++
>  3 files changed, 108 insertions(+), 16 deletions(-)
>
> diff --git a/include/net/rps.h b/include/net/rps.h
> index e33c6a2fa8bb..37bbb7009c36 100644
> --- a/include/net/rps.h
> +++ b/include/net/rps.h
> @@ -12,6 +12,7 @@
>
>  extern struct static_key_false rps_needed;
>  extern struct static_key_false rfs_needed;
> +extern struct static_key_false rps_feat_llc_affinity;
>
>  /*
>   * This structure holds an RPS map which can be of variable length.  The
> @@ -55,22 +56,7 @@ struct rps_sock_flow_table {
>
>  #define RPS_NO_CPU 0xffff
>
> -static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> -{
> -       unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> -       u32 val = hash & ~net_hotdata.rps_cpu_mask;
> -       struct rps_sock_flow_table *table;
> -
> -       /* We only give a hint, preemption can change CPU under us */
> -       val |= raw_smp_processor_id();
> -
> -       table = rps_tag_to_table(tag_ptr);
> -       /* The following WRITE_ONCE() is paired with the READ_ONCE()
> -        * here, and another one in get_rps_cpu().
> -        */
> -       if (READ_ONCE(table[index].ent) != val)
> -               WRITE_ONCE(table[index].ent, val);
> -}
> +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash);
>
>  static inline void _sock_rps_record_flow_hash(__u32 hash)
>  {
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 203dc36aaed5..630a7f21d8de 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4964,6 +4964,8 @@ struct static_key_false rps_needed __read_mostly;
>  EXPORT_SYMBOL(rps_needed);
>  struct static_key_false rfs_needed __read_mostly;
>  EXPORT_SYMBOL(rfs_needed);
> +struct static_key_false rps_feat_llc_affinity __read_mostly;
> +EXPORT_SYMBOL(rps_feat_llc_affinity);
>
>  static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr)
>  {
> @@ -5175,6 +5177,76 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>         return cpu;
>  }
>
> +/**
> + * rps_record_cond - Determine if RPS flow table should be updated
> + * @old_val: Previous flow record value
> + * @new_val: Target flow record value
> + *
> + * Returns true if the record needs an update.
> + */
> +static inline bool rps_record_cond(u32 old_val, u32 new_val)
> +{
> +       u32 old_cpu = old_val & ~net_hotdata.rps_cpu_mask;
> +       u32 new_cpu = new_val & ~net_hotdata.rps_cpu_mask;
> +
> +       if (old_val == new_val)
> +               return false;
> +
> +       /*
> +        * RPS LLC Affinity Feature:
> +        * Reduce RFS/ARFS flow updates by checking LLC affinity.
> +        *
> +        * Frequent flow table updates can trigger constant hardware steering
> +        * reconfigurations (e.g., ndo_rx_flow_steer), leading to significant
> +        * contention on driver internal locks (like mlx5's arfs_lock).
> +        *
> +        * This strategy only updates the flow record if it migrates across LLC
> +        * boundaries. This minimizes expensive hardware updates while preserving
> +        * cache locality for the application.
> +        */
> +       if (static_branch_unlikely(&rps_feat_llc_affinity)) {
> +               /* Force update if the recorded CPU is invalid or has gone offline */
> +               if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
> +                       return true;
> +
> +               /*
> +                * Force an update if the current task is no longer permitted
> +                * to run on the old_cpu.
> +                */
> +               if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
> +                       return true;
> +
> +               /*
> +                * If CPUs do not share a cache, allow the update to prevent
> +                * expensive remote memory accesses and cache misses.
> +                */
> +               if (!cpus_share_cache(old_cpu, new_cpu))
> +                       return true;
> +
> +               return false;
> +       }
> +
> +       return true;
> +}
> +
> +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> +{
> +       unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> +       u32 val = hash & ~net_hotdata.rps_cpu_mask;
> +       struct rps_sock_flow_table *table;
> +
> +       /* We only give a hint, preemption can change CPU under us */
> +       val |= raw_smp_processor_id();
> +
> +       table = rps_tag_to_table(tag_ptr);
> +       /* The following WRITE_ONCE() is paired with the READ_ONCE()
> +        * here, and another one in get_rps_cpu().
> +        */
> +       if (rps_record_cond(READ_ONCE(table[index].ent), val))
> +               WRITE_ONCE(table[index].ent, val);
> +}
> +EXPORT_SYMBOL(rps_record_sock_flow);

We do not want to put rps_record_sock_flow out of line.
rps_llc_check() is probably fine, it should not be called often.


diff --git a/include/net/rps.h b/include/net/rps.h
index e33c6a2fa8bbca3555ecccbbf9132d01cc433c36..7e98918d8751eb824b7057cca9e5d40c28e5f18a
100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -55,10 +55,12 @@ struct rps_sock_flow_table {

 #define RPS_NO_CPU 0xffff

+bool rps_llc_check(u32 old_val, u32 new_val);
+
 static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
 {
        unsigned int index = hash & rps_tag_to_mask(tag_ptr);
-       u32 val = hash & ~net_hotdata.rps_cpu_mask;
+       u32 old_val, val = hash & ~net_hotdata.rps_cpu_mask;
        struct rps_sock_flow_table *table;

        /* We only give a hint, preemption can change CPU under us */
@@ -68,7 +70,8 @@ static inline void rps_record_sock_flow(rps_tag_ptr
tag_ptr, u32 hash)
        /* The following WRITE_ONCE() is paired with the READ_ONCE()
         * here, and another one in get_rps_cpu().
         */
-       if (READ_ONCE(table[index].ent) != val)
+       old_val = READ_ONCE(table[index].ent);
+       if (old_val != val && rps_llc_check(old_val, val))
                WRITE_ONCE(table[index].ent, val);
 }

Re: [PATCH net-next v3] net: reduce RFS/ARFS flow updates by checking LLC affinity

Posted by chuang 1 month, 1 week ago

On Tue, Apr 28, 2026 at 1:09 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Apr 27, 2026 at 7:56 PM Chuang Wang <nashuiliang@gmail.com> wrote:
> >
> > The current implementation of rps_record_sock_flow() updates the flow
> > table every time a socket is processed on a different CPU. In high-load
> > scenarios, especially with Accelerated RFS (ARFS), this triggers
> > frequent flow steering updates via ndo_rx_flow_steer.
> >
> > For drivers like mlx5 that implement hardware flow steering, these
> > constant updates lead to significant contention on internal driver locks
> > (e.g., arfs_lock). This contention often becomes a performance
> > bottleneck that outweighs the steering benefits.
> >
> > This patch introduces a cache-aware update strategy: the flow record is
> > only updated if the flow migrates across Last Level Cache (LLC)
> > boundaries. This minimizes expensive hardware reconfigurations while
> > preserving cache locality for the application. A new sysctl,
> > net.core.rps_feat_llc_affinity, is added to toggle this feature.
> >
> > Performance Test Results:
> > The patch was tested in a K8s environment (AMD CPU 128*2, 16-core Pod
> > with CPU pinning, mlx5 NIC) using brpc[1] echo_server and rpc_press.
> >
> > rpc_press Commands:
> >
> >   for i in {1..8}; do
> >     ./rpc_press -proto=./echo.proto -method=example.EchoService.Echo
> >     -server=<IP>:8000 -input='{"message":"hello"}'
> >     -qps=0 -thread_num=512 -connection_type=pooled &
> >   done
> >
> > Monitor mlx5e_rx_flow_steer frequency:
> >
> >   /usr/share/bcc/tools/funccount -i 1 mlx5e_rx_flow_steer
> >
> > Frequency of mlx5e_rx_flow_steer (via funccount[2]):
> >
> >   Before: ~335,000 counts/sec
> >   After:   ~23,000 counts/sec (reduced by ~93%)
> >
> > System Metrics (after enabling rps_feat_llc_affinity):
> >
> >   CPU Utilization: 38% -> 32%
> >   CPU PSI (Pressure Stall Information): 20% -> 10%
> >
> > These results demonstrate that filtering updates by LLC affinity
> > significantly reduces driver lock contention and improves overall
> > CPU efficiency under heavy network load.
> >
> > [1] https://github.com/apache/brpc/
> > [2] https://github.com/iovisor/bcc/blob/master/tools/funccount.py
> >
> > Signed-off-by: Chuang Wang <nashuiliang@gmail.com>
> > ---
> > v2 -> v3: patch net -> net-next
> > v1 -> v2: add rps_feat_llc_affinity; add brpc tests
> >
> >  include/net/rps.h          | 18 ++--------
> >  net/core/dev.c             | 72 ++++++++++++++++++++++++++++++++++++++
> >  net/core/sysctl_net_core.c | 34 ++++++++++++++++++
> >  3 files changed, 108 insertions(+), 16 deletions(-)
> >
> > diff --git a/include/net/rps.h b/include/net/rps.h
> > index e33c6a2fa8bb..37bbb7009c36 100644
> > --- a/include/net/rps.h
> > +++ b/include/net/rps.h
> > @@ -12,6 +12,7 @@
> >
> >  extern struct static_key_false rps_needed;
> >  extern struct static_key_false rfs_needed;
> > +extern struct static_key_false rps_feat_llc_affinity;
> >
> >  /*
> >   * This structure holds an RPS map which can be of variable length.  The
> > @@ -55,22 +56,7 @@ struct rps_sock_flow_table {
> >
> >  #define RPS_NO_CPU 0xffff
> >
> > -static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> > -{
> > -       unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> > -       u32 val = hash & ~net_hotdata.rps_cpu_mask;
> > -       struct rps_sock_flow_table *table;
> > -
> > -       /* We only give a hint, preemption can change CPU under us */
> > -       val |= raw_smp_processor_id();
> > -
> > -       table = rps_tag_to_table(tag_ptr);
> > -       /* The following WRITE_ONCE() is paired with the READ_ONCE()
> > -        * here, and another one in get_rps_cpu().
> > -        */
> > -       if (READ_ONCE(table[index].ent) != val)
> > -               WRITE_ONCE(table[index].ent, val);
> > -}
> > +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash);
> >
> >  static inline void _sock_rps_record_flow_hash(__u32 hash)
> >  {
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 203dc36aaed5..630a7f21d8de 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -4964,6 +4964,8 @@ struct static_key_false rps_needed __read_mostly;
> >  EXPORT_SYMBOL(rps_needed);
> >  struct static_key_false rfs_needed __read_mostly;
> >  EXPORT_SYMBOL(rfs_needed);
> > +struct static_key_false rps_feat_llc_affinity __read_mostly;
> > +EXPORT_SYMBOL(rps_feat_llc_affinity);
> >
> >  static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr)
> >  {
> > @@ -5175,6 +5177,76 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> >         return cpu;
> >  }
> >
> > +/**
> > + * rps_record_cond - Determine if RPS flow table should be updated
> > + * @old_val: Previous flow record value
> > + * @new_val: Target flow record value
> > + *
> > + * Returns true if the record needs an update.
> > + */
> > +static inline bool rps_record_cond(u32 old_val, u32 new_val)
> > +{
> > +       u32 old_cpu = old_val & ~net_hotdata.rps_cpu_mask;
> > +       u32 new_cpu = new_val & ~net_hotdata.rps_cpu_mask;
> > +
> > +       if (old_val == new_val)
> > +               return false;
> > +
> > +       /*
> > +        * RPS LLC Affinity Feature:
> > +        * Reduce RFS/ARFS flow updates by checking LLC affinity.
> > +        *
> > +        * Frequent flow table updates can trigger constant hardware steering
> > +        * reconfigurations (e.g., ndo_rx_flow_steer), leading to significant
> > +        * contention on driver internal locks (like mlx5's arfs_lock).
> > +        *
> > +        * This strategy only updates the flow record if it migrates across LLC
> > +        * boundaries. This minimizes expensive hardware updates while preserving
> > +        * cache locality for the application.
> > +        */
> > +       if (static_branch_unlikely(&rps_feat_llc_affinity)) {
> > +               /* Force update if the recorded CPU is invalid or has gone offline */
> > +               if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
> > +                       return true;
> > +
> > +               /*
> > +                * Force an update if the current task is no longer permitted
> > +                * to run on the old_cpu.
> > +                */
> > +               if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
> > +                       return true;
> > +
> > +               /*
> > +                * If CPUs do not share a cache, allow the update to prevent
> > +                * expensive remote memory accesses and cache misses.
> > +                */
> > +               if (!cpus_share_cache(old_cpu, new_cpu))
> > +                       return true;
> > +
> > +               return false;
> > +       }
> > +
> > +       return true;
> > +}
> > +
> > +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> > +{
> > +       unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> > +       u32 val = hash & ~net_hotdata.rps_cpu_mask;
> > +       struct rps_sock_flow_table *table;
> > +
> > +       /* We only give a hint, preemption can change CPU under us */
> > +       val |= raw_smp_processor_id();
> > +
> > +       table = rps_tag_to_table(tag_ptr);
> > +       /* The following WRITE_ONCE() is paired with the READ_ONCE()
> > +        * here, and another one in get_rps_cpu().
> > +        */
> > +       if (rps_record_cond(READ_ONCE(table[index].ent), val))
> > +               WRITE_ONCE(table[index].ent, val);
> > +}
> > +EXPORT_SYMBOL(rps_record_sock_flow);
>
> We do not want to put rps_record_sock_flow out of line.
> rps_llc_check() is probably fine, it should not be called often.
>

The same issue reported in:
https://lore.kernel.org/netdev/CACueBy4KyU8DjwtLM6pzjQNTbiy2M+ZhZdO7Ag=ssqWq00CJ7w@mail.gmail.com/
The reason is that 'tun' uses sock_rps_record_flow_hash() in
tun_flow_update(), which triggers the following compilation error due
to symbol visibility when CONFIG_TUN is built as a module:

ERROR: modpost: "rps_llc_check" [drivers/net/tun.ko] undefined!
make[2]: *** [scripts/Makefile.modpost:147: Module.symvers] Error 1

To resolve this, it seems more appropriate to export
sock_rps_record_flow_hash in net/core/dev.c:

+void sock_rps_record_flow_hash(__u32 hash)
+{
+#ifdef CONFIG_RPS
+       if (!rfs_is_needed())
+               return;
+
+       _sock_rps_record_flow_hash(hash);
+#endif
+}
+EXPORT_SYMBOL(sock_rps_record_flow_hash);
+

> diff --git a/include/net/rps.h b/include/net/rps.h
> index e33c6a2fa8bbca3555ecccbbf9132d01cc433c36..7e98918d8751eb824b7057cca9e5d40c28e5f18a
> 100644
> --- a/include/net/rps.h
> +++ b/include/net/rps.h
> @@ -55,10 +55,12 @@ struct rps_sock_flow_table {
>
>  #define RPS_NO_CPU 0xffff
>
> +bool rps_llc_check(u32 old_val, u32 new_val);
> +
>  static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
>  {
>         unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> -       u32 val = hash & ~net_hotdata.rps_cpu_mask;
> +       u32 old_val, val = hash & ~net_hotdata.rps_cpu_mask;
>         struct rps_sock_flow_table *table;
>
>         /* We only give a hint, preemption can change CPU under us */
> @@ -68,7 +70,8 @@ static inline void rps_record_sock_flow(rps_tag_ptr
> tag_ptr, u32 hash)
>         /* The following WRITE_ONCE() is paired with the READ_ONCE()
>          * here, and another one in get_rps_cpu().
>          */
> -       if (READ_ONCE(table[index].ent) != val)
> +       old_val = READ_ONCE(table[index].ent);
> +       if (old_val != val && rps_llc_check(old_val, val))
>                 WRITE_ONCE(table[index].ent, val);
>  }