include/net/rps.h | 17 +-------------- net/core/dev.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 16 deletions(-)
The current implementation of rps_record_sock_flow() updates the flow
table every time a socket is processed on a different CPU. In high-load
scenarios, especially with Accelerated RFS (ARFS), this triggers
frequent flow steering updates via ndo_rx_flow_steer.
For drivers like mlx5 that implement hardware flow steering, these
constant updates lead to significant contention on internal driver locks
(e.g., arfs_lock). This contention often becomes a performance
bottleneck that outweighs the steering benefits.
This patch introduces a cache-aware update strategy: the flow record is
only updated if the flow migrates across Last Level Cache (LLC)
boundaries. This minimizes expensive hardware reconfigurations while
preserving cache locality for the application.
Signed-off-by: Chuang Wang <nashuiliang@gmail.com>
---
include/net/rps.h | 17 +--------------
net/core/dev.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 55 insertions(+), 16 deletions(-)
diff --git a/include/net/rps.h b/include/net/rps.h
index e33c6a2fa8bb..2cd8698a79d5 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -55,22 +55,7 @@ struct rps_sock_flow_table {
#define RPS_NO_CPU 0xffff
-static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
-{
- unsigned int index = hash & rps_tag_to_mask(tag_ptr);
- u32 val = hash & ~net_hotdata.rps_cpu_mask;
- struct rps_sock_flow_table *table;
-
- /* We only give a hint, preemption can change CPU under us */
- val |= raw_smp_processor_id();
-
- table = rps_tag_to_table(tag_ptr);
- /* The following WRITE_ONCE() is paired with the READ_ONCE()
- * here, and another one in get_rps_cpu().
- */
- if (READ_ONCE(table[index].ent) != val)
- WRITE_ONCE(table[index].ent, val);
-}
+void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash);
static inline void _sock_rps_record_flow_hash(__u32 hash)
{
diff --git a/net/core/dev.c b/net/core/dev.c
index 203dc36aaed5..770cfb6fe06b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5175,6 +5175,60 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
return cpu;
}
+/**
+ * rps_record_cond - Determine if RPS flow table should be updated
+ * @old_val: Previous flow record value
+ * @new_val: Target flow record value
+ *
+ * Returns true if the record needs an update.
+ */
+static inline bool rps_record_cond(u32 old_val, u32 new_val)
+{
+ u32 old_cpu = old_val & ~net_hotdata.rps_cpu_mask;
+ u32 new_cpu = new_val & ~net_hotdata.rps_cpu_mask;
+
+ if (old_val == new_val)
+ return false;
+
+ /* Force update if the recorded CPU is invalid or has gone offline */
+ if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
+ return true;
+
+ /*
+ * Force an update if the current task is no longer permitted
+ * to run on the old_cpu.
+ */
+ if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
+ return true;
+
+ /*
+ * If CPUs do not share a cache, allow the update to prevent
+ * expensive remote memory accesses and cache misses.
+ */
+ if (!cpus_share_cache(old_cpu, new_cpu))
+ return true;
+
+ return false;
+}
+
+void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
+{
+ unsigned int index = hash & rps_tag_to_mask(tag_ptr);
+ u32 val = hash & ~net_hotdata.rps_cpu_mask;
+ struct rps_sock_flow_table *table;
+
+ /* We only give a hint, preemption can change CPU under us */
+ val |= raw_smp_processor_id();
+
+ table = rps_tag_to_table(tag_ptr);
+ /* The following WRITE_ONCE() is paired with the READ_ONCE()
+ * here, and another one in get_rps_cpu().
+ */
+ if (rps_record_cond(READ_ONCE(table[index].ent), val))
+ WRITE_ONCE(table[index].ent, val);
+}
+EXPORT_SYMBOL(rps_record_sock_flow);
+
#ifdef CONFIG_RFS_ACCEL
/**
--
2.47.3
On Sun, Mar 8, 2026 at 8:10 AM Chuang Wang <nashuiliang@gmail.com> wrote:
>
> The current implementation of rps_record_sock_flow() updates the flow
> table every time a socket is processed on a different CPU. In high-load
> scenarios, especially with Accelerated RFS (ARFS), this triggers
> frequent flow steering updates via ndo_rx_flow_steer.
>
> For drivers like mlx5 that implement hardware flow steering, these
> constant updates lead to significant contention on internal driver locks
> (e.g., arfs_lock). This contention often becomes a performance
> bottleneck that outweighs the steering benefits.
>
> This patch introduces a cache-aware update strategy: the flow record is
> only updated if the flow migrates across Last Level Cache (LLC)
> boundaries. This minimizes expensive hardware reconfigurations while
> preserving cache locality for the application.
>
> Signed-off-by: Chuang Wang <nashuiliang@gmail.com>
> ---
> include/net/rps.h | 17 +--------------
> net/core/dev.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 55 insertions(+), 16 deletions(-)
>
> diff --git a/include/net/rps.h b/include/net/rps.h
> index e33c6a2fa8bb..2cd8698a79d5 100644
> --- a/include/net/rps.h
> +++ b/include/net/rps.h
> @@ -55,22 +55,7 @@ struct rps_sock_flow_table {
>
> #define RPS_NO_CPU 0xffff
>
> -static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> -{
> - unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> - u32 val = hash & ~net_hotdata.rps_cpu_mask;
> - struct rps_sock_flow_table *table;
> -
> - /* We only give a hint, preemption can change CPU under us */
> - val |= raw_smp_processor_id();
> -
> - table = rps_tag_to_table(tag_ptr);
> - /* The following WRITE_ONCE() is paired with the READ_ONCE()
> - * here, and another one in get_rps_cpu().
> - */
> - if (READ_ONCE(table[index].ent) != val)
> - WRITE_ONCE(table[index].ent, val);
> -}
> +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash);
>
> static inline void _sock_rps_record_flow_hash(__u32 hash)
> {
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 203dc36aaed5..770cfb6fe06b 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -5175,6 +5175,60 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> return cpu;
> }
>
> +/**
> + * rps_record_cond - Determine if RPS flow table should be updated
> + * @old_val: Previous flow record value
> + * @new_val: Target flow record value
> + *
> + * Returns true if the record needs an update.
> + */
> +static inline bool rps_record_cond(u32 old_val, u32 new_val)
> +{
> + u32 old_cpu = old_val & ~net_hotdata.rps_cpu_mask;
> + u32 new_cpu = new_val & ~net_hotdata.rps_cpu_mask;
> +
> + if (old_val == new_val)
> + return false;
> +
> + /* Force update if the recorded CPU is invalid or has gone offline */
> + if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
> + return true;
> +
> + /*
> + * Force an update if the current task is no longer permitted
> + * to run on the old_cpu.
> + */
> + if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
> + return true;
> +
> + /*
> + * If CPUs do not share a cache, allow the update to prevent
> + * expensive remote memory accesses and cache misses.
> + */
> + if (!cpus_share_cache(old_cpu, new_cpu))
> + return true;
> +
> + return false;
> +}
> +
> +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> +{
> + unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> + u32 val = hash & ~net_hotdata.rps_cpu_mask;
> + struct rps_sock_flow_table *table;
> +
> + /* We only give a hint, preemption can change CPU under us */
> + val |= raw_smp_processor_id();
> +
> + table = rps_tag_to_table(tag_ptr);
> + /* The following WRITE_ONCE() is paired with the READ_ONCE()
> + * here, and another one in get_rps_cpu().
> + */
> + if (rps_record_cond(READ_ONCE(table[index].ent), val))
> + WRITE_ONCE(table[index].ent, val);
> +}
> +EXPORT_SYMBOL(rps_record_sock_flow);
> +
> #ifdef CONFIG_RFS_ACCEL
>
> /**
> --
> 2.47.3
>
Interesting idea but:
1) Some of us do not use CONFIG_RFS_ACCEL yet.
2) You put a very fast path function out-of-line, why ?
3) I think the behavior should be selectable with a static key or
something like that.
4) Please provide benchmark results.
Thanks.
Hi,
On Sun, Mar 8, 2026 at 3:19 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Sun, Mar 8, 2026 at 8:10 AM Chuang Wang <nashuiliang@gmail.com> wrote:
> >
> > The current implementation of rps_record_sock_flow() updates the flow
> > table every time a socket is processed on a different CPU. In high-load
> > scenarios, especially with Accelerated RFS (ARFS), this triggers
> > frequent flow steering updates via ndo_rx_flow_steer.
> >
> > For drivers like mlx5 that implement hardware flow steering, these
> > constant updates lead to significant contention on internal driver locks
> > (e.g., arfs_lock). This contention often becomes a performance
> > bottleneck that outweighs the steering benefits.
> >
> > This patch introduces a cache-aware update strategy: the flow record is
> > only updated if the flow migrates across Last Level Cache (LLC)
> > boundaries. This minimizes expensive hardware reconfigurations while
> > preserving cache locality for the application.
> >
> > Signed-off-by: Chuang Wang <nashuiliang@gmail.com>
> > ---
> > include/net/rps.h | 17 +--------------
> > net/core/dev.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 55 insertions(+), 16 deletions(-)
> >
> > diff --git a/include/net/rps.h b/include/net/rps.h
> > index e33c6a2fa8bb..2cd8698a79d5 100644
> > --- a/include/net/rps.h
> > +++ b/include/net/rps.h
> > @@ -55,22 +55,7 @@ struct rps_sock_flow_table {
> >
> > #define RPS_NO_CPU 0xffff
> >
> > -static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> > -{
> > - unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> > - u32 val = hash & ~net_hotdata.rps_cpu_mask;
> > - struct rps_sock_flow_table *table;
> > -
> > - /* We only give a hint, preemption can change CPU under us */
> > - val |= raw_smp_processor_id();
> > -
> > - table = rps_tag_to_table(tag_ptr);
> > - /* The following WRITE_ONCE() is paired with the READ_ONCE()
> > - * here, and another one in get_rps_cpu().
> > - */
> > - if (READ_ONCE(table[index].ent) != val)
> > - WRITE_ONCE(table[index].ent, val);
> > -}
> > +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash);
> >
> > static inline void _sock_rps_record_flow_hash(__u32 hash)
> > {
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 203dc36aaed5..770cfb6fe06b 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -5175,6 +5175,60 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> > return cpu;
> > }
> >
> > +/**
> > + * rps_record_cond - Determine if RPS flow table should be updated
> > + * @old_val: Previous flow record value
> > + * @new_val: Target flow record value
> > + *
> > + * Returns true if the record needs an update.
> > + */
> > +static inline bool rps_record_cond(u32 old_val, u32 new_val)
> > +{
> > + u32 old_cpu = old_val & ~net_hotdata.rps_cpu_mask;
> > + u32 new_cpu = new_val & ~net_hotdata.rps_cpu_mask;
> > +
> > + if (old_val == new_val)
> > + return false;
> > +
> > + /* Force update if the recorded CPU is invalid or has gone offline */
> > + if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
> > + return true;
> > +
> > + /*
> > + * Force an update if the current task is no longer permitted
> > + * to run on the old_cpu.
> > + */
> > + if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
> > + return true;
> > +
> > + /*
> > + * If CPUs do not share a cache, allow the update to prevent
> > + * expensive remote memory accesses and cache misses.
> > + */
> > + if (!cpus_share_cache(old_cpu, new_cpu))
> > + return true;
> > +
> > + return false;
> > +}
> > +
> > +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> > +{
> > + unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> > + u32 val = hash & ~net_hotdata.rps_cpu_mask;
> > + struct rps_sock_flow_table *table;
> > +
> > + /* We only give a hint, preemption can change CPU under us */
> > + val |= raw_smp_processor_id();
> > +
> > + table = rps_tag_to_table(tag_ptr);
> > + /* The following WRITE_ONCE() is paired with the READ_ONCE()
> > + * here, and another one in get_rps_cpu().
> > + */
> > + if (rps_record_cond(READ_ONCE(table[index].ent), val))
> > + WRITE_ONCE(table[index].ent, val);
> > +}
> > +EXPORT_SYMBOL(rps_record_sock_flow);
> > +
> > #ifdef CONFIG_RFS_ACCEL
> >
> > /**
> > --
> > 2.47.3
> >
>
> Interesting idea but:
>
> 1) Some of us do not use CONFIG_RFS_ACCEL yet.
Points 1 and 3 are noted. If this approach is acceptable, I can
implement it as a selectable option (e.g., using a static key) to
toggle the LLC-aware strategy.
> 2) You put a very fast path function out-of-line, why ?
The reason is that tun uses sock_rps_record_flow_hash(). When I moved
all rps_record_sock_flow and rps_record_cond modifications into
include/net/rps.h, it triggered the following compilation errors due
to symbol visibility:
ERROR: modpost: "cpus_share_cache" [drivers/net/tun.ko] undefined!
ERROR: modpost: "cpus_share_cache" [net/sctp/sctp.ko] undefined!
make[2]: *** [scripts/Makefile.modpost:147: Module.symvers] Error 1
This arises because the patch uses cpus_share_cache() to limit the
RFS/ARFS update frequency at the LLC level. To keep this in the fast
path, I could move cpus_share_cache() to
include/linux/sched/topology.h.
> 3) I think the behavior should be selectable with a static key or
> something like that.
Please refer to my response to point 1.
>
> 4) Please provide benchmark results.
The scenario is similar to the one described in "[RFC] problems with
RFS on bRPC applications"[1].
I attempted to enable ARFS on a Mellanox CX-6 NIC. While it performs
well for simple workloads, performance degrades significantly when
running a bRPC[2] workload on a 2-node NUMA machine. After tracing, I
identified patterns that ARFS/RFS fails to handle efficiently:
- Multiple threads use epoll to read from the same socket, causing
frequent flow updates in sock_flow_table.
- Threads reading from the socket migrate frequently between CPUs.
I tested a PoC version using a bRPC service, utilizing funccount [3]
to monitor execution frequency and perf top to observe hotspots:
Before Patch
The mlx5e_rx_flow_steer frequency is over 380k/s, and queued_spin_lock
is a major hotspot (6.30% in perf top). The application also suffers
from a noticeable drop.
FUNC COUNT
mlx5e_rx_flow_steer 387594
FUNC COUNT
mlx5e_rx_flow_steer 390142
FUNC COUNT
mlx5e_rx_flow_steer 386694
FUNC COUNT
mlx5e_rx_flow_steer 389094
# perf top hotspot:
queued_spin_lock 6.30%
After Patch
The ARFS update frequency is significantly reduced. queued_spin_lock
is no longer a hotspot in perf top, and the application's overall
performance has improved.
FUNC COUNT
mlx5e_rx_flow_steer 43
FUNC COUNT
mlx5e_rx_flow_steer 9
FUNC COUNT
mlx5e_rx_flow_steer 207
FUNC COUNT
mlx5e_rx_flow_steer 26
1: https://lore.kernel.org/netdev/CAHCEFEwToeQe_Ey8e=sf8fOmoobvrDCPsxw+hfUSoRawPX03+Q@mail.gmail.com/t/#u
2: https://github.com/apache/brpc
3: https://github.com/iovisor/bcc/blob/master/tools/funccount.py
>
> Thanks.
Hi,
Any thoughts or suggestions on the patch "reduce RFS/ARFS flow updates
by checking LLC affinity"? It’s been a while since the last update.
On Sun, Mar 8, 2026 at 4:20 PM chuang <nashuiliang@gmail.com> wrote:
>
> Hi,
>
> On Sun, Mar 8, 2026 at 3:19 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Sun, Mar 8, 2026 at 8:10 AM Chuang Wang <nashuiliang@gmail.com> wrote:
> > >
> > > The current implementation of rps_record_sock_flow() updates the flow
> > > table every time a socket is processed on a different CPU. In high-load
> > > scenarios, especially with Accelerated RFS (ARFS), this triggers
> > > frequent flow steering updates via ndo_rx_flow_steer.
> > >
> > > For drivers like mlx5 that implement hardware flow steering, these
> > > constant updates lead to significant contention on internal driver locks
> > > (e.g., arfs_lock). This contention often becomes a performance
> > > bottleneck that outweighs the steering benefits.
> > >
> > > This patch introduces a cache-aware update strategy: the flow record is
> > > only updated if the flow migrates across Last Level Cache (LLC)
> > > boundaries. This minimizes expensive hardware reconfigurations while
> > > preserving cache locality for the application.
> > >
> > > Signed-off-by: Chuang Wang <nashuiliang@gmail.com>
> > > ---
> > > include/net/rps.h | 17 +--------------
> > > net/core/dev.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++
> > > 2 files changed, 55 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/include/net/rps.h b/include/net/rps.h
> > > index e33c6a2fa8bb..2cd8698a79d5 100644
> > > --- a/include/net/rps.h
> > > +++ b/include/net/rps.h
> > > @@ -55,22 +55,7 @@ struct rps_sock_flow_table {
> > >
> > > #define RPS_NO_CPU 0xffff
> > >
> > > -static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> > > -{
> > > - unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> > > - u32 val = hash & ~net_hotdata.rps_cpu_mask;
> > > - struct rps_sock_flow_table *table;
> > > -
> > > - /* We only give a hint, preemption can change CPU under us */
> > > - val |= raw_smp_processor_id();
> > > -
> > > - table = rps_tag_to_table(tag_ptr);
> > > - /* The following WRITE_ONCE() is paired with the READ_ONCE()
> > > - * here, and another one in get_rps_cpu().
> > > - */
> > > - if (READ_ONCE(table[index].ent) != val)
> > > - WRITE_ONCE(table[index].ent, val);
> > > -}
> > > +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash);
> > >
> > > static inline void _sock_rps_record_flow_hash(__u32 hash)
> > > {
> > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > index 203dc36aaed5..770cfb6fe06b 100644
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -5175,6 +5175,60 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> > > return cpu;
> > > }
> > >
> > > +/**
> > > + * rps_record_cond - Determine if RPS flow table should be updated
> > > + * @old_val: Previous flow record value
> > > + * @new_val: Target flow record value
> > > + *
> > > + * Returns true if the record needs an update.
> > > + */
> > > +static inline bool rps_record_cond(u32 old_val, u32 new_val)
> > > +{
> > > + u32 old_cpu = old_val & ~net_hotdata.rps_cpu_mask;
> > > + u32 new_cpu = new_val & ~net_hotdata.rps_cpu_mask;
> > > +
> > > + if (old_val == new_val)
> > > + return false;
> > > +
> > > + /* Force update if the recorded CPU is invalid or has gone offline */
> > > + if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
> > > + return true;
> > > +
> > > + /*
> > > + * Force an update if the current task is no longer permitted
> > > + * to run on the old_cpu.
> > > + */
> > > + if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
> > > + return true;
> > > +
> > > + /*
> > > + * If CPUs do not share a cache, allow the update to prevent
> > > + * expensive remote memory accesses and cache misses.
> > > + */
> > > + if (!cpus_share_cache(old_cpu, new_cpu))
> > > + return true;
> > > +
> > > + return false;
> > > +}
> > > +
> > > +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> > > +{
> > > + unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> > > + u32 val = hash & ~net_hotdata.rps_cpu_mask;
> > > + struct rps_sock_flow_table *table;
> > > +
> > > + /* We only give a hint, preemption can change CPU under us */
> > > + val |= raw_smp_processor_id();
> > > +
> > > + table = rps_tag_to_table(tag_ptr);
> > > + /* The following WRITE_ONCE() is paired with the READ_ONCE()
> > > + * here, and another one in get_rps_cpu().
> > > + */
> > > + if (rps_record_cond(READ_ONCE(table[index].ent), val))
> > > + WRITE_ONCE(table[index].ent, val);
> > > +}
> > > +EXPORT_SYMBOL(rps_record_sock_flow);
> > > +
> > > #ifdef CONFIG_RFS_ACCEL
> > >
> > > /**
> > > --
> > > 2.47.3
> > >
> >
> > Interesting idea but:
> >
> > 1) Some of us do not use CONFIG_RFS_ACCEL yet.
>
> Points 1 and 3 are noted. If this approach is acceptable, I can
> implement it as a selectable option (e.g., using a static key) to
> toggle the LLC-aware strategy.
>
> > 2) You put a very fast path function out-of-line, why ?
>
> The reason is that tun uses sock_rps_record_flow_hash(). When I moved
> all rps_record_sock_flow and rps_record_cond modifications into
> include/net/rps.h, it triggered the following compilation errors due
> to symbol visibility:
>
> ERROR: modpost: "cpus_share_cache" [drivers/net/tun.ko] undefined!
> ERROR: modpost: "cpus_share_cache" [net/sctp/sctp.ko] undefined!
> make[2]: *** [scripts/Makefile.modpost:147: Module.symvers] Error 1
>
> This arises because the patch uses cpus_share_cache() to limit the
> RFS/ARFS update frequency at the LLC level. To keep this in the fast
> path, I could move cpus_share_cache() to
> include/linux/sched/topology.h.
>
> > 3) I think the behavior should be selectable with a static key or
> > something like that.
>
> Please refer to my response to point 1.
>
> >
> > 4) Please provide benchmark results.
>
> The scenario is similar to the one described in "[RFC] problems with
> RFS on bRPC applications"[1].
>
> I attempted to enable ARFS on a Mellanox CX-6 NIC. While it performs
> well for simple workloads, performance degrades significantly when
> running a bRPC[2] workload on a 2-node NUMA machine. After tracing, I
> identified patterns that ARFS/RFS fails to handle efficiently:
>
> - Multiple threads use epoll to read from the same socket, causing
> frequent flow updates in sock_flow_table.
> - Threads reading from the socket migrate frequently between CPUs.
>
> I tested a PoC version using a bRPC service, utilizing funccount [3]
> to monitor execution frequency and perf top to observe hotspots:
>
> Before Patch
>
> The mlx5e_rx_flow_steer frequency is over 380k/s, and queued_spin_lock
> is a major hotspot (6.30% in perf top). The application also suffers
> from a noticeable drop.
>
> FUNC COUNT
> mlx5e_rx_flow_steer 387594
>
> FUNC COUNT
> mlx5e_rx_flow_steer 390142
>
> FUNC COUNT
> mlx5e_rx_flow_steer 386694
>
> FUNC COUNT
> mlx5e_rx_flow_steer 389094
>
> # perf top hotspot:
> queued_spin_lock 6.30%
>
> After Patch
>
> The ARFS update frequency is significantly reduced. queued_spin_lock
> is no longer a hotspot in perf top, and the application's overall
> performance has improved.
>
> FUNC COUNT
> mlx5e_rx_flow_steer 43
>
> FUNC COUNT
> mlx5e_rx_flow_steer 9
>
> FUNC COUNT
> mlx5e_rx_flow_steer 207
>
> FUNC COUNT
> mlx5e_rx_flow_steer 26
>
> 1: https://lore.kernel.org/netdev/CAHCEFEwToeQe_Ey8e=sf8fOmoobvrDCPsxw+hfUSoRawPX03+Q@mail.gmail.com/t/#u
> 2: https://github.com/apache/brpc
> 3: https://github.com/iovisor/bcc/blob/master/tools/funccount.py
>
>
> >
> > Thanks.
On Tue, Mar 31, 2026 at 12:42 AM chuang <nashuiliang@gmail.com> wrote: > > Hi, > > Any thoughts or suggestions on the patch "reduce RFS/ARFS flow updates > by checking LLC affinity"? It’s been a while since the last update. Please do not top-post on netdev@ Did I miss a V2 of your patch? Include actual performance numbers in your submission to catch our attention, or ask people using aRFS setups to test your patch.
© 2016 - 2026 Red Hat, Inc.