[PATCH mptcp-next v2 2/2] mptcp: record subflows in RPS table

Christoph Paasch via B4 Relay posted 2 patches 3 weeks, 1 day ago
[PATCH mptcp-next v2 2/2] mptcp: record subflows in RPS table
Posted by Christoph Paasch via B4 Relay 3 weeks, 1 day ago
From: Christoph Paasch <cpaasch@openai.com>

Accelerated Receive Flow Steering (aRFS) relies on sockets recording
their RX flow hash into the rps_sock_flow_table so that incoming packets
are steered to the CPU where the application runs.

With MPTCP, the application interacts with the parent MPTCP socket while
data is carried over per-subflow TCP sockets. Without recording these
subflows, aRFS cannot steer interrupts and RX processing for the flows
to the desired CPU.

Record all subflows in the RPS table by calling sock_rps_record_flow()
for each subflow at the start of mptcp_sendmsg(), mptcp_recvmsg() and
mptcp_stream_accept(), by using the new helper
mptcp_rps_record_subflows().

It does not by itself improve throughput, but ensures that IRQ and RX
processing are directed to the right CPU, which is a
prerequisite for effective aRFS.

Signed-off-by: Christoph Paasch <cpaasch@openai.com>
---
 net/mptcp/protocol.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4b510e04724f5c8be08a00a1cc03093bcd031905..543506b6a011af78208381857ab9c1962cbc6636 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -12,6 +12,7 @@
 #include <linux/sched/signal.h>
 #include <linux/atomic.h>
 #include <net/aligned_data.h>
+#include <net/rps.h>
 #include <net/sock.h>
 #include <net/inet_common.h>
 #include <net/inet_hashtables.h>
@@ -1740,6 +1741,20 @@ static u32 mptcp_send_limit(const struct sock *sk)
 	return limit - not_sent;
 }
 
+static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
+{
+	struct mptcp_subflow_context *subflow;
+
+	if (!rfs_is_needed())
+		return;
+
+	mptcp_for_each_subflow(msk, subflow) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+		sock_rps_record_flow(ssk);
+	}
+}
+
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1753,6 +1768,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	lock_sock(sk);
 
+	mptcp_rps_record_subflows(msk);
+
 	if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
 		     msg->msg_flags & MSG_FASTOPEN)) {
 		int copied_syn = 0;
@@ -2131,6 +2148,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		goto out_err;
 	}
 
+	mptcp_rps_record_subflows(msk);
+
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
 	len = min_t(size_t, len, INT_MAX);
@@ -3921,6 +3940,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 				mptcp_sock_graft(ssk, newsock);
 		}
 
+		mptcp_rps_record_subflows(msk);
+
 		/* Do late cleanup for the first subflow as necessary. Also
 		 * deal with bad peers not doing a complete shutdown.
 		 */

-- 
2.50.1
Re: [PATCH mptcp-next v2 2/2] mptcp: record subflows in RPS table
Posted by Matthieu Baerts 2 weeks, 6 days ago
Hi Christoph,

On 26/08/2025 06:30, Christoph Paasch via B4 Relay wrote:
> From: Christoph Paasch <cpaasch@openai.com>
> 
> Accelerated Receive Flow Steering (aRFS) relies on sockets recording
> their RX flow hash into the rps_sock_flow_table so that incoming packets
> are steered to the CPU where the application runs.
> 
> With MPTCP, the application interacts with the parent MPTCP socket while
> data is carried over per-subflow TCP sockets. Without recording these
> subflows, aRFS cannot steer interrupts and RX processing for the flows
> to the desired CPU.
> 
> Record all subflows in the RPS table by calling sock_rps_record_flow()
> for each subflow at the start of mptcp_sendmsg(), mptcp_recvmsg() and
> mptcp_stream_accept(), by using the new helper
> mptcp_rps_record_subflows().
> 
> It does not by itself improve throughput, but ensures that IRQ and RX
> processing are directed to the right CPU, which is a
> prerequisite for effective aRFS.

Thank you for the v2! I just have two small questions below, but I don't
think they are blocking.

> Signed-off-by: Christoph Paasch <cpaasch@openai.com>
> ---
>  net/mptcp/protocol.c | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
> 
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 4b510e04724f5c8be08a00a1cc03093bcd031905..543506b6a011af78208381857ab9c1962cbc6636 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -12,6 +12,7 @@
>  #include <linux/sched/signal.h>
>  #include <linux/atomic.h>
>  #include <net/aligned_data.h>
> +#include <net/rps.h>
>  #include <net/sock.h>
>  #include <net/inet_common.h>
>  #include <net/inet_hashtables.h>
> @@ -1740,6 +1741,20 @@ static u32 mptcp_send_limit(const struct sock *sk)
>  	return limit - not_sent;
>  }
>  
> +static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
> +{
> +	struct mptcp_subflow_context *subflow;
> +
> +	if (!rfs_is_needed())
> +		return;
> +
> +	mptcp_for_each_subflow(msk, subflow) {
> +		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> +
> +		sock_rps_record_flow(ssk);

I guess we don't want to call _sock_rps_record_flow() here to avoid
calling rfs_is_needed() multiple times in a row, because it is just an
extra static branch check?

> +	}
> +}
> +
>  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  {
>  	struct mptcp_sock *msk = mptcp_sk(sk);
> @@ -1753,6 +1768,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>  
>  	lock_sock(sk);
>  
> +	mptcp_rps_record_subflows(msk);
> +
>  	if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
>  		     msg->msg_flags & MSG_FASTOPEN)) {
>  		int copied_syn = 0;
> @@ -2131,6 +2148,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  		goto out_err;
>  	}
>  
> +	mptcp_rps_record_subflows(msk);
> +
>  	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
>  
>  	len = min_t(size_t, len, INT_MAX);
> @@ -3921,6 +3940,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
>  				mptcp_sock_graft(ssk, newsock);
>  		}
>  
> +		mptcp_rps_record_subflows(msk);

Out of curiosity, why not calling sock_rps_record_flow(ssk) here?

> +
>  		/* Do late cleanup for the first subflow as necessary. Also
>  		 * deal with bad peers not doing a complete shutdown.
>  		 */
> 

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.
Re: [PATCH mptcp-next v2 2/2] mptcp: record subflows in RPS table
Posted by Christoph Paasch 2 weeks, 6 days ago
On Thu, Aug 28, 2025 at 8:50 AM Matthieu Baerts <matttbe@kernel.org> wrote:
>
> Hi Christoph,
>
> On 26/08/2025 06:30, Christoph Paasch via B4 Relay wrote:
> > From: Christoph Paasch <cpaasch@openai.com>
> >
> > Accelerated Receive Flow Steering (aRFS) relies on sockets recording
> > their RX flow hash into the rps_sock_flow_table so that incoming packets
> > are steered to the CPU where the application runs.
> >
> > With MPTCP, the application interacts with the parent MPTCP socket while
> > data is carried over per-subflow TCP sockets. Without recording these
> > subflows, aRFS cannot steer interrupts and RX processing for the flows
> > to the desired CPU.
> >
> > Record all subflows in the RPS table by calling sock_rps_record_flow()
> > for each subflow at the start of mptcp_sendmsg(), mptcp_recvmsg() and
> > mptcp_stream_accept(), by using the new helper
> > mptcp_rps_record_subflows().
> >
> > It does not by itself improve throughput, but ensures that IRQ and RX
> > processing are directed to the right CPU, which is a
> > prerequisite for effective aRFS.
>
> Thank you for the v2! I just have two small questions below, but I don't
> think they are blocking.
>
> > Signed-off-by: Christoph Paasch <cpaasch@openai.com>
> > ---
> >  net/mptcp/protocol.c | 21 +++++++++++++++++++++
> >  1 file changed, 21 insertions(+)
> >
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index 4b510e04724f5c8be08a00a1cc03093bcd031905..543506b6a011af78208381857ab9c1962cbc6636 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -12,6 +12,7 @@
> >  #include <linux/sched/signal.h>
> >  #include <linux/atomic.h>
> >  #include <net/aligned_data.h>
> > +#include <net/rps.h>
> >  #include <net/sock.h>
> >  #include <net/inet_common.h>
> >  #include <net/inet_hashtables.h>
> > @@ -1740,6 +1741,20 @@ static u32 mptcp_send_limit(const struct sock *sk)
> >       return limit - not_sent;
> >  }
> >
> > +static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
> > +{
> > +     struct mptcp_subflow_context *subflow;
> > +
> > +     if (!rfs_is_needed())
> > +             return;
> > +
> > +     mptcp_for_each_subflow(msk, subflow) {
> > +             struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> > +
> > +             sock_rps_record_flow(ssk);
>
> I guess we don't want to call _sock_rps_record_flow() here to avoid
> calling rfs_is_needed() multiple times in a row, because it is just an
> extra static branch check?

IMO, the _sock_rps_* functions are not meant to be called  from
outside rps.h. They are just in the rps.h file so that they can be
static inline.
In any case, the _sock_rps_* functions are guarded by #ifdef CONFIG_RPS.

> > +     }
> > +}
> > +
> >  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> >  {
> >       struct mptcp_sock *msk = mptcp_sk(sk);
> > @@ -1753,6 +1768,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> >
> >       lock_sock(sk);
> >
> > +     mptcp_rps_record_subflows(msk);
> > +
> >       if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
> >                    msg->msg_flags & MSG_FASTOPEN)) {
> >               int copied_syn = 0;
> > @@ -2131,6 +2148,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> >               goto out_err;
> >       }
> >
> > +     mptcp_rps_record_subflows(msk);
> > +
> >       timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
> >
> >       len = min_t(size_t, len, INT_MAX);
> > @@ -3921,6 +3940,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
> >                               mptcp_sock_graft(ssk, newsock);
> >               }
> >
> > +             mptcp_rps_record_subflows(msk);
>
> Out of curiosity, why not calling sock_rps_record_flow(ssk) here?

Again, primarily to keep the code clean. MPTCP-code calls
mptcp_rps_record_subflows, which calls sock_rps_record_flow() which
calls the _sock_rps helpers. In my opinion that is a better structure
and flow.
No real pure technical reason.


Christoph
Re: [PATCH mptcp-next v2 2/2] mptcp: record subflows in RPS table
Posted by Matthieu Baerts 2 weeks, 6 days ago
On 28/08/2025 18:12, Christoph Paasch wrote:
> On Thu, Aug 28, 2025 at 8:50 AM Matthieu Baerts <matttbe@kernel.org> wrote:
>>
>> Hi Christoph,
>>
>> On 26/08/2025 06:30, Christoph Paasch via B4 Relay wrote:
>>> From: Christoph Paasch <cpaasch@openai.com>
>>>
>>> Accelerated Receive Flow Steering (aRFS) relies on sockets recording
>>> their RX flow hash into the rps_sock_flow_table so that incoming packets
>>> are steered to the CPU where the application runs.
>>>
>>> With MPTCP, the application interacts with the parent MPTCP socket while
>>> data is carried over per-subflow TCP sockets. Without recording these
>>> subflows, aRFS cannot steer interrupts and RX processing for the flows
>>> to the desired CPU.
>>>
>>> Record all subflows in the RPS table by calling sock_rps_record_flow()
>>> for each subflow at the start of mptcp_sendmsg(), mptcp_recvmsg() and
>>> mptcp_stream_accept(), by using the new helper
>>> mptcp_rps_record_subflows().
>>>
>>> It does not by itself improve throughput, but ensures that IRQ and RX
>>> processing are directed to the right CPU, which is a
>>> prerequisite for effective aRFS.
>>
>> Thank you for the v2! I just have two small questions below, but I don't
>> think they are blocking.
>>
>>> Signed-off-by: Christoph Paasch <cpaasch@openai.com>
>>> ---
>>>  net/mptcp/protocol.c | 21 +++++++++++++++++++++
>>>  1 file changed, 21 insertions(+)
>>>
>>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>>> index 4b510e04724f5c8be08a00a1cc03093bcd031905..543506b6a011af78208381857ab9c1962cbc6636 100644
>>> --- a/net/mptcp/protocol.c
>>> +++ b/net/mptcp/protocol.c
>>> @@ -12,6 +12,7 @@
>>>  #include <linux/sched/signal.h>
>>>  #include <linux/atomic.h>
>>>  #include <net/aligned_data.h>
>>> +#include <net/rps.h>
>>>  #include <net/sock.h>
>>>  #include <net/inet_common.h>
>>>  #include <net/inet_hashtables.h>
>>> @@ -1740,6 +1741,20 @@ static u32 mptcp_send_limit(const struct sock *sk)
>>>       return limit - not_sent;
>>>  }
>>>
>>> +static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
>>> +{
>>> +     struct mptcp_subflow_context *subflow;
>>> +
>>> +     if (!rfs_is_needed())
>>> +             return;
>>> +
>>> +     mptcp_for_each_subflow(msk, subflow) {
>>> +             struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
>>> +
>>> +             sock_rps_record_flow(ssk);
>>
>> I guess we don't want to call _sock_rps_record_flow() here to avoid
>> calling rfs_is_needed() multiple times in a row, because it is just an
>> extra static branch check?
> 
> IMO, the _sock_rps_* functions are not meant to be called  from
> outside rps.h. They are just in the rps.h file so that they can be
> static inline.
> In any case, the _sock_rps_* functions are guarded by #ifdef CONFIG_RPS.

Thank you for the explanation, fine by me!

>>> +     }
>>> +}
>>> +
>>>  static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>>  {
>>>       struct mptcp_sock *msk = mptcp_sk(sk);
>>> @@ -1753,6 +1768,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>>>
>>>       lock_sock(sk);
>>>
>>> +     mptcp_rps_record_subflows(msk);
>>> +
>>>       if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
>>>                    msg->msg_flags & MSG_FASTOPEN)) {
>>>               int copied_syn = 0;
>>> @@ -2131,6 +2148,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>>>               goto out_err;
>>>       }
>>>
>>> +     mptcp_rps_record_subflows(msk);
>>> +
>>>       timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
>>>
>>>       len = min_t(size_t, len, INT_MAX);
>>> @@ -3921,6 +3940,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
>>>                               mptcp_sock_graft(ssk, newsock);
>>>               }
>>>
>>> +             mptcp_rps_record_subflows(msk);
>>
>> Out of curiosity, why not calling sock_rps_record_flow(ssk) here?
> 
> Again, primarily to keep the code clean. MPTCP-code calls
> mptcp_rps_record_subflows, which calls sock_rps_record_flow() which
> calls the _sock_rps helpers. In my opinion that is a better structure
> and flow.
> No real pure technical reason.

I see, I understand the logic there, and I just noticed the code above
is doing a mptcp_for_each_subflow() as well, fine to call the mptcp_
version here then!

Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.