[PATCH mptcp-next v3 2/3] mptcp: support MSG_ERRQUEUE on the parent socket

David Carlier posted 3 patches 1 month, 2 weeks ago
There is a newer version of this series
[PATCH mptcp-next v3 2/3] mptcp: support MSG_ERRQUEUE on the parent socket
Posted by David Carlier 1 month, 2 weeks ago
Handle MSG_ERRQUEUE on the MPTCP socket by selecting a subflow with
pending errqueue data, moving one error skb to the parent socket, and
consuming it through the parent socket ABI.

This surfaces subflow errqueue activity through poll(), keeps the
userspace ABI tied to the socket being used, and restores the skb to
the subflow errqueue if requeueing to the parent fails under rmem
pressure.

Signed-off-by: David Carlier <devnexen@gmail.com>
Assisted-by: Codex:gpt-5
Signed-off-by: David Carlier <devnexen@gmail.com>
---
 net/mptcp/protocol.c | 123 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 103 insertions(+), 20 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 6b486fc94c16..87871216bab2 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -818,28 +818,29 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
 static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
 {
 	int ssk_state;
-	int err;
+	int err = 0;
+	bool has_errqueue;
+
+	has_errqueue = !skb_queue_empty_lockless(&ssk->sk_error_queue);
 
-	/* only propagate errors on fallen-back sockets or
-	 * on MPC connect
+	/* Only fallback sockets and the MPC connect path inherit TCP's sk_err
+	 * semantics; consume ssk->sk_err only on those paths so steady-state
+	 * MPTCP doesn't silently drop TCP's one-shot errors.
 	 */
-	if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
-		return false;
+	if (sk->sk_state == TCP_SYN_SENT ||
+	    __mptcp_check_fallback(mptcp_sk(sk))) {
+		err = sock_error(ssk);
+		if (err) {
+			ssk_state = inet_sk_state_load(ssk);
+			if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
+				mptcp_set_state(sk, ssk_state);
+			WRITE_ONCE(sk->sk_err, -err);
+		}
+	}
 
-	err = sock_error(ssk);
-	if (!err)
+	if (!err && !has_errqueue)
 		return false;
 
-	/* We need to propagate only transition to CLOSE state.
-	 * Orphaned socket will see such state change via
-	 * subflow_sched_work_if_closed() and that path will properly
-	 * destroy the msk as needed.
-	 */
-	ssk_state = inet_sk_state_load(ssk);
-	if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
-		mptcp_set_state(sk, ssk_state);
-	WRITE_ONCE(sk->sk_err, -err);
-
 	/* This barrier is coupled with smp_rmb() in mptcp_poll() */
 	smp_wmb();
 	sk_error_report(sk);
@@ -2286,6 +2287,68 @@ static unsigned int mptcp_inq_hint(const struct sock *sk)
 	return 0;
 }
 
+static struct sock *mptcp_pick_errqueue_subflow(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow;
+	struct sock *ssk = NULL;
+
+	lock_sock(sk);
+	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+		struct sock *subflow_sk = mptcp_subflow_tcp_sock(subflow);
+
+		if (skb_queue_empty_lockless(&subflow_sk->sk_error_queue))
+			continue;
+
+		if (!refcount_inc_not_zero(&subflow_sk->sk_refcnt))
+			continue;
+
+		ssk = subflow_sk;
+		break;
+	}
+	release_sock(sk);
+
+	return ssk;
+}
+
+static bool mptcp_has_error_queue(const struct sock *sk)
+{
+	return !skb_queue_empty_lockless(&sk->sk_error_queue);
+}
+
+static int mptcp_recv_error(struct sock *sk, struct msghdr *msg, int len)
+{
+	struct sk_buff *skb;
+	struct sock *ssk;
+	int ret, ret2;
+
+	if (READ_ONCE(sk->sk_err) || mptcp_has_error_queue(sk))
+		return inet_recv_error(sk, msg, len);
+
+	ssk = mptcp_pick_errqueue_subflow(sk);
+	if (!ssk)
+		return -EAGAIN;
+
+	skb = sock_dequeue_err_skb(ssk);
+	if (!skb)
+		goto put_ssk;
+
+	ret = sock_queue_err_skb(sk, skb);
+	if (ret) {
+		ret2 = sock_queue_err_skb(ssk, skb);
+		sock_put(ssk);
+		if (ret2)
+			kfree_skb(skb);
+		return ret;
+	}
+
+	sock_put(ssk);
+	return inet_recv_error(sk, msg, len);
+
+put_ssk:
+	sock_put(ssk);
+	return -EAGAIN;
+}
+
 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			 int flags)
 {
@@ -2295,9 +2358,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	int target;
 	long timeo;
 
-	/* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */
 	if (unlikely(flags & MSG_ERRQUEUE))
-		return inet_recv_error(sk, msg, len);
+		return mptcp_recv_error(sk, msg, len);
 
 	lock_sock(sk);
 	if (unlikely(sk->sk_state == TCP_LISTEN)) {
@@ -4296,6 +4358,26 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 	return 0;
 }
 
+static bool mptcp_subflow_has_error(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow;
+	bool has_error = false;
+
+	mptcp_data_lock(sk);
+	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+		if (READ_ONCE(ssk->sk_err) ||
+		    !skb_queue_empty_lockless(&ssk->sk_error_queue)) {
+			has_error = true;
+			break;
+		}
+	}
+	mptcp_data_unlock(sk);
+
+	return has_error;
+}
+
 static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 			   struct poll_table_struct *wait)
 {
@@ -4339,7 +4421,8 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 
 	/* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
 	smp_rmb();
-	if (READ_ONCE(sk->sk_err))
+	if (READ_ONCE(sk->sk_err) || mptcp_has_error_queue(sk) ||
+	    mptcp_subflow_has_error(sk))
 		mask |= EPOLLERR;
 
 	return mask;
-- 
2.53.0
Re: [PATCH mptcp-next v3 2/3] mptcp: support MSG_ERRQUEUE on the parent socket
Posted by Paolo Abeni 1 month, 2 weeks ago

On 4/22/26 12:33 AM, David Carlier wrote:
> Handle MSG_ERRQUEUE on the MPTCP socket by selecting a subflow with
> pending errqueue data, moving one error skb to the parent socket, and
> consuming it through the parent socket ABI.
> 
> This surfaces subflow errqueue activity through poll(), keeps the
> userspace ABI tied to the socket being used, and restores the skb to
> the subflow errqueue if requeueing to the parent fails under rmem
> pressure.
> 
> Signed-off-by: David Carlier <devnexen@gmail.com>
> Assisted-by: Codex:gpt-5
> Signed-off-by: David Carlier <devnexen@gmail.com>
> ---
>  net/mptcp/protocol.c | 123 ++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 103 insertions(+), 20 deletions(-)
> 
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 6b486fc94c16..87871216bab2 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -818,28 +818,29 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
>  static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
>  {
>  	int ssk_state;
> -	int err;
> +	int err = 0;
> +	bool has_errqueue;

Reverse christmas tree above.

> +
> +	has_errqueue = !skb_queue_empty_lockless(&ssk->sk_error_queue);
>  
> -	/* only propagate errors on fallen-back sockets or
> -	 * on MPC connect
> +	/* Only fallback sockets and the MPC connect path inherit TCP's sk_err
> +	 * semantics; consume ssk->sk_err only on those paths so steady-state
> +	 * MPTCP doesn't silently drop TCP's one-shot errors.
>  	 */
> -	if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
> -		return false;
> +	if (sk->sk_state == TCP_SYN_SENT ||
> +	    __mptcp_check_fallback(mptcp_sk(sk))) {
> +		err = sock_error(ssk);
> +		if (err) {
> +			ssk_state = inet_sk_state_load(ssk);
> +			if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
> +				mptcp_set_state(sk, ssk_state);
> +			WRITE_ONCE(sk->sk_err, -err);
> +		}
> +	}
>  
> -	err = sock_error(ssk);
> -	if (!err)
> +	if (!err && !has_errqueue)
>  		return false;
>  
> -	/* We need to propagate only transition to CLOSE state.
> -	 * Orphaned socket will see such state change via
> -	 * subflow_sched_work_if_closed() and that path will properly
> -	 * destroy the msk as needed.
> -	 */
> -	ssk_state = inet_sk_state_load(ssk);
> -	if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
> -		mptcp_set_state(sk, ssk_state);
> -	WRITE_ONCE(sk->sk_err, -err);

Avoid reordering the code: it makes the patch more complex and hard to
read. Keep the existing path as-is and add the needed handling under the
existing:

	if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))

condition. If needed use goto/label to consolidate the return path.

More important: am not sure propagating subflow errors to the main
socket is the right thing. A fatal error on a subflow does not affect
the mptcp unless there is a single active subflow. Application will
start receiving unexpected errors when the transfer is working just fine.

 -
>  	/* This barrier is coupled with smp_rmb() in mptcp_poll() */
>  	smp_wmb();
>  	sk_error_report(sk);
> @@ -2286,6 +2287,68 @@ static unsigned int mptcp_inq_hint(const struct sock *sk)
>  	return 0;
>  }
>  
> +static struct sock *mptcp_pick_errqueue_subflow(struct sock *sk)
> +{
> +	struct mptcp_subflow_context *subflow;
> +	struct sock *ssk = NULL;
> +
> +	lock_sock(sk);
> +	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
> +		struct sock *subflow_sk = mptcp_subflow_tcp_sock(subflow);
> +
> +		if (skb_queue_empty_lockless(&subflow_sk->sk_error_queue))
> +			continue;
> +
> +		if (!refcount_inc_not_zero(&subflow_sk->sk_refcnt))
> +			continue;
> +
> +		ssk = subflow_sk;
> +		break;
> +	}
> +	release_sock(sk);
> +
> +	return ssk;
> +}
> +
> +static bool mptcp_has_error_queue(const struct sock *sk)
> +{
> +	return !skb_queue_empty_lockless(&sk->sk_error_queue);
> +}
> +
> +static int mptcp_recv_error(struct sock *sk, struct msghdr *msg, int len)
> +{
> +	struct sk_buff *skb;
> +	struct sock *ssk;
> +	int ret, ret2;
> +
> +	if (READ_ONCE(sk->sk_err) || mptcp_has_error_queue(sk))
> +		return inet_recv_error(sk, msg, len);
> +
> +	ssk = mptcp_pick_errqueue_subflow(sk);
> +	if (!ssk)
> +		return -EAGAIN;
> +
> +	skb = sock_dequeue_err_skb(ssk);
> +	if (!skb)
> +		goto put_ssk;
> +
> +	ret = sock_queue_err_skb(sk, skb);
> +	if (ret) {
> +		ret2 = sock_queue_err_skb(ssk, skb);
> +		sock_put(ssk);
> +		if (ret2)
> +			kfree_skb(skb);
> +		return ret;
> +	}
> +
> +	sock_put(ssk);
> +	return inet_recv_error(sk, msg, len);
> +
> +put_ssk:
> +	sock_put(ssk);
> +	return -EAGAIN;
> +}
> +
>  static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  			 int flags)
>  {
> @@ -2295,9 +2358,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  	int target;
>  	long timeo;
>  
> -	/* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */
>  	if (unlikely(flags & MSG_ERRQUEUE))
> -		return inet_recv_error(sk, msg, len);
> +		return mptcp_recv_error(sk, msg, len);
>  
>  	lock_sock(sk);
>  	if (unlikely(sk->sk_state == TCP_LISTEN)) {
> @@ -4296,6 +4358,26 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
>  	return 0;
>  }
>  
> +static bool mptcp_subflow_has_error(struct sock *sk)
> +{
> +	struct mptcp_subflow_context *subflow;
> +	bool has_error = false;
> +
> +	mptcp_data_lock(sk);
> +	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
> +		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> +
> +		if (READ_ONCE(ssk->sk_err) ||
> +		    !skb_queue_empty_lockless(&ssk->sk_error_queue)) {
> +			has_error = true;
> +			break;
> +		}
> +	}
> +	mptcp_data_unlock(sk);

The data lock is not enough to protect the subflows list, and
unconditionally acquiring it in the poll callback is an no go (will
impact performances negatively).

_If_ error propagation makes sense, you should probably try to move the
err skb from the subflow error queue into the msk one at
sk_error_queue() time.

/P
Re: [PATCH mptcp-next v3 2/3] mptcp: support MSG_ERRQUEUE on the parent socket
Posted by David CARLIER 1 month, 2 weeks ago
 On 4/22/26 10:28 AM, Paolo Abeni wrote:
  > Reverse christmas tree above.

  Ack.

  > Avoid reordering the code [...] use goto/label to consolidate the
  > return path.

  Ack.

  > More important: am not sure propagating subflow errors to the main
  > socket is the right thing. [...]

  The sk->sk_err propagation stays gated on the existing
  SYN_SENT/fallback condition — the patch only adds errqueue skb
  handling (ICMP, timestamps, zerocopy completions), whose cmsg
payload
  identifies the originating peer. So fatal-error semantics are
  unchanged.

  > _If_ error propagation makes sense, you should probably try to
move
  > the err skb from the subflow error queue into the msk one at
  > sk_error_queue() time.

  Yes — I'll splice from subflow_error_report() onto
  msk->sk_error_queue. mptcp_poll() then just adds a
  !skb_queue_empty_lockless() check, recvmsg(MSG_ERRQUEUE) stays a
plain
  inet_recv_error(), and the subflow walk + data-lock go away. Does
that
  match what you had in mind?