:p
atchew
Login
MPTCP already advertises IP_RECVERR/IPV6_RECVERR as supported, but the parent socket does not currently provide usable MSG_ERRQUEUE handling. This series wires the MPTCP socket up to the IPv4/IPv6 error queue paths. It propagates RECVERR-related sockopts to existing and future subflows, makes poll() report pending errqueue activity through the parent socket, and allows recvmsg(MSG_ERRQUEUE) on the MPTCP socket to consume queued errors with the parent socket ABI. The series also handles mixed-family subflows by applying the matching sockopt according to each subflow family, and avoids silently losing an error skb if requeueing to the parent socket fails under rmem pressure. v2 -> v3: - Only consume ssk->sk_err in the fallback / MPC-connect branch of __mptcp_subflow_error_report(). Steady-state MPTCP now leaves TCP's one-shot sk_err to TCP's own consumer instead of silently draining it via sock_error(). - In mptcp_recv_error(), also route to inet_recv_error() when sk->sk_err is set, so a fallback-propagated error reaches userspace even when the parent errqueue is empty. - Scope the new selftest to IP_RECVERR sockopt propagation only. End-to-end errqueue delivery (TX timestamps, ICMP, zerocopy) depends on subflow-side producers that are out of scope for this series and will be covered by follow-up work. Fixes the mptcp_sockopt selftest timeout reported by the MPTCP CI on v2. v1 -> v2: - Retargeted to mptcp-next per Matthieu Baerts' feedback (net-next closed during the merge window; iterate on the MPTCP tree). - Guard mptcp_setsockopt_v6_recverr() and its dispatch cases in mptcp_setsockopt_v6() with #if IS_ENABLED(CONFIG_IPV6) to fix the MPTCP CI link break on without_ipv6/with_mptcp configs (undefined reference to ipv6_setsockopt). 
v1: https://lore.kernel.org/mptcp/20260421152216.38127-1-devnexen@gmail.com/ v2: https://lore.kernel.org/mptcp/20260421191337.58341-1-devnexen@gmail.com/ David Carlier (3): mptcp: propagate RECVERR sockopts to subflows mptcp: support MSG_ERRQUEUE on the parent socket selftests: mptcp: cover IP_RECVERR sockopt propagation net/mptcp/protocol.c | 123 ++++++++++++++--- net/mptcp/sockopt.c | 129 ++++++++++++++++++ .../selftests/net/mptcp/mptcp_sockopt.c | 55 ++++++++ 3 files changed, 287 insertions(+), 20 deletions(-) base-commit: 4464afe97dc56e817a23b730979cbc6fc48f1912 -- 2.53.0
Propagate IP_RECVERR/IP_RECVERR_RFC4884 and IPV6_RECVERR/IPV6_RECVERR_RFC4884 from the MPTCP socket to existing and future subflows. Apply the matching sockopt according to the subflow family so mixed-family subflows stay aligned with the parent socket configuration, including disable-time errqueue purge semantics. Signed-off-by: David Carlier <devnexen@gmail.com> Assisted-by: Codex:gpt-5 Signed-off-by: David Carlier <devnexen@gmail.com> --- net/mptcp/sockopt.c | 129 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index XXXXXXX..XXXXXXX 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -XXX,XX +XXX,XX @@ #include <linux/kernel.h> #include <linux/module.h> +#include <net/ip.h> +#include <net/ipv6.h> #include <net/sock.h> #include <net/protocol.h> #include <net/tcp.h> @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } +static bool mptcp_recverr_enabled(const struct sock *sk, bool rfc4884) +{ + bool enabled; + + enabled = rfc4884 ? inet_test_bit(RECVERR_RFC4884, sk) : + inet_test_bit(RECVERR, sk); + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + enabled |= rfc4884 ? inet6_test_bit(RECVERR6_RFC4884, sk) : + inet6_test_bit(RECVERR6, sk); +#endif + + return enabled; +} + +static int mptcp_subflow_set_recverr(struct sock *sk, struct sock *ssk, + bool rfc4884) +{ + int level, optname, val; + +#if IS_ENABLED(CONFIG_IPV6) + if (ssk->sk_family == AF_INET6) { + level = SOL_IPV6; + optname = rfc4884 ? IPV6_RECVERR_RFC4884 : IPV6_RECVERR; + } else +#endif + { + level = SOL_IP; + optname = rfc4884 ? 
IP_RECVERR_RFC4884 : IP_RECVERR; + } + + val = mptcp_recverr_enabled(sk, rfc4884); + return tcp_setsockopt(ssk, level, optname, KERNEL_SOCKPTR(&val), + sizeof(val)); +} + +#if IS_ENABLED(CONFIG_IPV6) +static int mptcp_setsockopt_v6_recverr(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int ret; + + ret = ipv6_setsockopt(sk, SOL_IPV6, optname, optval, optlen); + if (ret) + return ret; + + lock_sock(sk); + sockopt_seq_inc(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool rfc4884 = optname == IPV6_RECVERR_RFC4884; + + ret = mptcp_subflow_set_recverr(sk, ssk, rfc4884); + if (ret) + break; + subflow->setsockopt_seq = msk->setsockopt_seq; + } + release_sock(sk); + + return ret; +} +#endif + static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, release_sock(sk); break; +#if IS_ENABLED(CONFIG_IPV6) + case IPV6_RECVERR: + case IPV6_RECVERR_RFC4884: + ret = mptcp_setsockopt_v6_recverr(msk, optname, optval, optlen); + break; +#endif } return ret; @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, return 0; } +static int mptcp_setsockopt_v4_recverr(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int err; + + err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); + if (err) + return err; + + lock_sock(sk); + sockopt_seq_inc(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool rfc4884 = optname == IP_RECVERR_RFC4884; + + err = mptcp_subflow_set_recverr(sk, ssk, rfc4884); + if (err) + break; + subflow->setsockopt_seq = msk->setsockopt_seq; + } + 
release_sock(sk); + + return err; +} + static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); + case IP_RECVERR: + case IP_RECVERR_RFC4884: + return mptcp_setsockopt_v4_recverr(msk, optname, optval, optlen); } return -EOPNOTSUPP; @@ -XXX,XX +XXX,XX @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, case IP_LOCAL_PORT_RANGE: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->local_port_range)); + case IP_RECVERR: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(RECVERR, sk)); + case IP_RECVERR_RFC4884: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(RECVERR_RFC4884, sk)); } return -EOPNOTSUPP; @@ -XXX,XX +XXX,XX @@ static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname, case IPV6_FREEBIND: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(FREEBIND, sk)); + case IPV6_RECVERR: + return mptcp_put_int_option(msk, optval, optlen, + inet6_test_bit(RECVERR6, sk)); + case IPV6_RECVERR_RFC4884: + return mptcp_put_int_option(msk, optval, optlen, + inet6_test_bit(RECVERR6_RFC4884, sk)); } return -EOPNOTSUPP; @@ -XXX,XX +XXX,XX @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; + bool recverr, recverr_rfc4884; bool keep_open; keep_open = sock_flag(sk, SOCK_KEEPOPEN); @@ -XXX,XX +XXX,XX @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); WRITE_ONCE(inet_sk(ssk)->local_port_range, 
READ_ONCE(inet_sk(sk)->local_port_range)); + recverr = mptcp_recverr_enabled(sk, false); + recverr_rfc4884 = mptcp_recverr_enabled(sk, true); +#if IS_ENABLED(CONFIG_IPV6) + if (ssk->sk_family == AF_INET6) { + inet6_assign_bit(RECVERR6, ssk, recverr); + inet6_assign_bit(RECVERR6_RFC4884, ssk, recverr_rfc4884); + } else +#endif + { + inet_assign_bit(RECVERR, ssk, recverr); + inet_assign_bit(RECVERR_RFC4884, ssk, recverr_rfc4884); + } } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) -- 2.53.0
Handle MSG_ERRQUEUE on the MPTCP socket by selecting a subflow with pending errqueue data, moving one error skb to the parent socket, and consuming it through the parent socket ABI. This surfaces subflow errqueue activity through poll(), keeps the userspace ABI tied to the socket being used, and restores the skb to the subflow errqueue if requeueing to the parent fails under rmem pressure. Signed-off-by: David Carlier <devnexen@gmail.com> Assisted-by: Codex:gpt-5 Signed-off-by: David Carlier <devnexen@gmail.com> --- net/mptcp/protocol.c | 123 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 103 insertions(+), 20 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index XXXXXXX..XXXXXXX 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -XXX,XX +XXX,XX @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) { int ssk_state; - int err; + int err = 0; + bool has_errqueue; + + has_errqueue = !skb_queue_empty_lockless(&ssk->sk_error_queue); - /* only propagate errors on fallen-back sockets or - * on MPC connect + /* Only fallback sockets and the MPC connect path inherit TCP's sk_err + * semantics; consume ssk->sk_err only on those paths so steady-state + * MPTCP doesn't silently drop TCP's one-shot errors. */ - if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) - return false; + if (sk->sk_state == TCP_SYN_SENT || + __mptcp_check_fallback(mptcp_sk(sk))) { + err = sock_error(ssk); + if (err) { + ssk_state = inet_sk_state_load(ssk); + if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) + mptcp_set_state(sk, ssk_state); + WRITE_ONCE(sk->sk_err, -err); + } + } - err = sock_error(ssk); - if (!err) + if (!err && !has_errqueue) return false; - /* We need to propagate only transition to CLOSE state. 
- * Orphaned socket will see such state change via - * subflow_sched_work_if_closed() and that path will properly - * destroy the msk as needed. - */ - ssk_state = inet_sk_state_load(ssk); - if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) - mptcp_set_state(sk, ssk_state); - WRITE_ONCE(sk->sk_err, -err); - /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); sk_error_report(sk); @@ -XXX,XX +XXX,XX @@ static unsigned int mptcp_inq_hint(const struct sock *sk) return 0; } +static struct sock *mptcp_pick_errqueue_subflow(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct sock *ssk = NULL; + + lock_sock(sk); + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + struct sock *subflow_sk = mptcp_subflow_tcp_sock(subflow); + + if (skb_queue_empty_lockless(&subflow_sk->sk_error_queue)) + continue; + + if (!refcount_inc_not_zero(&subflow_sk->sk_refcnt)) + continue; + + ssk = subflow_sk; + break; + } + release_sock(sk); + + return ssk; +} + +static bool mptcp_has_error_queue(const struct sock *sk) +{ + return !skb_queue_empty_lockless(&sk->sk_error_queue); +} + +static int mptcp_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct sk_buff *skb; + struct sock *ssk; + int ret, ret2; + + if (READ_ONCE(sk->sk_err) || mptcp_has_error_queue(sk)) + return inet_recv_error(sk, msg, len); + + ssk = mptcp_pick_errqueue_subflow(sk); + if (!ssk) + return -EAGAIN; + + skb = sock_dequeue_err_skb(ssk); + if (!skb) + goto put_ssk; + + ret = sock_queue_err_skb(sk, skb); + if (ret) { + ret2 = sock_queue_err_skb(ssk, skb); + sock_put(ssk); + if (ret2) + kfree_skb(skb); + return ret; + } + + sock_put(ssk); + return inet_recv_error(sk, msg, len); + +put_ssk: + sock_put(ssk); + return -EAGAIN; +} + static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { @@ -XXX,XX +XXX,XX @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int target; long timeo; - /* MSG_ERRQUEUE is really a no-op 
till we support IP_RECVERR */ if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len); + return mptcp_recv_error(sk, msg, len); lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { @@ -XXX,XX +XXX,XX @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) return 0; } +static bool mptcp_subflow_has_error(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + bool has_error = false; + + mptcp_data_lock(sk); + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (READ_ONCE(ssk->sk_err) || + !skb_queue_empty_lockless(&ssk->sk_error_queue)) { + has_error = true; + break; + } + } + mptcp_data_unlock(sk); + + return has_error; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -XXX,XX +XXX,XX @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); - if (READ_ONCE(sk->sk_err)) + if (READ_ONCE(sk->sk_err) || mptcp_has_error_queue(sk) || + mptcp_subflow_has_error(sk)) mask |= EPOLLERR; return mask; -- 2.53.0
Exercise setsockopt/getsockopt of IP_RECVERR and IPV6_RECVERR on the MPTCP parent socket, including the empty-errqueue EAGAIN contract on MSG_ERRQUEUE|MSG_DONTWAIT. End-to-end errqueue delivery (ICMP, TX timestamps, zerocopy) depends on subflow-side producers that are out of scope for this series and will be covered by follow-up work. Assisted-by: Codex:gpt-5 Signed-off-by: David Carlier <devnexen@gmail.com> --- .../selftests/net/mptcp/mptcp_sockopt.c | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index XXXXXXX..XXXXXXX 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -XXX,XX +XXX,XX @@ static void test_ip_tos_sockopt(int fd) xerror("expect socklen_t == -1"); } +static void test_ip_recverr_sockopt(int fd) +{ + struct iovec iov = { + .iov_base = &(char){ 0 }, + .iov_len = 1, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + int one = 1, zero = 0, val = -1; + socklen_t s = sizeof(val); + int level, optname, r; + + switch (pf) { + case AF_INET: + level = SOL_IP; + optname = IP_RECVERR; + break; + case AF_INET6: + level = SOL_IPV6; + optname = IPV6_RECVERR; + break; + default: + xerror("Unknown pf %d\n", pf); + } + + r = setsockopt(fd, level, optname, &one, sizeof(one)); + if (r) + die_perror("setsockopt recverr on"); + + r = getsockopt(fd, level, optname, &val, &s); + if (r) + die_perror("getsockopt recverr on"); + if (s != sizeof(val) || val != one) + xerror("recverr on mismatch val=%d len=%u", val, s); + + r = recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT); + if (r != -1 || errno != EAGAIN) + xerror("expected empty errqueue to return EAGAIN, ret=%d errno=%d", r, errno); + + r = setsockopt(fd, level, optname, &zero, sizeof(zero)); + if (r) + die_perror("setsockopt recverr off"); + + val = -1; + s = sizeof(val); + r = getsockopt(fd, level, optname, 
&val, &s); + if (r) + die_perror("getsockopt recverr off"); + if (s != sizeof(val) || val != zero) + xerror("recverr off mismatch val=%d len=%u", val, s); +} + static int client(int pipefd) { int fd = -1; @@ -XXX,XX +XXX,XX @@ static int client(int pipefd) } test_ip_tos_sockopt(fd); + test_ip_recverr_sockopt(fd); connect_one_server(fd, pipefd); -- 2.53.0
MPTCP already advertises IP_RECVERR/IPV6_RECVERR as supported, but the parent socket does not currently provide usable MSG_ERRQUEUE handling. This series wires the MPTCP socket up to the IPv4/IPv6 error queue paths. It propagates RECVERR-related sockopts to existing and future subflows, makes poll() report pending errqueue activity through the parent socket, and lets recvmsg(MSG_ERRQUEUE) on the MPTCP socket consume queued errors with the parent socket ABI. A new prerequisite patch factors the per-flag inet_flags propagation in sync_socket_options() into a single masked word copy, so further inet_flags propagated by MPTCP can be added by extending the mask rather than touching the call site. Patch 2 then leverages the existing mptcp_setsockopt_all_sf() helper for the setsockopt path and extends MPTCP_INET_FLAGS_MASK with the four RECVERR bits, dropping the family-specific helpers from v3. Based-on: <20260424-mptcp-pm-sockopt-set-all-sf-v1-1-38e7023822f8@kernel.org> v3 -> v4: - New patch 1/4: factor inet_flags propagation in sync_socket_options() through MPTCP_INET_FLAGS_MASK, per Paolo's review. - Patch 2/4 (was 1/3): drop the mptcp_recverr_enabled() and mptcp_subflow_set_recverr() helpers; route the setsockopt path through mptcp_setsockopt_all_sf(). Inherit the four RECVERR bits via MPTCP_INET_FLAGS_MASK in sync_socket_options() instead of explicit inet[6]_assign_bit() calls. - Patch 3/4 (was 2/3): rework the MSG_ERRQUEUE plumbing per Paolo's review. Subflow err skbs are now spliced onto the parent msk's sk_error_queue from __mptcp_subflow_error_report() via the new __mptcp_subflow_splice_errqueue() helper. recvmsg(MSG_ERRQUEUE) on the parent reverts to plain inet_recv_error(), and mptcp_poll() only inspects the parent's sk_error_queue -- no more on-demand subflow walks, no extra lock_sock() / data_lock() in the poll or recv paths. 
Keep the original early-return structure of __mptcp_subflow_error_report() and fix the reverse christmas-tree variable order Paolo flagged. v2 -> v3: - Only consume ssk->sk_err in the fallback / MPC-connect branch of __mptcp_subflow_error_report(). Steady-state MPTCP now leaves TCP's one-shot sk_err to TCP's own consumer instead of silently draining it via sock_error(). - In mptcp_recv_error(), also route to inet_recv_error() when sk->sk_err is set, so a fallback-propagated error reaches userspace even when the parent errqueue is empty. - Scope the new selftest to IP_RECVERR sockopt propagation only. End-to-end errqueue delivery (TX timestamps, ICMP, zerocopy) depends on subflow-side producers that are out of scope for this series and will be covered by follow-up work. Fixes the mptcp_sockopt selftest timeout reported by the MPTCP CI on v2. v1 -> v2: - Retargeted to mptcp-next per Matthieu Baerts' feedback (net-next closed during the merge window; iterate on the MPTCP tree). - Guard mptcp_setsockopt_v6_recverr() and its dispatch cases in mptcp_setsockopt_v6() with #if IS_ENABLED(CONFIG_IPV6) to fix the MPTCP CI link break on without_ipv6/with_mptcp configs (undefined reference to ipv6_setsockopt). v1: https://lore.kernel.org/mptcp/20260421152216.38127-1-devnexen@gmail.com/ v2: https://lore.kernel.org/mptcp/20260421191337.58341-1-devnexen@gmail.com/ v3: https://lore.kernel.org/mptcp/20260421223338.52743-1-devnexen@gmail.com/ David Carlier (4): mptcp: sockopt: factor inet_flags propagation into a mask mptcp: propagate RECVERR sockopts to subflows mptcp: support MSG_ERRQUEUE on the parent socket selftests: mptcp: cover IP_RECVERR sockopt propagation net/mptcp/protocol.c | 33 +++++- net/mptcp/sockopt.c | 107 ++++++++++++++---- .../selftests/net/mptcp/mptcp_sockopt.c | 55 +++++++++ 3 files changed, 170 insertions(+), 25 deletions(-) -- 2.53.0
Replace the per-flag inet_assign_bit() calls in sync_socket_options() with a masked word-level copy of inet_sk()->inet_flags. Introduce MPTCP_INET_FLAGS_MASK so further flags propagated by MPTCP can be added by extending the mask rather than touching the call site. No functional change. Suggested-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: David Carlier <devnexen@gmail.com> --- net/mptcp/sockopt.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index XXXXXXX..XXXXXXX 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -XXX,XX +XXX,XX @@ #define MIN_INFO_OPTLEN_SIZE 16 #define MIN_FULL_INFO_OPTLEN_SIZE 40 +#define MPTCP_INET_FLAGS_MASK \ + (BIT(INET_FLAGS_TRANSPARENT) | \ + BIT(INET_FLAGS_FREEBIND) | \ + BIT(INET_FLAGS_BIND_ADDRESS_NO_PORT)) static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { @@ -XXX,XX +XXX,XX @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; + unsigned long flags; bool keep_open; keep_open = sock_flag(sk, SOCK_KEEPOPEN); @@ -XXX,XX +XXX,XX @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt); tcp_sock_set_maxseg(ssk, msk->maxseg); - inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); - inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); - inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + flags = inet_sk(ssk)->inet_flags; + flags &= ~MPTCP_INET_FLAGS_MASK; + flags |= inet_sk(sk)->inet_flags & MPTCP_INET_FLAGS_MASK; + WRITE_ONCE(inet_sk(ssk)->inet_flags, flags); WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range)); } -- 2.53.0
Propagate IP_RECVERR/IP_RECVERR_RFC4884 and IPV6_RECVERR/IPV6_RECVERR_RFC4884 from the MPTCP socket to existing and future subflows. The setsockopt path forwards each option to every subflow via mptcp_setsockopt_all_sf(); newly-joining subflows inherit the four RECVERR bits through sync_socket_options() now that MPTCP_INET_FLAGS_MASK covers them. Suggested-by: Paolo Abeni <pabeni@redhat.com> Assisted-by: Codex:gpt-5 Signed-off-by: David Carlier <devnexen@gmail.com> --- net/mptcp/sockopt.c | 97 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 79 insertions(+), 18 deletions(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index XXXXXXX..XXXXXXX 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -XXX,XX +XXX,XX @@ #include <linux/kernel.h> #include <linux/module.h> +#include <net/ip.h> +#include <net/ipv6.h> #include <net/sock.h> #include <net/protocol.h> #include <net/tcp.h> @@ -XXX,XX +XXX,XX @@ #define MPTCP_INET_FLAGS_MASK \ (BIT(INET_FLAGS_TRANSPARENT) | \ BIT(INET_FLAGS_FREEBIND) | \ - BIT(INET_FLAGS_BIND_ADDRESS_NO_PORT)) + BIT(INET_FLAGS_BIND_ADDRESS_NO_PORT) | \ + BIT(INET_FLAGS_RECVERR) | \ + BIT(INET_FLAGS_RECVERR_RFC4884) | \ + BIT(INET_FLAGS_RECVERR6) | \ + BIT(INET_FLAGS_RECVERR6_RFC4884)) static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } +static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level, + int optname, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + int ret = 0; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ret = tcp_setsockopt(ssk, level, optname, optval, optlen); + if (ret) + break; + } + return ret; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int mptcp_setsockopt_v6_recverr(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = 
(struct sock *)msk; + int ret; + + ret = ipv6_setsockopt(sk, SOL_IPV6, optname, optval, optlen); + if (ret) + return ret; + + lock_sock(sk); + ret = mptcp_setsockopt_all_sf(msk, SOL_IPV6, optname, optval, optlen); + release_sock(sk); + return ret; +} +#endif + static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, release_sock(sk); break; +#if IS_ENABLED(CONFIG_IPV6) + case IPV6_RECVERR: + case IPV6_RECVERR_RFC4884: + ret = mptcp_setsockopt_v6_recverr(msk, optname, optval, optlen); + break; +#endif } return ret; @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, return 0; } +static int mptcp_setsockopt_v4_recverr(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + int ret; + + ret = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); + if (ret) + return ret; + + lock_sock(sk); + ret = mptcp_setsockopt_all_sf(msk, SOL_IP, optname, optval, optlen); + release_sock(sk); + return ret; +} + static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); + case IP_RECVERR: + case IP_RECVERR_RFC4884: + return mptcp_setsockopt_v4_recverr(msk, optname, optval, optlen); } return -EOPNOTSUPP; @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int return ret; } -static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level, - int optname, sockptr_t optval, - unsigned int optlen) -{ - struct mptcp_subflow_context *subflow; - int ret = 0; - - mptcp_for_each_subflow(msk, subflow) { - struct sock 
*ssk = mptcp_subflow_tcp_sock(subflow); - - ret = tcp_setsockopt(ssk, level, optname, optval, optlen); - if (ret) - break; - } - return ret; -} - static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -XXX,XX +XXX,XX @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, case IP_LOCAL_PORT_RANGE: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->local_port_range)); + case IP_RECVERR: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(RECVERR, sk)); + case IP_RECVERR_RFC4884: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(RECVERR_RFC4884, sk)); } return -EOPNOTSUPP; @@ -XXX,XX +XXX,XX @@ static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname, case IPV6_FREEBIND: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(FREEBIND, sk)); + case IPV6_RECVERR: + return mptcp_put_int_option(msk, optval, optlen, + inet6_test_bit(RECVERR6, sk)); + case IPV6_RECVERR_RFC4884: + return mptcp_put_int_option(msk, optval, optlen, + inet6_test_bit(RECVERR6_RFC4884, sk)); } return -EOPNOTSUPP; -- 2.53.0
Splice pending err skbs from each subflow's error queue onto the parent msk's error queue at error-report time, so poll() and recvmsg(MSG_ERRQUEUE) on the parent socket observe ICMP, tx timestamp, and zerocopy completion notifications through the standard inet ABI. If sock_queue_err_skb() on the parent fails (rmem-limited), the skb is left on the subflow queue and retried on the next error report, avoiding silent loss. Suggested-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: David Carlier <devnexen@gmail.com> --- net/mptcp/protocol.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index XXXXXXX..XXXXXXX 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -XXX,XX +XXX,XX @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) return moved; } +static bool __mptcp_subflow_splice_errqueue(struct sock *sk, struct sock *ssk) +{ + struct sk_buff *skb; + bool moved = false; + + while ((skb = skb_dequeue(&ssk->sk_error_queue))) { + if (sock_queue_err_skb(sk, skb)) { + skb_queue_head(&ssk->sk_error_queue, skb); + break; + } + moved = true; + } + + return moved; +} + static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) { int ssk_state; + bool report; int err; + report = __mptcp_subflow_splice_errqueue(sk, ssk); + /* only propagate errors on fallen-back sockets or * on MPC connect */ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) - return false; + goto out; err = sock_error(ssk); if (!err) - return false; - + goto out; /* We need to propagate only transition to CLOSE state. 
* Orphaned socket will see such state change via * subflow_sched_work_if_closed() and that path will properly @@ -XXX,XX +XXX,XX @@ static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) mptcp_set_state(sk, ssk_state); WRITE_ONCE(sk->sk_err, -err); + report = true; + +out: + if (!report) + return false; /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); @@ -XXX,XX +XXX,XX @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int target; long timeo; - /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len); @@ -XXX,XX +XXX,XX @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); - if (READ_ONCE(sk->sk_err)) + if (READ_ONCE(sk->sk_err) || + !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; -- 2.53.0
Exercise setsockopt/getsockopt of IP_RECVERR and IPV6_RECVERR on the MPTCP parent socket, including the empty-errqueue EAGAIN contract on MSG_ERRQUEUE|MSG_DONTWAIT. End-to-end errqueue delivery (ICMP, TX timestamps, zerocopy) depends on subflow-side producers that are out of scope for this series and will be covered by follow-up work. Assisted-by: Codex:gpt-5 Signed-off-by: David Carlier <devnexen@gmail.com> --- .../selftests/net/mptcp/mptcp_sockopt.c | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index XXXXXXX..XXXXXXX 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -XXX,XX +XXX,XX @@ static void test_ip_tos_sockopt(int fd) xerror("expect socklen_t == -1"); } +static void test_ip_recverr_sockopt(int fd) +{ + struct iovec iov = { + .iov_base = &(char){ 0 }, + .iov_len = 1, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + int one = 1, zero = 0, val = -1; + socklen_t s = sizeof(val); + int level, optname, r; + + switch (pf) { + case AF_INET: + level = SOL_IP; + optname = IP_RECVERR; + break; + case AF_INET6: + level = SOL_IPV6; + optname = IPV6_RECVERR; + break; + default: + xerror("Unknown pf %d\n", pf); + } + + r = setsockopt(fd, level, optname, &one, sizeof(one)); + if (r) + die_perror("setsockopt recverr on"); + + r = getsockopt(fd, level, optname, &val, &s); + if (r) + die_perror("getsockopt recverr on"); + if (s != sizeof(val) || val != one) + xerror("recverr on mismatch val=%d len=%u", val, s); + + r = recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT); + if (r != -1 || errno != EAGAIN) + xerror("expected empty errqueue to return EAGAIN, ret=%d errno=%d", r, errno); + + r = setsockopt(fd, level, optname, &zero, sizeof(zero)); + if (r) + die_perror("setsockopt recverr off"); + + val = -1; + s = sizeof(val); + r = getsockopt(fd, level, optname, 
&val, &s); + if (r) + die_perror("getsockopt recverr off"); + if (s != sizeof(val) || val != zero) + xerror("recverr off mismatch val=%d len=%u", val, s); +} + static int client(int pipefd) { int fd = -1; @@ -XXX,XX +XXX,XX @@ static int client(int pipefd) } test_ip_tos_sockopt(fd); + test_ip_recverr_sockopt(fd); connect_one_server(fd, pipefd); -- 2.53.0