:p
atchew
Login
MPTCP already advertises IP_RECVERR/IPV6_RECVERR as supported, but the parent socket does not currently provide usable MSG_ERRQUEUE handling. This series wires the MPTCP socket up to the IPv4/IPv6 error queue paths. It propagates RECVERR-related sockopts to existing and future subflows, makes poll() report pending errqueue activity through the parent socket, and allows recvmsg(MSG_ERRQUEUE) on the MPTCP socket to consume queued errors with the parent socket ABI. The series also handles mixed-family subflows by applying the matching sockopt according to each subflow family, and avoids silently losing an error skb if requeueing to the parent socket fails under rmem pressure. v1 -> v2: - Retargeted to mptcp-next per Matthieu Baerts' feedback (net-next closed during the merge window; iterate on the MPTCP tree). - Guard mptcp_setsockopt_v6_recverr() and its dispatch cases in mptcp_setsockopt_v6() with #if IS_ENABLED(CONFIG_IPV6) to fix the MPTCP CI link break on without_ipv6/with_mptcp configs (undefined reference to ipv6_setsockopt). v1: https://lore.kernel.org/mptcp/20260421152216.38127-1-devnexen@gmail.com/ David Carlier (3): mptcp: propagate RECVERR sockopts to subflows mptcp: support MSG_ERRQUEUE on the parent socket selftests: mptcp: cover RECVERR and MSG_ERRQUEUE net/mptcp/protocol.c | 121 +++++++++++--- net/mptcp/sockopt.c | 129 +++++++++++++++ .../selftests/net/mptcp/mptcp_sockopt.c | 152 ++++++++++++++++++ 3 files changed, 384 insertions(+), 18 deletions(-) base-commit: 4464afe97dc56e817a23b730979cbc6fc48f1912 -- 2.53.0
Propagate IP_RECVERR/IP_RECVERR_RFC4884 and IPV6_RECVERR/IPV6_RECVERR_RFC4884 from the MPTCP socket to existing and future subflows. Apply the matching sockopt according to the subflow family so mixed-family subflows stay aligned with the parent socket configuration, including disable-time errqueue purge semantics. Signed-off-by: David Carlier <devnexen@gmail.com> Assisted-by: Codex:gpt-5 Signed-off-by: David Carlier <devnexen@gmail.com> --- v1 -> v2: - Retargeted to mptcp-next per Matthieu Baerts' feedback (net-next closed during the merge window; iterate on the MPTCP tree). - Guard mptcp_setsockopt_v6_recverr() and its dispatch cases in mptcp_setsockopt_v6() with #if IS_ENABLED(CONFIG_IPV6) to fix the MPTCP CI link break on without_ipv6/with_mptcp configs (undefined reference to ipv6_setsockopt). v1: https://lore.kernel.org/mptcp/20260421152216.38127-1-devnexen@gmail.com/ net/mptcp/sockopt.c | 129 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index XXXXXXX..XXXXXXX 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -XXX,XX +XXX,XX @@ #include <linux/kernel.h> #include <linux/module.h> +#include <net/ip.h> +#include <net/ipv6.h> #include <net/sock.h> #include <net/protocol.h> #include <net/tcp.h> @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } +static bool mptcp_recverr_enabled(const struct sock *sk, bool rfc4884) +{ + bool enabled; + + enabled = rfc4884 ? inet_test_bit(RECVERR_RFC4884, sk) : + inet_test_bit(RECVERR, sk); + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + enabled |= rfc4884 ? 
inet6_test_bit(RECVERR6_RFC4884, sk) : + inet6_test_bit(RECVERR6, sk); +#endif + + return enabled; +} + +static int mptcp_subflow_set_recverr(struct sock *sk, struct sock *ssk, + bool rfc4884) +{ + int level, optname, val; + +#if IS_ENABLED(CONFIG_IPV6) + if (ssk->sk_family == AF_INET6) { + level = SOL_IPV6; + optname = rfc4884 ? IPV6_RECVERR_RFC4884 : IPV6_RECVERR; + } else +#endif + { + level = SOL_IP; + optname = rfc4884 ? IP_RECVERR_RFC4884 : IP_RECVERR; + } + + val = mptcp_recverr_enabled(sk, rfc4884); + return tcp_setsockopt(ssk, level, optname, KERNEL_SOCKPTR(&val), + sizeof(val)); +} + +#if IS_ENABLED(CONFIG_IPV6) +static int mptcp_setsockopt_v6_recverr(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int ret; + + ret = ipv6_setsockopt(sk, SOL_IPV6, optname, optval, optlen); + if (ret) + return ret; + + lock_sock(sk); + sockopt_seq_inc(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool rfc4884 = optname == IPV6_RECVERR_RFC4884; + + ret = mptcp_subflow_set_recverr(sk, ssk, rfc4884); + if (ret) + break; + subflow->setsockopt_seq = msk->setsockopt_seq; + } + release_sock(sk); + + return ret; +} +#endif + static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, release_sock(sk); break; +#if IS_ENABLED(CONFIG_IPV6) + case IPV6_RECVERR: + case IPV6_RECVERR_RFC4884: + ret = mptcp_setsockopt_v6_recverr(msk, optname, optval, optlen); + break; +#endif } return ret; @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, return 0; } +static int mptcp_setsockopt_v4_recverr(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct 
sock *)msk; + int err; + + err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); + if (err) + return err; + + lock_sock(sk); + sockopt_seq_inc(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool rfc4884 = optname == IP_RECVERR_RFC4884; + + err = mptcp_subflow_set_recverr(sk, ssk, rfc4884); + if (err) + break; + subflow->setsockopt_seq = msk->setsockopt_seq; + } + release_sock(sk); + + return err; +} + static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); + case IP_RECVERR: + case IP_RECVERR_RFC4884: + return mptcp_setsockopt_v4_recverr(msk, optname, optval, optlen); } return -EOPNOTSUPP; @@ -XXX,XX +XXX,XX @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, case IP_LOCAL_PORT_RANGE: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->local_port_range)); + case IP_RECVERR: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(RECVERR, sk)); + case IP_RECVERR_RFC4884: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(RECVERR_RFC4884, sk)); } return -EOPNOTSUPP; @@ -XXX,XX +XXX,XX @@ static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname, case IPV6_FREEBIND: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(FREEBIND, sk)); + case IPV6_RECVERR: + return mptcp_put_int_option(msk, optval, optlen, + inet6_test_bit(RECVERR6, sk)); + case IPV6_RECVERR_RFC4884: + return mptcp_put_int_option(msk, optval, optlen, + inet6_test_bit(RECVERR6_RFC4884, sk)); } return -EOPNOTSUPP; @@ -XXX,XX +XXX,XX @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | 
SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; + bool recverr, recverr_rfc4884; bool keep_open; keep_open = sock_flag(sk, SOCK_KEEPOPEN); @@ -XXX,XX +XXX,XX @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range)); + recverr = mptcp_recverr_enabled(sk, false); + recverr_rfc4884 = mptcp_recverr_enabled(sk, true); +#if IS_ENABLED(CONFIG_IPV6) + if (ssk->sk_family == AF_INET6) { + inet6_assign_bit(RECVERR6, ssk, recverr); + inet6_assign_bit(RECVERR6_RFC4884, ssk, recverr_rfc4884); + } else +#endif + { + inet_assign_bit(RECVERR, ssk, recverr); + inet_assign_bit(RECVERR_RFC4884, ssk, recverr_rfc4884); + } } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) -- 2.53.0
Handle MSG_ERRQUEUE on the MPTCP socket by selecting a subflow with pending errqueue data, moving one error skb to the parent socket, and consuming it through the parent socket ABI. This surfaces subflow errqueue activity through poll(), keeps the userspace ABI tied to the socket being used, and restores the skb to the subflow errqueue if requeueing to the parent fails under rmem pressure. Signed-off-by: David Carlier <devnexen@gmail.com> Assisted-by: Codex:gpt-5 --- net/mptcp/protocol.c | 121 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 103 insertions(+), 18 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index XXXXXXX..XXXXXXX 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -XXX,XX +XXX,XX @@ static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) { int ssk_state; int err; + bool has_errqueue; - /* only propagate errors on fallen-back sockets or - * on MPC connect - */ - if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) - return false; - + has_errqueue = !skb_queue_empty_lockless(&ssk->sk_error_queue); err = sock_error(ssk); - if (!err) + if (!err && !has_errqueue) return false; - /* We need to propagate only transition to CLOSE state. - * Orphaned socket will see such state change via - * subflow_sched_work_if_closed() and that path will properly - * destroy the msk as needed. + /* Errqueue notifications should wake poll()/recvmsg(MSG_ERRQUEUE) on + * the MPTCP socket, but only fallback sockets and the MPC connect path + * inherit TCP's sk_err semantics. */ - ssk_state = inet_sk_state_load(ssk); - if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) - mptcp_set_state(sk, ssk_state); - WRITE_ONCE(sk->sk_err, -err); + if (err && + (sk->sk_state == TCP_SYN_SENT || __mptcp_check_fallback(mptcp_sk(sk)))) { + /* We need to propagate only transition to CLOSE state. 
+ * Orphaned socket will see such state change via + * subflow_sched_work_if_closed() and that path will properly + * destroy the msk as needed. + */ + ssk_state = inet_sk_state_load(ssk); + if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) + mptcp_set_state(sk, ssk_state); + WRITE_ONCE(sk->sk_err, -err); + } /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); @@ -XXX,XX +XXX,XX @@ static unsigned int mptcp_inq_hint(const struct sock *sk) return 0; } +static struct sock *mptcp_pick_errqueue_subflow(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct sock *ssk = NULL; + + lock_sock(sk); + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + struct sock *subflow_sk = mptcp_subflow_tcp_sock(subflow); + + if (skb_queue_empty_lockless(&subflow_sk->sk_error_queue)) + continue; + + if (!refcount_inc_not_zero(&subflow_sk->sk_refcnt)) + continue; + + ssk = subflow_sk; + break; + } + release_sock(sk); + + return ssk; +} + +static bool mptcp_has_error_queue(const struct sock *sk) +{ + return !skb_queue_empty_lockless(&sk->sk_error_queue); +} + +static int mptcp_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct sk_buff *skb; + struct sock *ssk; + int ret, ret2; + + if (mptcp_has_error_queue(sk)) + return inet_recv_error(sk, msg, len); + + ssk = mptcp_pick_errqueue_subflow(sk); + if (!ssk) + return -EAGAIN; + + skb = sock_dequeue_err_skb(ssk); + if (!skb) + goto put_ssk; + + ret = sock_queue_err_skb(sk, skb); + if (ret) { + ret2 = sock_queue_err_skb(ssk, skb); + sock_put(ssk); + if (ret2) + kfree_skb(skb); + return ret; + } + + sock_put(ssk); + return inet_recv_error(sk, msg, len); + +put_ssk: + sock_put(ssk); + return -EAGAIN; +} + static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { @@ -XXX,XX +XXX,XX @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int target; long timeo; - /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ if 
(unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len); + return mptcp_recv_error(sk, msg, len); lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { @@ -XXX,XX +XXX,XX @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) return 0; } +static bool mptcp_subflow_has_error(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + bool has_error = false; + + mptcp_data_lock(sk); + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (READ_ONCE(ssk->sk_err) || + !skb_queue_empty_lockless(&ssk->sk_error_queue)) { + has_error = true; + break; + } + } + mptcp_data_unlock(sk); + + return has_error; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -XXX,XX +XXX,XX @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); - if (READ_ONCE(sk->sk_err)) + if (READ_ONCE(sk->sk_err) || mptcp_has_error_queue(sk) || + mptcp_subflow_has_error(sk)) mask |= EPOLLERR; return mask; -- 2.53.0
Add MPTCP selftest coverage for RECVERR sockopt round-trips and parent-socket MSG_ERRQUEUE delivery. Enable TX software timestamping, send data over an MPTCP socket, wait for POLLERR, and verify that recvmsg(MSG_ERRQUEUE) returns timestamping metadata on the MPTCP parent socket. Signed-off-by: David Carlier <devnexen@gmail.com> Assisted-by: Codex:gpt-5 --- .../selftests/net/mptcp/mptcp_sockopt.c | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index XXXXXXX..XXXXXXX 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -XXX,XX +XXX,XX @@ #include <linux/tcp.h> #include <linux/compiler.h> +#include <linux/errqueue.h> +#include <linux/net_tstamp.h> + +#include <poll.h> static int pf = AF_INET; +#ifndef SCM_TIMESTAMPING +#define SCM_TIMESTAMPING SO_TIMESTAMPING +#endif + #ifndef IPPROTO_MPTCP #define IPPROTO_MPTCP 262 #endif @@ -XXX,XX +XXX,XX @@ struct so_state { #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) #endif +static void enable_tx_timestamping(int fd); +static void test_msg_errqueue_timestamping(int fd); + static void __noreturn die_perror(const char *msg) { perror(msg); @@ -XXX,XX +XXX,XX @@ static void connect_one_server(int fd, int pipefd) assert(strncmp(buf2, "xmit", 4) == 0); + enable_tx_timestamping(fd); + ret = write(fd, buf, len); if (ret < 0) die_perror("write"); @@ -XXX,XX +XXX,XX @@ static void connect_one_server(int fd, int pipefd) if (ret != (ssize_t)len) xerror("short write"); + test_msg_errqueue_timestamping(fd); + total = 0; do { ret = read(fd, buf2 + total, sizeof(buf2) - total); @@ -XXX,XX +XXX,XX @@ static void test_ip_tos_sockopt(int fd) xerror("expect socklen_t == -1"); } +static void test_ip_recverr_sockopt(int fd) +{ + struct iovec iov = { + .iov_base = &(char){ 0 }, + .iov_len = 1, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + int one = 1, zero = 0, val = -1; + socklen_t s = sizeof(val); + int level, optname, r; + + switch (pf) { + case AF_INET: + level = SOL_IP; + optname = IP_RECVERR; + break; + case AF_INET6: + level = SOL_IPV6; + optname = IPV6_RECVERR; + break; + default: + xerror("Unknown pf %d\n", pf); + } + + r = setsockopt(fd, level, optname, &one, sizeof(one)); + if (r) + die_perror("setsockopt recverr on"); + + r = getsockopt(fd, level, optname, &val, &s); + if (r) + die_perror("getsockopt recverr on"); + if (s != sizeof(val) || val != one) + xerror("recverr on mismatch val=%d len=%u", val, s); + + r = recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT); + if (r != -1 || errno != EAGAIN) + xerror("expected empty errqueue to return EAGAIN, ret=%d errno=%d", r, errno); + + r = setsockopt(fd, level, optname, &zero, sizeof(zero)); + if (r) + die_perror("setsockopt recverr off"); + + val = -1; + s = sizeof(val); + r = getsockopt(fd, level, optname, &val, &s); + if (r) + die_perror("getsockopt recverr off"); + if (s != sizeof(val) || val != zero) + xerror("recverr off mismatch val=%d len=%u", val, 
s); +} + +static void enable_tx_timestamping(int fd) +{ + int val = SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_OPT_TSONLY; + int ret; + + ret = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING_OLD, + &val, sizeof(val)); + if (ret) + die_perror("setsockopt SO_TIMESTAMPING"); +} + +static void test_msg_errqueue_timestamping(int fd) +{ + char ctrl[512] = { 0 }; + char data[32] = { 0 }; + struct iovec iov = { + .iov_base = data, + .iov_len = sizeof(data), + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = ctrl, + .msg_controllen = sizeof(ctrl), + }; + struct pollfd pfd = { + .fd = fd, + .events = POLLERR, + }; + struct cmsghdr *cm; + struct scm_timestamping *tss = NULL; + struct sock_extended_err *serr = NULL; + int ret, i; + + for (i = 0; i < 10; i++) { + ret = poll(&pfd, 1, 1000); + if (ret < 0) + die_perror("poll errqueue"); + if (ret == 0) + continue; + if (!(pfd.revents & POLLERR)) + xerror("expected POLLERR, got revents %#x", pfd.revents); + break; + } + + if (i == 10) + xerror("timed out waiting for MSG_ERRQUEUE event"); + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + if (ret < 0) + die_perror("recvmsg timestamping errqueue"); + if (!(msg.msg_flags & MSG_ERRQUEUE)) + xerror("expected MSG_ERRQUEUE in msg_flags, got %#x", + msg.msg_flags); + + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + if (cm->cmsg_level == SOL_SOCKET && + cm->cmsg_type == SCM_TIMESTAMPING) + tss = (void *)CMSG_DATA(cm); + if ((cm->cmsg_level == SOL_IP && + cm->cmsg_type == IP_RECVERR) || + (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type == IPV6_RECVERR)) + serr = (void *)CMSG_DATA(cm); + } + + if (!tss) + xerror("missing SCM_TIMESTAMPING cmsg"); + if (!serr) + xerror("missing sock_extended_err cmsg"); + if (serr->ee_errno != ENOMSG || + serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING) + xerror("unexpected timestamping err ee_errno=%u ee_origin=%u", + serr->ee_errno, serr->ee_origin); + if (!tss->ts[0].tv_sec && 
!tss->ts[0].tv_nsec && + !tss->ts[1].tv_sec && !tss->ts[1].tv_nsec && + !tss->ts[2].tv_sec && !tss->ts[2].tv_nsec) + xerror("all timestamp slots are zero"); +} + static int client(int pipefd) { int fd = -1; @@ -XXX,XX +XXX,XX @@ static int client(int pipefd) } test_ip_tos_sockopt(fd); + test_ip_recverr_sockopt(fd); connect_one_server(fd, pipefd); -- 2.53.0
MPTCP already advertises IP_RECVERR/IPV6_RECVERR as supported, but the parent socket does not currently provide usable MSG_ERRQUEUE handling. This series wires the MPTCP socket up to the IPv4/IPv6 error queue paths. It propagates RECVERR-related sockopts to existing and future subflows, and lets recvmsg(MSG_ERRQUEUE) on the parent socket consume queued TX-timestamp and MSG_ZEROCOPY completion notifications. A prerequisite patch factors the per-flag inet_flags propagation in sync_socket_options() into a mask-driven loop, so further inet_flags propagated by MPTCP can be added by extending the mask rather than touching the call site. Patch 2 leverages mptcp_setsockopt_all_sf() for the setsockopt path and extends MPTCP_INET_FLAGS_MASK with the four RECVERR bits, with a single mptcp_setsockopt_recverr() helper covering both families. Patch 3 splices subflow err skbs onto the parent at error-report time, filtering by SO_EE_ORIGIN so user-data cmsgs (TIMESTAMPING, ZEROCOPY, LOCAL) reach the parent socket while subflow-level ICMP errors are dropped to avoid leaking subflow identity through the single-path RECVERR ABI. A future MPTCP_RECERR channel is the right home for those events along with the per-fd subflow lifecycle events tracked by [1]. Changes in v5: - 1/4: replace the WRITE_ONCE() RMW with a per-bit assign_bit() loop so the per-bit atomicity of the original inet_assign_bit() calls is preserved (Sashiko). - 2/4: collapse the family-specific helpers into one mptcp_setsockopt_recverr() that snapshots optval into a local int, bumps msk->setsockopt_seq, and forwards via mptcp_setsockopt_all_sf() (Matthieu, Sashiko); skip family-mismatched subflows in mptcp_setsockopt_all_sf() (Sashiko). - 3/4: filter the splice by SO_EE_ORIGIN to forward TIMESTAMPING / ZEROCOPY / LOCAL only and drop ICMP / ICMPv6 (Matthieu, Paolo); add mptcp_recv_error() to retry the splice on the pull side so a parent-side rmem-ENOMEM does not strand subflow skbs (Sashiko). - 4/4: unchanged. 
[1] https://github.com/multipath-tcp/mptcp_net-next/issues/78 David Carlier (4): mptcp: sockopt: factor inet_flags propagation into a mask mptcp: propagate RECVERR sockopts to subflows mptcp: support MSG_ERRQUEUE on the parent socket selftests: mptcp: cover IP_RECVERR sockopt propagation net/mptcp/protocol.c | 66 ++++++++++- net/mptcp/sockopt.c | 108 ++++++++++++++---- .../selftests/net/mptcp/mptcp_sockopt.c | 55 +++++++++ 3 files changed, 203 insertions(+), 26 deletions(-) -- 2.53.0
Introduce MPTCP_INET_FLAGS_MASK and replace the per-flag inet_assign_bit() calls in sync_socket_options() with a loop driven by the mask that calls assign_bit() per set bit, preserving the per-bit atomicity of the original. Further flags propagated by MPTCP can be added by extending the mask rather than touching the call site. No functional change. Suggested-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: David Carlier <devnexen@gmail.com> --- net/mptcp/sockopt.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index XXXXXXX..XXXXXXX 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -XXX,XX +XXX,XX @@ #define MIN_INFO_OPTLEN_SIZE 16 #define MIN_FULL_INFO_OPTLEN_SIZE 40 +#define MPTCP_INET_FLAGS_MASK \ + (BIT(INET_FLAGS_TRANSPARENT) | \ + BIT(INET_FLAGS_FREEBIND) | \ + BIT(INET_FLAGS_BIND_ADDRESS_NO_PORT)) static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { @@ -XXX,XX +XXX,XX @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; + unsigned long mask = MPTCP_INET_FLAGS_MASK; + unsigned long src; + int b; bool keep_open; keep_open = sock_flag(sk, SOCK_KEEPOPEN); @@ -XXX,XX +XXX,XX @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt); tcp_sock_set_maxseg(ssk, msk->maxseg); - inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); - inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); - inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + src = READ_ONCE(inet_sk(sk)->inet_flags); + + for_each_set_bit(b, &mask, BITS_PER_LONG) + assign_bit(b, &inet_sk(ssk)->inet_flags, src & BIT(b)); + WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range)); } -- 2.53.0
Propagate IP_RECVERR/IP_RECVERR_RFC4884 and IPV6_RECVERR/IPV6_RECVERR_RFC4884 from the MPTCP socket to existing and future subflows. mptcp_setsockopt_recverr() snapshots optval into a local int, applies it to the parent socket via ip_setsockopt() / ipv6_setsockopt(), bumps msk->setsockopt_seq, and forwards to every subflow via mptcp_setsockopt_all_sf(). Newly-joining subflows pick up the four RECVERR bits through sync_socket_options() now that MPTCP_INET_FLAGS_MASK covers them. mptcp_setsockopt_all_sf() skips IPv4 subflows when called with SOL_IPV6 to avoid the -ENOPROTOOPT that ip_setsockopt() returns on level mismatch in AF_INET6 msks carrying IPv4 subflows. Suggested-by: Paolo Abeni <pabeni@redhat.com> Assisted-by: Codex:gpt-5 Signed-off-by: David Carlier <devnexen@gmail.com> --- net/mptcp/sockopt.c | 95 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 77 insertions(+), 18 deletions(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index XXXXXXX..XXXXXXX 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -XXX,XX +XXX,XX @@ #include <linux/kernel.h> #include <linux/module.h> +#include <net/ip.h> +#include <net/ipv6.h> #include <net/sock.h> #include <net/protocol.h> #include <net/tcp.h> @@ -XXX,XX +XXX,XX @@ #define MPTCP_INET_FLAGS_MASK \ (BIT(INET_FLAGS_TRANSPARENT) | \ BIT(INET_FLAGS_FREEBIND) | \ - BIT(INET_FLAGS_BIND_ADDRESS_NO_PORT)) + BIT(INET_FLAGS_BIND_ADDRESS_NO_PORT) | \ + BIT(INET_FLAGS_RECVERR) | \ + BIT(INET_FLAGS_RECVERR_RFC4884) | \ + BIT(INET_FLAGS_RECVERR6) | \ + BIT(INET_FLAGS_RECVERR6_RFC4884)) static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } +static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level, + int optname, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + int ret = 0; + + mptcp_for_each_subflow(msk, subflow) { + struct 
sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (level == SOL_IPV6 && ssk->sk_family != AF_INET6) + continue; + + ret = tcp_setsockopt(ssk, level, optname, optval, optlen); + if (ret) + break; + } + return ret; +} + +static int mptcp_setsockopt_recverr(struct mptcp_sock *msk, int level, + int optname, sockptr_t optval, + unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + int val, ret; + + if (optlen < sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + + if (level == SOL_IP) + ret = ip_setsockopt(sk, level, optname, KERNEL_SOCKPTR(&val), sizeof(val)); + +#if IS_ENABLED(CONFIG_IPV6) + else if (level == SOL_IPV6) + ret = ipv6_setsockopt(sk, level, optname, KERNEL_SOCKPTR(&val), sizeof(val)); +#endif + else + return -EOPNOTSUPP; + if (ret) + return ret; + + lock_sock(sk); + sockopt_seq_inc(msk); + ret = mptcp_setsockopt_all_sf(msk, level, optname, KERNEL_SOCKPTR(&val), sizeof(val)); + release_sock(sk); + return ret; +} + static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, release_sock(sk); break; + case IPV6_RECVERR: + case IPV6_RECVERR_RFC4884: + ret = mptcp_setsockopt_recverr(msk, SOL_IPV6, optname, optval, optlen); + break; } return ret; @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); + case IP_RECVERR: + case IP_RECVERR_RFC4884: + return mptcp_setsockopt_recverr(msk, SOL_IP, optname, optval, optlen); } return -EOPNOTSUPP; @@ -XXX,XX +XXX,XX @@ static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int return ret; } -static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level, - int optname, sockptr_t optval, - unsigned int optlen) -{ - 
struct mptcp_subflow_context *subflow; - int ret = 0; - - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - ret = tcp_setsockopt(ssk, level, optname, optval, optlen); - if (ret) - break; - } - return ret; -} - static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -XXX,XX +XXX,XX @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, case IP_LOCAL_PORT_RANGE: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->local_port_range)); + case IP_RECVERR: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(RECVERR, sk)); + case IP_RECVERR_RFC4884: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(RECVERR_RFC4884, sk)); } return -EOPNOTSUPP; @@ -XXX,XX +XXX,XX @@ static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname, case IPV6_FREEBIND: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(FREEBIND, sk)); + case IPV6_RECVERR: + return mptcp_put_int_option(msk, optval, optlen, + inet6_test_bit(RECVERR6, sk)); + case IPV6_RECVERR_RFC4884: + return mptcp_put_int_option(msk, optval, optlen, + inet6_test_bit(RECVERR6_RFC4884, sk)); } return -EOPNOTSUPP; -- 2.53.0
Splice pending err skbs from each subflow's error queue onto the parent msk's error queue at error-report time, so poll() and recvmsg(MSG_ERRQUEUE) on the parent socket observe TX timestamps and MSG_ZEROCOPY completion notifications through the standard inet ABI. The splice filters by SO_EE_ORIGIN: TIMESTAMPING / ZEROCOPY / LOCAL events forward to the parent because they are tied to user-handed data, not to a specific path; subflow-level ICMP errors are dropped because the legacy RECVERR ABI cannot meaningfully convey their per-subflow peer identity to single-path-aware userspace. Such events will be carried by a future MPTCP_RECERR channel. mptcp_recv_error() retries the splice on the pull side: if sock_queue_err_skb() previously failed under rmem pressure, the skb stays on the subflow queue, and the next recvmsg(MSG_ERRQUEUE) splices it once the parent's queue has been drained. Suggested-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: David Carlier <devnexen@gmail.com> --- net/mptcp/protocol.c | 66 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index XXXXXXX..XXXXXXX 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -XXX,XX +XXX,XX @@ #include <linux/netdevice.h> #include <linux/sched/signal.h> #include <linux/atomic.h> +#include <linux/errqueue.h> #include <net/aligned_data.h> #include <net/rps.h> #include <net/sock.h> @@ -XXX,XX +XXX,XX @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) return moved; } +static bool mptcp_errqueue_skb_forwardable(const struct sk_buff *skb) +{ + u8 origin = SKB_EXT_ERR(skb)->ee.ee_origin; + + return origin == SO_EE_ORIGIN_TIMESTAMPING || + origin == SO_EE_ORIGIN_ZEROCOPY || + origin == SO_EE_ORIGIN_LOCAL; +} + +static bool __mptcp_subflow_splice_errqueue(struct sock *sk, struct sock *ssk) +{ + struct sk_buff *skb; + bool moved = false; + + while ((skb = skb_dequeue(&ssk->sk_error_queue))) { + if 
(!mptcp_errqueue_skb_forwardable(skb)) { + kfree_skb(skb); /* path-specific (ICMP) — belongs in MPTCP_RECERR */ + continue; + } + if (sock_queue_err_skb(sk, skb)) { + skb_queue_head(&ssk->sk_error_queue, skb); + break; + } + moved = true; + } + + return moved; +} + static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) { int ssk_state; + bool report; int err; + report = __mptcp_subflow_splice_errqueue(sk, ssk); + /* only propagate errors on fallen-back sockets or * on MPC connect */ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) - return false; + goto out; err = sock_error(ssk); if (!err) - return false; - + goto out; /* We need to propagate only transition to CLOSE state. * Orphaned socket will see such state change via * subflow_sched_work_if_closed() and that path will properly @@ -XXX,XX +XXX,XX @@ static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) mptcp_set_state(sk, ssk_state); WRITE_ONCE(sk->sk_err, -err); + report = true; + +out: + if (!report) + return false; /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); @@ -XXX,XX +XXX,XX @@ static unsigned int mptcp_inq_hint(const struct sock *sk) return 0; } +static int mptcp_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; + + lock_sock(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!skb_queue_empty(&ssk->sk_error_queue)) + __mptcp_subflow_splice_errqueue(sk, ssk); + } + release_sock(sk); + + return inet_recv_error(sk, msg, len); +} + static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { @@ -XXX,XX +XXX,XX @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int target; long timeo; - /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ if 
(unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len); + return mptcp_recv_error(sk, msg, len); lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { @@ -XXX,XX +XXX,XX @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); - if (READ_ONCE(sk->sk_err)) + if (READ_ONCE(sk->sk_err) || + !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; -- 2.53.0
Exercise setsockopt/getsockopt of IP_RECVERR and IPV6_RECVERR on the MPTCP parent socket, including the empty-errqueue EAGAIN contract on MSG_ERRQUEUE|MSG_DONTWAIT. End-to-end errqueue delivery (ICMP, TX timestamps, zerocopy) depends on subflow-side producers that are out of scope for this series and will be covered by follow-up work. Assisted-by: Codex:gpt-5 Signed-off-by: David Carlier <devnexen@gmail.com> --- .../selftests/net/mptcp/mptcp_sockopt.c | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index XXXXXXX..XXXXXXX 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -XXX,XX +XXX,XX @@ static void test_ip_tos_sockopt(int fd) xerror("expect socklen_t == -1"); } +static void test_ip_recverr_sockopt(int fd) +{ + struct iovec iov = { + .iov_base = &(char){ 0 }, + .iov_len = 1, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + int one = 1, zero = 0, val = -1; + socklen_t s = sizeof(val); + int level, optname, r; + + switch (pf) { + case AF_INET: + level = SOL_IP; + optname = IP_RECVERR; + break; + case AF_INET6: + level = SOL_IPV6; + optname = IPV6_RECVERR; + break; + default: + xerror("Unknown pf %d\n", pf); + } + + r = setsockopt(fd, level, optname, &one, sizeof(one)); + if (r) + die_perror("setsockopt recverr on"); + + r = getsockopt(fd, level, optname, &val, &s); + if (r) + die_perror("getsockopt recverr on"); + if (s != sizeof(val) || val != one) + xerror("recverr on mismatch val=%d len=%u", val, s); + + r = recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT); + if (r != -1 || errno != EAGAIN) + xerror("expected empty errqueue to return EAGAIN, ret=%d errno=%d", r, errno); + + r = setsockopt(fd, level, optname, &zero, sizeof(zero)); + if (r) + die_perror("setsockopt recverr off"); + + val = -1; + s = sizeof(val); + r = getsockopt(fd, level, optname, 
&val, &s); + if (r) + die_perror("getsockopt recverr off"); + if (s != sizeof(val) || val != zero) + xerror("recverr off mismatch val=%d len=%u", val, s); +} + static int client(int pipefd) { int fd = -1; @@ -XXX,XX +XXX,XX @@ static int client(int pipefd) } test_ip_tos_sockopt(fd); + test_ip_recverr_sockopt(fd); connect_one_server(fd, pipefd); -- 2.53.0