Some user-space applications want to monitor the subflows utilization.
Dumping the per subflow tcp_info is not enough, as the PM could close
and re-create the subflows under-the-hood, fooling the accounting.
Even checking the src/dst addresses used by each subflow could not
be enough, because new subflows could re-use the same address/port of
the just closed one.
This patch introduces a new socket option, allow dumping all the relevant
information all-at-once (everything, everywhere...), in a consistent manner.
To reuse the existing helper to manipulate the new struct, keep the binary
layout of the initial few fields the same as mptcp_subflow_data.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v3 -> v4:
- full_info struct re-design (Florian)
v2 -> v3:
- added missing changelog (oops)
---
include/uapi/linux/mptcp.h | 25 ++++++
net/mptcp/sockopt.c | 165 +++++++++++++++++++++++++++++++++----
2 files changed, 174 insertions(+), 16 deletions(-)
diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 32af2d278cb4..37c46cf05795 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -12,6 +12,7 @@
#include <linux/in.h> /* for sockaddr_in */
#include <linux/in6.h> /* for sockaddr_in6 */
#include <linux/socket.h> /* for sockaddr_storage and sa_family */
+#include <linux/tcp.h> /* for tcp_info */
#define MPTCP_SUBFLOW_FLAG_MCAP_REM _BITUL(0)
#define MPTCP_SUBFLOW_FLAG_MCAP_LOC _BITUL(1)
@@ -244,9 +245,33 @@ struct mptcp_subflow_addrs {
};
};
+struct mptcp_subflow_info {
+ __u32 id;
+ struct mptcp_subflow_addrs addrs;
+};
+
+struct mptcp_subflow_full_info {
+ __u32 size_subflow_full_info; /* size of this structure in userspace */
+ __u32 num_subflows_kern; /* must be 0, set by kernel (real subflow count) */
+ __u32 size_tcpinfo_kernel; /* must be 0, set by kernel */
+ __u32 size_tcpinfo_user;
+ __u32 size_sfinfo_kernel; /* must be 0, set by kernel */
+ __u32 size_sfinfo_user;
+ __u32 num_subflows_user; /* max subflows that userspace is interested in;
+ * the buffers at subflow_info_addr/tcp_info_addr
+ * are respectively at least:
+ * num_subflows_user * size_sfinfo_user
+ * num_subflows_user * size_tcpinfo_user
+ * bytes wide
+ */
+ __aligned_u64 subflow_info_addr;
+ __aligned_u64 tcp_info_addr;
+} __attribute__((aligned(8)));
+
/* MPTCP socket options */
#define MPTCP_INFO 1
#define MPTCP_TCPINFO 2
#define MPTCP_SUBFLOW_ADDRS 3
+#define MPTCP_FULL_INFO 4
#endif /* _UAPI_MPTCP_H */
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index d4258869ac48..59a174ee1d54 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -14,7 +14,8 @@
#include <net/mptcp.h>
#include "protocol.h"
-#define MIN_INFO_OPTLEN_SIZE 16
+#define MIN_INFO_OPTLEN_SIZE 16
+#define MIN_FULL_INFO_OPTLEN_SIZE 48
static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
@@ -943,12 +944,13 @@ static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, in
return 0;
}
-static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd,
- char __user *optval,
- u32 copied,
- int __user *optlen)
+static int __mptcp_put_subflow_data(struct mptcp_subflow_data *sfd,
+ int size_subflow_data_kern,
+ char __user *optval,
+ u32 copied,
+ int __user *optlen)
{
- u32 copylen = min_t(u32, sfd->size_subflow_data, sizeof(*sfd));
+ u32 copylen = min_t(u32, sfd->size_subflow_data, size_subflow_data_kern);
if (copied)
copied += sfd->size_subflow_data;
@@ -964,25 +966,30 @@ static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd,
return 0;
}
-static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd,
- char __user *optval, int __user *optlen)
+static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd,
+ char __user *optval,
+ u32 copied,
+ int __user *optlen)
+{
+ return __mptcp_put_subflow_data(sfd, sizeof(*sfd), optval, copied, optlen);
+}
+
+static int __mptcp_get_subflow_data(struct mptcp_subflow_data *sfd,
+ int min_info_optlen_size,
+ char __user *optval,
+ int __user *optlen)
{
int len, copylen;
if (get_user(len, optlen))
return -EFAULT;
- /* if mptcp_subflow_data size is changed, need to adjust
- * this function to deal with programs using old version.
- */
- BUILD_BUG_ON(sizeof(*sfd) != MIN_INFO_OPTLEN_SIZE);
-
- if (len < MIN_INFO_OPTLEN_SIZE)
+ if (len < min_info_optlen_size)
return -EINVAL;
memset(sfd, 0, sizeof(*sfd));
- copylen = min_t(unsigned int, len, sizeof(*sfd));
+ copylen = min_t(unsigned int, len, min_info_optlen_size);
if (copy_from_user(sfd, optval, copylen))
return -EFAULT;
@@ -991,7 +998,7 @@ static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd,
sfd->size_user > INT_MAX)
return -EINVAL;
- if (sfd->size_subflow_data < MIN_INFO_OPTLEN_SIZE ||
+ if (sfd->size_subflow_data < min_info_optlen_size ||
sfd->size_subflow_data > len)
return -EINVAL;
@@ -1001,6 +1008,19 @@ static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd,
return len - sfd->size_subflow_data;
}
+static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd,
+ char __user *optval,
+ int __user *optlen)
+{
+ /* if mptcp_subflow_data size is changed, need to adjust
+ * this function to deal with programs using old version.
+ */
+ BUILD_BUG_ON(sizeof(*sfd) != MIN_INFO_OPTLEN_SIZE);
+
+ return __mptcp_get_subflow_data(sfd, MIN_INFO_OPTLEN_SIZE,
+ optval, optlen);
+}
+
static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval,
int __user *optlen)
{
@@ -1146,6 +1166,117 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o
return 0;
}
+static int mptcp_get_full_subflow_info(struct mptcp_subflow_full_info *sffi,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct mptcp_subflow_data *sfd = (struct mptcp_subflow_data *)sffi;
+ int len;
+
+ BUILD_BUG_ON(sizeof(*sffi) != MIN_FULL_INFO_OPTLEN_SIZE);
+
+ len = __mptcp_get_subflow_data(sfd, MIN_FULL_INFO_OPTLEN_SIZE,
+ optval, optlen);
+ if (len < 0)
+ return len;
+
+ if (sffi->size_tcpinfo_kernel)
+ return -EINVAL;
+
+ if (sffi->size_sfinfo_user > INT_MAX)
+ return -EINVAL;
+
+ return len;
+}
+
+static int mptcp_put_subflow_full_info(struct mptcp_subflow_full_info *sffi,
+ char __user *optval,
+ u32 copied,
+ int __user *optlen)
+{
+ struct mptcp_subflow_data *sfd = (struct mptcp_subflow_data *)sffi;
+
+ return __mptcp_put_subflow_data(sfd, sizeof(*sffi), optval, copied, optlen);
+}
+
+static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval,
+ int __user *optlen)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ unsigned int sfcount = 0, copied = 0;
+ struct mptcp_subflow_full_info sffi;
+ void __user *tcpinfoptr, *sfinfoptr;
+ int len;
+
+ len = mptcp_get_full_subflow_info(&sffi, optval, optlen);
+ if (len < 0)
+ return len;
+
+ /* don't bother filling the mptcp info if there is not enough
+ * user-space-provided storage
+ */
+ if (len > 0) {
+ struct mptcp_info mptcp_info;
+ char __user *infoptr;
+ int mptcp_info_len;
+
+ infoptr = optval + sffi.size_subflow_full_info;
+ memset(&mptcp_info, 0, sizeof(mptcp_info));
+ mptcp_info_len = min_t(unsigned int, len, sizeof(struct mptcp_info));
+
+ mptcp_diag_fill_info(msk, &mptcp_info);
+
+ if (copy_to_user(infoptr, &mptcp_info, mptcp_info_len))
+ return -EFAULT;
+
+ copied += mptcp_info_len;
+ }
+
+ sffi.size_tcpinfo_kernel = sizeof(struct tcp_info);
+ sffi.size_tcpinfo_user = min_t(unsigned int, sffi.size_tcpinfo_user,
+ sizeof(struct tcp_info));
+ sfinfoptr = (void __force __user *)sffi.subflow_info_addr;
+ sffi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info);
+ sffi.size_sfinfo_user = min_t(unsigned int, sffi.size_sfinfo_user,
+ sizeof(struct mptcp_subflow_info));
+ tcpinfoptr = (void __force __user *)sffi.tcp_info_addr;
+
+ lock_sock(sk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (sfcount++ < sffi.num_subflows_user) {
+ struct mptcp_subflow_info sfinfo;
+ struct tcp_info tcp_info;
+
+ memset(&sfinfo, 0, sizeof(sfinfo));
+ sfinfo.id = subflow->subflow_id;
+ mptcp_get_sub_addrs(ssk, &sfinfo.addrs);
+ if (copy_to_user(sfinfoptr, &sfinfo, sffi.size_sfinfo_user))
+ goto fail_release;
+
+ tcp_get_info(ssk, &tcp_info);
+ if (copy_to_user(tcpinfoptr, &tcp_info, sffi.size_tcpinfo_user))
+ goto fail_release;
+
+ tcpinfoptr += sffi.size_tcpinfo_user;
+ sfinfoptr += sffi.size_sfinfo_user;
+ }
+ }
+ release_sock(sk);
+
+ sffi.num_subflows_kern = sfcount;
+ if (mptcp_put_subflow_full_info(&sffi, optval, copied, optlen))
+ return -EFAULT;
+
+ return 0;
+
+fail_release:
+ release_sock(sk);
+ return -EFAULT;
+}
+
static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
int __user *optlen, int val)
{
@@ -1219,6 +1350,8 @@ static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname,
switch (optname) {
case MPTCP_INFO:
return mptcp_getsockopt_info(msk, optval, optlen);
+ case MPTCP_FULL_INFO:
+ return mptcp_getsockopt_full_info(msk, optval, optlen);
case MPTCP_TCPINFO:
return mptcp_getsockopt_tcpinfo(msk, optval, optlen);
case MPTCP_SUBFLOW_ADDRS:
--
2.40.1
Hi Paolo,
On 24/05/2023 15:50, Paolo Abeni wrote:
> Some user-space applications want to monitor the subflows utilization.
>
> Dumping the per subflow tcp_info is not enough, as the PM could close
> and re-create the subflows under-the-hood, fooling the accounting.
> Even checking the src/dst addresses used by each subflow could not
> be enough, because new subflows could re-use the same address/port of
> the just closed one.
>
> This patch introduces a new socket option, allow dumping all the relevant
> information all-at-once (everything, everywhere...), in a consistent manner.
>
> To reuse the existing helper to manipulate the new struct, keep the binary
> layout of the initial few fields the same as mptcp_subflow_data.
(...)
> + lock_sock(sk);
> + mptcp_for_each_subflow(msk, subflow) {
> + struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> +
> + if (sfcount++ < sffi.num_subflows_user) {
> + struct mptcp_subflow_info sfinfo;
> + struct tcp_info tcp_info;
> +
> + memset(&sfinfo, 0, sizeof(sfinfo));
> + sfinfo.id = subflow->subflow_id;
> + mptcp_get_sub_addrs(ssk, &sfinfo.addrs);
Would it be worth it to first check size_sfinfo_user > sizeof(sfinfo.id)
before retreiving the addresses?
> + if (copy_to_user(sfinfoptr, &sfinfo, sffi.size_sfinfo_user))
> + goto fail_release;> +
> + tcp_get_info(ssk, &tcp_info);
Same here: just checking if sffi.size_tcpinfo_user > 0, just in case the
userspace doesn't one and/or the other.
Cheers,
Matt
--
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net
Hi Paolo,
On 24/05/2023 15:50, Paolo Abeni wrote:
> Some user-space applications want to monitor the subflows utilization.
>
> Dumping the per subflow tcp_info is not enough, as the PM could close
> and re-create the subflows under-the-hood, fooling the accounting.
> Even checking the src/dst addresses used by each subflow could not
> be enough, because new subflows could re-use the same address/port of
> the just closed one.
>
> This patch introduces a new socket option, allow dumping all the relevant
> information all-at-once (everything, everywhere...), in a consistent manner.
>
> To reuse the existing helper to manipulate the new struct, keep the binary
> layout of the initial few fields the same as mptcp_subflow_data.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> v3 -> v4:
> - full_info struct re-design (Florian)
>
> v2 -> v3:
> - added missing changelog (oops)
> ---
> include/uapi/linux/mptcp.h | 25 ++++++
> net/mptcp/sockopt.c | 165 +++++++++++++++++++++++++++++++++----
> 2 files changed, 174 insertions(+), 16 deletions(-)
>
> diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
> index 32af2d278cb4..37c46cf05795 100644
> --- a/include/uapi/linux/mptcp.h
> +++ b/include/uapi/linux/mptcp.h
> @@ -12,6 +12,7 @@
> #include <linux/in.h> /* for sockaddr_in */
> #include <linux/in6.h> /* for sockaddr_in6 */
> #include <linux/socket.h> /* for sockaddr_storage and sa_family */
> +#include <linux/tcp.h> /* for tcp_info */
(we don't directly use tcp_info here below, no?)
>
> #define MPTCP_SUBFLOW_FLAG_MCAP_REM _BITUL(0)
> #define MPTCP_SUBFLOW_FLAG_MCAP_LOC _BITUL(1)
> @@ -244,9 +245,33 @@ struct mptcp_subflow_addrs {
> };
> };
>
> +struct mptcp_subflow_info {
> + __u32 id;
> + struct mptcp_subflow_addrs addrs;
> +};
> +
> +struct mptcp_subflow_full_info {
> + __u32 size_subflow_full_info; /* size of this structure in userspace */
> + __u32 num_subflows_kern; /* must be 0, set by kernel (real subflow count) */
(maybe better to call it num_subflows if it is the "real" number of
subflows?)
> + __u32 size_tcpinfo_kernel; /* must be 0, set by kernel */
> + __u32 size_tcpinfo_user;
> + __u32 size_sfinfo_kernel; /* must be 0, set by kernel */
> + __u32 size_sfinfo_user;
> + __u32 num_subflows_user; /* max subflows that userspace is interested in;
> + * the buffers at subflow_info_addr/tcp_info_addr
> + * are respectively at least:
> + * num_subflows_user * size_sfinfo_user
> + * num_subflows_user * size_tcpinfo_user
> + * bytes wide
The "longish" comment seems to suggest the name is not appropriated,
maybe: size_arrays(_user) or similar?
> + */
> + __aligned_u64 subflow_info_addr;
> + __aligned_u64 tcp_info_addr;
(the "_addr" suffix is confusing, I thought it was linked to subflow
addresses :) why not "_ptr" instead? or put "addr_" as a prefix?)
> +} __attribute__((aligned(8)));
Out of curiosity, why not including "struct mptcp_info" in "struct
mptcp_subflow_full_info" (and strip "_subflow": "mptcp_full_info")?
It looks like it is a bit hidden that this getsockopt() also writes
mptcp_info in the buffer if there is space and mptcp_info size could be
explicitly given making clean it can be set to 0 if we don't want it
(instead of not knowing we can give a bigger size to have it included).
Plus the userspace has to create a custom structure to get it:
struct my_mptcp_full_info {
struct mptcp_subflow_full_info i;
struct mptcp_info mi;
};
(but maybe that is common and fine?)
And while at it, why not doing like subflow_info_addr and tcp_info_addr
and use a pointer to a structure? I don't know if it would be better, it
is just to know why, just because it is not an array I suppose :)
Cheers,
Matt
--
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net
© 2016 - 2026 Red Hat, Inc.