From: Geliang Tang <tanggeliang@kylinos.cn>
This patch adds a "cgroup/getsockopt" way to inspect the subflows of a
mptcp socket.
mptcp_for_each_stubflow() and other helpers related to list_dentry are
added into progs/mptcp_bpf.h.
Add an extra "cgroup/getsockopt" prog to walk the msk->conn_list and use
bpf_core_cast to cast a pointer to tcp_sock for readonly. It will allow
to inspect all the fields in a tcp_sock.
Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
tools/testing/selftests/bpf/progs/mptcp_bpf.h | 35 +++++++++++
.../selftests/bpf/progs/mptcp_subflow.c | 62 +++++++++++++++++++
2 files changed, 97 insertions(+)
diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf.h b/tools/testing/selftests/bpf/progs/mptcp_bpf.h
index 782f36ed027e..d60e3c8f85a1 100644
--- a/tools/testing/selftests/bpf/progs/mptcp_bpf.h
+++ b/tools/testing/selftests/bpf/progs/mptcp_bpf.h
@@ -4,9 +4,44 @@
#include <vmlinux.h>
#include <bpf/bpf_core_read.h>
+#include "bpf_experimental.h"
#define MPTCP_SUBFLOWS_MAX 8
+static inline int list_is_head(const struct list_head *list,
+ const struct list_head *head)
+{
+ return list == head;
+}
+
+#define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+#define list_first_entry(ptr, type, member) \
+ list_entry((ptr)->next, type, member)
+
+#define list_next_entry(pos, member) \
+ list_entry((pos)->member.next, typeof(*(pos)), member)
+
+#define list_entry_is_head(pos, head, member) \
+ list_is_head(&pos->member, (head))
+
+#define list_for_each_entry(pos, head, member) \
+ for (pos = list_first_entry(head, typeof(*pos), member); \
+ cond_break, !list_entry_is_head(pos, head, member); \
+ pos = list_next_entry(pos, member))
+
+#define list_for_each_entry_safe(pos, n, head, member) \
+ for (pos = list_first_entry(head, typeof(*pos), member), \
+ n = list_next_entry(pos, member); \
+ cond_break, !list_entry_is_head(pos, head, member); \
+ pos = n, n = list_next_entry(n, member))
+
+#define mptcp_for_each_subflow(__msk, __subflow) \
+ list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+#define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \
+ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node)
+
extern void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
bool scheduled) __ksym;
diff --git a/tools/testing/selftests/bpf/progs/mptcp_subflow.c b/tools/testing/selftests/bpf/progs/mptcp_subflow.c
index bc572e1d6df8..6a7cf71c2795 100644
--- a/tools/testing/selftests/bpf/progs/mptcp_subflow.c
+++ b/tools/testing/selftests/bpf/progs/mptcp_subflow.c
@@ -4,6 +4,7 @@
/* vmlinux.h, bpf_helpers.h and other 'define' */
#include "bpf_tracing_net.h"
+#include "mptcp_bpf.h"
char _license[] SEC("license") = "GPL";
@@ -57,3 +58,64 @@ int mptcp_subflow(struct bpf_sock_ops *skops)
return 1;
}
+
+static int _check_getsockopt_subflow_mark(struct mptcp_sock *msk, struct bpf_sockopt *ctx)
+{
+ struct mptcp_subflow_context *subflow;
+ int i = 0;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow,
+ struct mptcp_subflow_context));
+
+ if (ssk->sk_mark != ++i)
+ ctx->retval = -1;
+ }
+
+ return 1;
+}
+
+static int _check_getsockopt_subflow_cc(struct mptcp_sock *msk, struct bpf_sockopt *ctx)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
+ struct inet_connection_sock *icsk;
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow,
+ struct mptcp_subflow_context));
+ icsk = bpf_core_cast(ssk, struct inet_connection_sock);
+
+ if ((ssk->sk_mark == 1 &&
+ __builtin_memcmp(icsk->icsk_ca_ops->name, cc, TCP_CA_NAME_MAX)) ||
+ (ssk->sk_mark == 2 &&
+ __builtin_memcmp(icsk->icsk_ca_ops->name, "cubic", TCP_CA_NAME_MAX)))
+ ctx->retval = -1;
+ }
+
+ return 1;
+}
+
+SEC("cgroup/getsockopt")
+int _getsockopt_subflow(struct bpf_sockopt *ctx)
+{
+ struct bpf_sock *sk = ctx->sk;
+ struct mptcp_sock *msk;
+
+ if (!sk || sk->protocol != IPPROTO_MPTCP)
+ return 1;
+
+ msk = bpf_core_cast(sk, struct mptcp_sock);
+ if (msk->pm.subflows != 1)
+ return 1;
+
+ if (ctx->level == SOL_SOCKET && ctx->optname == SO_MARK)
+ return _check_getsockopt_subflow_mark(msk, ctx);
+ if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION)
+ return _check_getsockopt_subflow_cc(msk, ctx);
+
+ return 1;
+}
--
2.43.0
Hi Geliang,
cc: -Martin (not to annoy him with MPTCP specific stuff)
On 29/08/2024 04:53, Geliang Tang wrote:
> From: Geliang Tang <tanggeliang@kylinos.cn>
>
> This patch adds a "cgroup/getsockopt" way to inspect the subflows of a
> mptcp socket.
>
> mptcp_for_each_stubflow() and other helpers related to list_dentry are
> added into progs/mptcp_bpf.h.
>
> Add an extra "cgroup/getsockopt" prog to walk the msk->conn_list and use
> bpf_core_cast to cast a pointer to tcp_sock for readonly. It will allow
> to inspect all the fields in a tcp_sock.
(...)
> diff --git a/tools/testing/selftests/bpf/progs/mptcp_subflow.c b/tools/testing/selftests/bpf/progs/mptcp_subflow.c
> index bc572e1d6df8..6a7cf71c2795 100644
> --- a/tools/testing/selftests/bpf/progs/mptcp_subflow.c
> +++ b/tools/testing/selftests/bpf/progs/mptcp_subflow.c
> @@ -4,6 +4,7 @@
>
> /* vmlinux.h, bpf_helpers.h and other 'define' */
> #include "bpf_tracing_net.h"
> +#include "mptcp_bpf.h"
>
> char _license[] SEC("license") = "GPL";
>
> @@ -57,3 +58,64 @@ int mptcp_subflow(struct bpf_sock_ops *skops)
>
> return 1;
> }
> +
> +static int _check_getsockopt_subflow_mark(struct mptcp_sock *msk, struct bpf_sockopt *ctx)
> +{
> + struct mptcp_subflow_context *subflow;
> + int i = 0;
> +
> + mptcp_for_each_subflow(msk, subflow) {
> + struct sock *ssk;
> +
> + ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow,
> + struct mptcp_subflow_context));
> +
> + if (ssk->sk_mark != ++i)
> + ctx->retval = -1;
> + }
> +
> + return 1;
> +}
> +
> +static int _check_getsockopt_subflow_cc(struct mptcp_sock *msk, struct bpf_sockopt *ctx)
> +{
> + struct mptcp_subflow_context *subflow, *tmp;
> +
> + mptcp_for_each_subflow_safe(msk, subflow, tmp) {
Out of curiosity, why do you need the '_safe()' helper here? Just to
check it works?
> + struct inet_connection_sock *icsk;
> + struct sock *ssk;
> +
> + ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow,
> + struct mptcp_subflow_context));
> + icsk = bpf_core_cast(ssk, struct inet_connection_sock);
> +
> + if ((ssk->sk_mark == 1 &&
> + __builtin_memcmp(icsk->icsk_ca_ops->name, cc, TCP_CA_NAME_MAX)) ||
In your previous version, you had a patch to set the cc on the other
subflow, not to change the value of 'getsockopt(TCP_CONGESTION)' seen by
the userspace. Why did you drop it?
> + (ssk->sk_mark == 2 &&
> + __builtin_memcmp(icsk->icsk_ca_ops->name, "cubic", TCP_CA_NAME_MAX)))
You cannot check this one (or at least not like that): the default TCP
CC can be changed with a kernel config, it is not necessarily cubic.
I think it is fine to only check the other subflow.
> + ctx->retval = -1;
Should you not mark it as an error if the mark is different from 1 or 2?
(Or maybe not, because that's covered by _check_getsockopt_subflow_mark)
> + }
> +
> + return 1;
> +}
> +
> +SEC("cgroup/getsockopt")
> +int _getsockopt_subflow(struct bpf_sockopt *ctx)
> +{
> + struct bpf_sock *sk = ctx->sk;
> + struct mptcp_sock *msk;
> +
> + if (!sk || sk->protocol != IPPROTO_MPTCP)
> + return 1;
If this test fails, the check will not be done.
> +
> + msk = bpf_core_cast(sk, struct mptcp_sock);
> + if (msk->pm.subflows != 1)
> + return 1;
Same here: if there is no extra subflow, the test is silently not done.
You change 'ctx->retval'.
Also, it might be good to print something somewhere everywhere you set
this 'ctx->retval' to -1, to be able to easily understand what went
wrong, no? Can we use bpf_printk() or something similar?
> +
> + if (ctx->level == SOL_SOCKET && ctx->optname == SO_MARK)
> + return _check_getsockopt_subflow_mark(msk, ctx);
> + if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION)
> + return _check_getsockopt_subflow_cc(msk, ctx);
> +
> + return 1;
> +}
Cheers,
Matt
--
Sponsored by the NGI0 Core fund.
Hi Matt,
On Thu, 2024-08-29 at 16:25 +0200, Matthieu Baerts wrote:
> Hi Geliang,
>
> cc: -Martin (not to annoy him with MPTCP specific stuff)
>
> On 29/08/2024 04:53, Geliang Tang wrote:
> > From: Geliang Tang <tanggeliang@kylinos.cn>
> >
> > This patch adds a "cgroup/getsockopt" way to inspect the subflows
> > of a
> > mptcp socket.
> >
> > mptcp_for_each_stubflow() and other helpers related to list_dentry
> > are
> > added into progs/mptcp_bpf.h.
> >
> > Add an extra "cgroup/getsockopt" prog to walk the msk->conn_list
> > and use
> > bpf_core_cast to cast a pointer to tcp_sock for readonly. It will
> > allow
> > to inspect all the fields in a tcp_sock.
>
> (...)
>
> > diff --git a/tools/testing/selftests/bpf/progs/mptcp_subflow.c
> > b/tools/testing/selftests/bpf/progs/mptcp_subflow.c
> > index bc572e1d6df8..6a7cf71c2795 100644
> > --- a/tools/testing/selftests/bpf/progs/mptcp_subflow.c
> > +++ b/tools/testing/selftests/bpf/progs/mptcp_subflow.c
> > @@ -4,6 +4,7 @@
> >
> > /* vmlinux.h, bpf_helpers.h and other 'define' */
> > #include "bpf_tracing_net.h"
> > +#include "mptcp_bpf.h"
> >
> > char _license[] SEC("license") = "GPL";
> >
> > @@ -57,3 +58,64 @@ int mptcp_subflow(struct bpf_sock_ops *skops)
> >
> > return 1;
> > }
> > +
> > +static int _check_getsockopt_subflow_mark(struct mptcp_sock *msk,
> > struct bpf_sockopt *ctx)
> > +{
> > + struct mptcp_subflow_context *subflow;
> > + int i = 0;
> > +
> > + mptcp_for_each_subflow(msk, subflow) {
> > + struct sock *ssk;
> > +
> > + ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow,
> > + struct mptcp_subflow_context));
> > +
> > + if (ssk->sk_mark != ++i)
> > + ctx->retval = -1;
> > + }
> > +
> > + return 1;
> > +}
> > +
> > +static int _check_getsockopt_subflow_cc(struct mptcp_sock *msk,
> > struct bpf_sockopt *ctx)
> > +{
> > + struct mptcp_subflow_context *subflow, *tmp;
> > +
> > + mptcp_for_each_subflow_safe(msk, subflow, tmp) {
>
> Out of curiosity, why do you need the '_safe()' helper here? Just to
> check it works?
>
> > + struct inet_connection_sock *icsk;
> > + struct sock *ssk;
> > +
> > + ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow,
> > + struct mptcp_subflow_context));
> > + icsk = bpf_core_cast(ssk, struct inet_connection_sock);
> > +
> > + if ((ssk->sk_mark == 1 &&
> > + __builtin_memcmp(icsk->icsk_ca_ops->name, cc,
> > TCP_CA_NAME_MAX)) ||
>
> In your previous version, you had a patch to set the cc on the other
> subflow, not to change the value of 'getsockopt(TCP_CONGESTION)' seen
> by
> the userspace. Why did you drop it?
>
> > + (ssk->sk_mark == 2 &&
> > + __builtin_memcmp(icsk->icsk_ca_ops->name, "cubic",
> > TCP_CA_NAME_MAX)))
>
> You cannot check this one (or at least not like that): the default
> TCP
> CC can be changed with a kernel config, it is not necessarily cubic.
>
> I think it is fine to only check the other subflow.
>
> > + ctx->retval = -1;
>
> Should you not mark it as an error if the mark is different from 1 or
> 2?
> (Or maybe not, because that's covered by
> _check_getsockopt_subflow_mark)
>
> > + }
> > +
> > + return 1;
> > +}
> > +
> > +SEC("cgroup/getsockopt")
> > +int _getsockopt_subflow(struct bpf_sockopt *ctx)
> > +{
> > + struct bpf_sock *sk = ctx->sk;
> > + struct mptcp_sock *msk;
> > +
> > + if (!sk || sk->protocol != IPPROTO_MPTCP)
> > + return 1;
>
> If this test fails, the check will not be done.
>
> > +
> > + msk = bpf_core_cast(sk, struct mptcp_sock);
> > + if (msk->pm.subflows != 1)
> > + return 1;
>
> Same here: if there is no extra subflow, the test is silently not
> done.
> You change 'ctx->retval'.
>
> Also, it might be good to print something somewhere everywhere you
> set
> this 'ctx->retval' to -1, to be able to easily understand what went
> wrong, no? Can we use bpf_printk() or something similar?
getsockopt() is invoked by connect_to_fd() too, and msk->pm.subflows is
0 in this case. So in v6, I moved this "msk->pm.subflows" check into
_check_getsockopt_subflow_mark/cc.
Thanks,
-Geliang
>
> > +
> > + if (ctx->level == SOL_SOCKET && ctx->optname == SO_MARK)
> > + return _check_getsockopt_subflow_mark(msk, ctx);
> > + if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION)
> > + return _check_getsockopt_subflow_cc(msk, ctx);
> > +
> > + return 1;
> > +}
>
> Cheers,
> Matt
© 2016 - 2026 Red Hat, Inc.