From: Geliang Tang <tanggeliang@kylinos.cn>
It's necessary to traverse all subflows on the conn_list of an MPTCP
socket and then call kfunc to modify the fields of each subflow. In
kernel space, mptcp_for_each_subflow() helper is used for this:
mptcp_for_each_subflow(msk, subflow)
kfunc(subflow);
But in the MPTCP BPF program, this has not yet been implemented. As
Martin suggested recently, this conn_list walking + modify-by-kfunc
usage fits the bpf_iter use case. So this patch adds a new bpf_iter
type named "mptcp_subflow" to do this and implements its helpers
bpf_iter_mptcp_subflow_new()/_next()/_destroy().
Then bpf_for_each() for mptcp_subflow can be used in BPF program like
this:
bpf_rcu_read_lock();
bpf_for_each(mptcp_subflow, subflow, msk)
kfunc(subflow);
bpf_rcu_read_unlock();
Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
---
net/mptcp/bpf.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 43 insertions(+), 4 deletions(-)
diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
index 6414824402e6..350672e24a3d 100644
--- a/net/mptcp/bpf.c
+++ b/net/mptcp/bpf.c
@@ -201,9 +201,48 @@ static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = {
.set = &bpf_mptcp_fmodret_ids,
};
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
- "kfuncs which will be used in BPF programs");
+struct bpf_iter_mptcp_subflow {
+ __u64 __opaque[2];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_mptcp_subflow_kern {
+ struct mptcp_sock *msk;
+ struct list_head *pos;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct bpf_iter_mptcp_subflow *it,
+ struct mptcp_sock *msk)
+{
+ struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
+
+ if (!msk)
+ return -EINVAL;
+
+ kit->msk = msk;
+ kit->pos = &msk->conn_list;
+ return 0;
+}
+
+__bpf_kfunc struct mptcp_subflow_context *
+bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it)
+{
+ struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = kit->msk;
+
+ subflow = list_entry((kit->pos)->next, struct mptcp_subflow_context, node);
+ if (!msk || list_entry_is_head(subflow, &msk->conn_list, node))
+ return NULL;
+
+ kit->pos = &subflow->node;
+ return subflow;
+}
+
+__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct bpf_iter_mptcp_subflow *it)
+{
+}
__bpf_kfunc struct mptcp_subflow_context *
bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data, unsigned int pos)
@@ -218,7 +257,7 @@ __bpf_kfunc bool bpf_mptcp_subflow_queues_empty(struct sock *sk)
return tcp_rtx_queue_empty(sk);
}
-__diag_pop();
+__bpf_kfunc_end_defs();
BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids)
BTF_ID_FLAGS(func, mptcp_subflow_set_scheduled)
--
2.43.0
On Mon, Sep 9, 2024 at 10:37 PM Geliang Tang <geliang@kernel.org> wrote:
>
> From: Geliang Tang <tanggeliang@kylinos.cn>
>
> It's necessary to traverse all subflows on the conn_list of an MPTCP
> socket and then call kfunc to modify the fields of each subflow. In
> kernel space, mptcp_for_each_subflow() helper is used for this:
>
> mptcp_for_each_subflow(msk, subflow)
> kfunc(subflow);
>
> But in the MPTCP BPF program, this has not yet been implemented. As
> Martin suggested recently, this conn_list walking + modify-by-kfunc
> usage fits the bpf_iter use case. So this patch adds a new bpf_iter
> type named "mptcp_subflow" to do this and implements its helpers
> bpf_iter_mptcp_subflow_new()/_next()/_destroy().
>
> Then bpf_for_each() for mptcp_subflow can be used in BPF program like
> this:
>
> bpf_rcu_read_lock();
> bpf_for_each(mptcp_subflow, subflow, msk)
> kfunc(subflow);
> bpf_rcu_read_unlock();
>
> Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
> Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> ---
> net/mptcp/bpf.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 43 insertions(+), 4 deletions(-)
>
Not sure why, but only this patch made it to the BPF mailing list? Did
you cc bpf@vger on all patches?
> diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
> index 6414824402e6..350672e24a3d 100644
> --- a/net/mptcp/bpf.c
> +++ b/net/mptcp/bpf.c
> @@ -201,9 +201,48 @@ static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = {
> .set = &bpf_mptcp_fmodret_ids,
> };
>
> -__diag_push();
> -__diag_ignore_all("-Wmissing-prototypes",
> - "kfuncs which will be used in BPF programs");
> +struct bpf_iter_mptcp_subflow {
> + __u64 __opaque[2];
> +} __attribute__((aligned(8)));
> +
> +struct bpf_iter_mptcp_subflow_kern {
> + struct mptcp_sock *msk;
> + struct list_head *pos;
> +} __attribute__((aligned(8)));
> +
> +__bpf_kfunc_start_defs();
> +
> +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct bpf_iter_mptcp_subflow *it,
> + struct mptcp_sock *msk)
> +{
> + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> +
> + if (!msk)
> + return -EINVAL;
you still need to initialize at least kit->msk to NULL to prevent next
implementation below doing the wrong thing
keep in mind, iterator constructor returning error doesn't prevent BPF
program from still calling next() and destroy(), so implementation has
to set iterator state such that next can return NULL if iterator was
never successfully initialized
pw-bot: cr
> +
> + kit->msk = msk;
> + kit->pos = &msk->conn_list;
> + return 0;
> +}
> +
> +__bpf_kfunc struct mptcp_subflow_context *
> +bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it)
> +{
> + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> + struct mptcp_subflow_context *subflow;
> + struct mptcp_sock *msk = kit->msk;
> +
you should check if (!msk) early here, before accessing kit->pos->next below
> + subflow = list_entry((kit->pos)->next, struct mptcp_subflow_context, node);
nit: why () around kit->pos?
> + if (!msk || list_entry_is_head(subflow, &msk->conn_list, node))
as I mentioned, !msk check seems too late. Maybe list_entry_is_head()
is a bit too late as well?
> + return NULL;
> +
> + kit->pos = &subflow->node;
> + return subflow;
> +}
> +
> +__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct bpf_iter_mptcp_subflow *it)
> +{
> +}
>
> __bpf_kfunc struct mptcp_subflow_context *
> bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data, unsigned int pos)
> @@ -218,7 +257,7 @@ __bpf_kfunc bool bpf_mptcp_subflow_queues_empty(struct sock *sk)
> return tcp_rtx_queue_empty(sk);
> }
>
> -__diag_pop();
> +__bpf_kfunc_end_defs();
>
> BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids)
> BTF_ID_FLAGS(func, mptcp_subflow_set_scheduled)
> --
> 2.43.0
>
>
Hi Andrii,
On Wed, 2024-09-11 at 14:00 -0700, Andrii Nakryiko wrote:
> On Mon, Sep 9, 2024 at 10:37 PM Geliang Tang <geliang@kernel.org>
> wrote:
> >
> > From: Geliang Tang <tanggeliang@kylinos.cn>
> >
> > It's necessary to traverse all subflows on the conn_list of an
> > MPTCP
> > socket and then call kfunc to modify the fields of each subflow. In
> > kernel space, mptcp_for_each_subflow() helper is used for this:
> >
> > mptcp_for_each_subflow(msk, subflow)
> > kfunc(subflow);
> >
> > But in the MPTCP BPF program, this has not yet been implemented. As
> > Martin suggested recently, this conn_list walking + modify-by-kfunc
> > usage fits the bpf_iter use case. So this patch adds a new bpf_iter
> > type named "mptcp_subflow" to do this and implements its helpers
> > bpf_iter_mptcp_subflow_new()/_next()/_destroy().
> >
> > Then bpf_for_each() for mptcp_subflow can be used in BPF program
> > like
> > this:
> >
> > bpf_rcu_read_lock();
> > bpf_for_each(mptcp_subflow, subflow, msk)
> > kfunc(subflow);
> > bpf_rcu_read_unlock();
> >
> > Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
> > Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
> > ---
> > net/mptcp/bpf.c | 47 +++++++++++++++++++++++++++++++++++++++++++--
> > --
> > 1 file changed, 43 insertions(+), 4 deletions(-)
> >
>
> Not sure why, but only this patch made it to the BPF mailing list?
> Did
> you cc bpf@vger on all patches?
This patch is for "mptcp-next" [1], it depends on the "new MPTCP
subflow subtest" which is under review on the bpf list. We will send it
to the bpf list very soon.
[1]
https://patchwork.kernel.org/project/mptcp/cover/cover.1726023577.git.tanggeliang@kylinos.cn/
>
> > diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
> > index 6414824402e6..350672e24a3d 100644
> > --- a/net/mptcp/bpf.c
> > +++ b/net/mptcp/bpf.c
> > @@ -201,9 +201,48 @@ static const struct btf_kfunc_id_set
> > bpf_mptcp_fmodret_set = {
> > .set = &bpf_mptcp_fmodret_ids,
> > };
> >
> > -__diag_push();
> > -__diag_ignore_all("-Wmissing-prototypes",
> > - "kfuncs which will be used in BPF programs");
> > +struct bpf_iter_mptcp_subflow {
> > + __u64 __opaque[2];
> > +} __attribute__((aligned(8)));
> > +
> > +struct bpf_iter_mptcp_subflow_kern {
> > + struct mptcp_sock *msk;
> > + struct list_head *pos;
> > +} __attribute__((aligned(8)));
> > +
> > +__bpf_kfunc_start_defs();
> > +
> > +__bpf_kfunc int bpf_iter_mptcp_subflow_new(struct
> > bpf_iter_mptcp_subflow *it,
> > + struct mptcp_sock *msk)
> > +{
> > + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> > +
> > + if (!msk)
> > + return -EINVAL;
>
> you still need to initialize at least kit->msk to NULL to prevent
> next
> implementation below doing the wrong thing
>
> keep in mind, iterator constructor returning error doesn't prevent
> BPF
> program from still calling next() and destroy(), so implementation
> has
> to set iterator state such that next can return NULL if iterator was
> never successfully initialized
>
I'll move "kit->msk = msk;" earlier like this:
{
struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
kit->msk = msk;
if (!msk)
return -EINVAL;
kit->pos = &msk->conn_list;
return 0;
}
> pw-bot: cr
>
> > +
> > + kit->msk = msk;
> > + kit->pos = &msk->conn_list;
> > + return 0;
> > +}
> > +
> > +__bpf_kfunc struct mptcp_subflow_context *
> > +bpf_iter_mptcp_subflow_next(struct bpf_iter_mptcp_subflow *it)
> > +{
> > + struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
> > + struct mptcp_subflow_context *subflow;
> > + struct mptcp_sock *msk = kit->msk;
> > +
>
> you should check if (!msk) early here, before accessing kit->pos-
> >next below
>
> > + subflow = list_entry((kit->pos)->next, struct
> > mptcp_subflow_context, node);
>
> nit: why () around kit->pos?
>
> > + if (!msk || list_entry_is_head(subflow, &msk->conn_list,
> > node))
>
> as I mentioned, !msk check seems too late. Maybe list_entry_is_head()
> is a bit too late as well?
We can use list_is_last() to check kit->pos earlier. But here we use
list_entry_is_head(), it should be after list_entry().
I'll move "if (!msk)" check earlier like this:
{
struct bpf_iter_mptcp_subflow_kern *kit = (void *)it;
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = kit->msk;
if (!msk)
return NULL;
subflow = list_entry(kit->pos->next, struct
mptcp_subflow_context, node);
if (!subflow || list_entry_is_head(subflow, &msk->conn_list,
node))
return NULL;
kit->pos = &subflow->node;
return subflow;
}
Thanks,
-Geliang
>
> > + return NULL;
> > +
> > + kit->pos = &subflow->node;
> > + return subflow;
> > +}
> > +
> > +__bpf_kfunc void bpf_iter_mptcp_subflow_destroy(struct
> > bpf_iter_mptcp_subflow *it)
> > +{
> > +}
> >
> > __bpf_kfunc struct mptcp_subflow_context *
> > bpf_mptcp_subflow_ctx_by_pos(const struct mptcp_sched_data *data,
> > unsigned int pos)
> > @@ -218,7 +257,7 @@ __bpf_kfunc bool
> > bpf_mptcp_subflow_queues_empty(struct sock *sk)
> > return tcp_rtx_queue_empty(sk);
> > }
> >
> > -__diag_pop();
> > +__bpf_kfunc_end_defs();
> >
> > BTF_KFUNCS_START(bpf_mptcp_sched_kfunc_ids)
> > BTF_ID_FLAGS(func, mptcp_subflow_set_scheduled)
> > --
> > 2.43.0
> >
> >
© 2016 - 2026 Red Hat, Inc.