[PATCH mptcp-next v7 1/8] mptcp: add struct mptcp_sched_ops

Geliang Tang posted 8 patches 3 years, 10 months ago
Maintainers: Mat Martineau <mathew.j.martineau@linux.intel.com>, Yonghong Song <yhs@fb.com>, Paolo Abeni <pabeni@redhat.com>, John Fastabend <john.fastabend@gmail.com>, Andrii Nakryiko <andrii@kernel.org>, Jakub Kicinski <kuba@kernel.org>, Alexei Starovoitov <ast@kernel.org>, "David S. Miller" <davem@davemloft.net>, KP Singh <kpsingh@kernel.org>, Matthieu Baerts <matthieu.baerts@tessares.net>, Jonathan Corbet <corbet@lwn.net>, Shuah Khan <shuah@kernel.org>, Martin KaFai Lau <kafai@fb.com>, Song Liu <songliubraving@fb.com>, Daniel Borkmann <daniel@iogearbox.net>
There is a newer version of this series
[PATCH mptcp-next v7 1/8] mptcp: add struct mptcp_sched_ops
Posted by Geliang Tang 3 years, 10 months ago
This patch defines struct mptcp_sched_ops, which has three struct members,
name, owner and list, and three function pointers, init, release and
get_subflow.

Add the scheduler registering, unregistering and finding functions to add
or delete or find a packet scheduler on the pernet sched_list.

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 include/net/mptcp.h  |  13 +++++
 net/mptcp/Makefile   |   2 +-
 net/mptcp/protocol.c |   1 +
 net/mptcp/protocol.h |   7 +++
 net/mptcp/sched.c    | 113 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 net/mptcp/sched.c

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 8b1afd6f5cc4..e3a0baa8dbd7 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -95,6 +95,19 @@ struct mptcp_out_options {
 #endif
 };
 
+#define MPTCP_SCHED_NAME_MAX 16
+
+struct mptcp_sched_ops {
+	struct sock *	(*get_subflow)(struct mptcp_sock *msk);
+
+	char			name[MPTCP_SCHED_NAME_MAX];
+	struct module		*owner;
+	struct list_head	list;
+
+	void (*init)(struct mptcp_sock *msk);
+	void (*release)(struct mptcp_sock *msk);
+} ____cacheline_aligned_in_smp;
+
 #ifdef CONFIG_MPTCP
 extern struct request_sock_ops mptcp_subflow_request_sock_ops;
 
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index 168c55d1c917..a37330760b0c 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -2,7 +2,7 @@
 obj-$(CONFIG_MPTCP) += mptcp.o
 
 mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
-	   mib.o pm_netlink.o sockopt.o
+	   mib.o pm_netlink.o sockopt.o sched.o
 
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d3887f628b54..b1d7c8b0c112 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3807,6 +3807,7 @@ void __init mptcp_proto_init(void)
 
 	mptcp_subflow_init();
 	mptcp_pm_init();
+	mptcp_sched_init();
 	mptcp_token_init();
 
 	if (proto_register(&mptcp_prot, MPTCP_USE_SLAB) != 0)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index fd82fd113113..3258b740c8ee 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -608,6 +608,13 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
 void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
 			 struct sockaddr_storage *addr,
 			 unsigned short family);
+struct mptcp_sched_ops *mptcp_sched_find(const struct net *net,
+					 const char *name);
+int mptcp_register_scheduler(const struct net *net,
+			     struct mptcp_sched_ops *sched);
+void mptcp_unregister_scheduler(const struct net *net,
+				struct mptcp_sched_ops *sched);
+void mptcp_sched_init(void);
 
 static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
 {
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
new file mode 100644
index 000000000000..ae1956b6de92
--- /dev/null
+++ b/net/mptcp/sched.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2022, SUSE.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <net/tcp.h>
+#include <net/netns/generic.h>
+#include "protocol.h"
+
+static int sched_pernet_id;
+
+struct sched_pernet {
+	/* protects pernet updates */
+	spinlock_t		lock;
+	struct list_head	sched_list;
+};
+
+static struct sched_pernet *sched_get_pernet(const struct net *net)
+{
+	return net_generic(net, sched_pernet_id);
+}
+
+struct mptcp_sched_ops *mptcp_sched_find(const struct net *net,
+					 const char *name)
+{
+	struct sched_pernet *pernet = sched_get_pernet(net);
+	struct mptcp_sched_ops *sched, *ret = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(sched, &pernet->sched_list, list) {
+		if (!strcmp(sched->name, name)) {
+			ret = sched;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+int mptcp_register_scheduler(const struct net *net,
+			     struct mptcp_sched_ops *sched)
+{
+	struct sched_pernet *pernet = sched_get_pernet(net);
+
+	if (!sched->get_subflow)
+		return -EINVAL;
+
+	if (mptcp_sched_find(net, sched->name))
+		return -EEXIST;
+
+	spin_lock(&pernet->lock);
+	list_add_tail_rcu(&sched->list, &pernet->sched_list);
+	spin_unlock(&pernet->lock);
+
+	pr_debug("%s registered", sched->name);
+	return 0;
+}
+
+void mptcp_unregister_scheduler(const struct net *net,
+				struct mptcp_sched_ops *sched)
+{
+	struct sched_pernet *pernet = sched_get_pernet(net);
+
+	spin_lock(&pernet->lock);
+	list_del_rcu(&sched->list);
+	spin_unlock(&pernet->lock);
+
+	/* avoid workqueue lockup */
+	synchronize_rcu();
+}
+
+static int __net_init sched_init_net(struct net *net)
+{
+	struct sched_pernet *pernet = sched_get_pernet(net);
+
+	INIT_LIST_HEAD_RCU(&pernet->sched_list);
+	spin_lock_init(&pernet->lock);
+
+	return 0;
+}
+
+static void __net_exit sched_exit_net(struct net *net)
+{
+	struct sched_pernet *pernet = sched_get_pernet(net);
+	struct mptcp_sched_ops *sched;
+
+	spin_lock(&pernet->lock);
+	list_for_each_entry_rcu(sched, &pernet->sched_list, list)
+		list_del_rcu(&sched->list);
+	spin_unlock(&pernet->lock);
+}
+
+static struct pernet_operations mptcp_sched_pernet_ops = {
+	.init = sched_init_net,
+	.exit = sched_exit_net,
+	.id = &sched_pernet_id,
+	.size = sizeof(struct sched_pernet),
+};
+
+void mptcp_sched_init(void)
+{
+	if (register_pernet_subsys(&mptcp_sched_pernet_ops) < 0)
+		panic("Failed to register MPTCP sched pernet subsystem.\n");
+}
-- 
2.34.1


Re: [PATCH mptcp-next v7 1/8] mptcp: add struct mptcp_sched_ops
Posted by Florian Westphal 3 years, 10 months ago
Geliang Tang <geliang.tang@suse.com> wrote:
> This patch defines struct mptcp_sched_ops, which has three struct members,
> name, owner and list, and three function pointers, init, release and
> get_subflow.
> 
> Add the scheduler registering, unregistering and finding functions to add
> or delete or find a packet scheduler on the pernet sched_list.
> 
> Suggested-by: Florian Westphal <fw@strlen.de>

Huh?  I made some comments on earlier patch but I did not suggest this.
In fact, I don't like this at all but I guess I've been vetoed wrt.
pernet design.

> +struct mptcp_sched_ops *mptcp_sched_find(const struct net *net,
> +					 const char *name)
> +{
> +	struct sched_pernet *pernet = sched_get_pernet(net);
> +	struct mptcp_sched_ops *sched, *ret = NULL;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(sched, &pernet->sched_list, list) {
> +		if (!strcmp(sched->name, name)) {
> +			ret = sched;
> +			break;
> +		}
> +	}
> +	rcu_read_unlock();
> +	return ret;

This is fishy.  Either caller must already hold rcu read lock,
or mptcp_sched_ops can be free'd before function returns.

I suspect its best to force callers to do the rcu_read_lock and
document that this function is called with rcu read lock held.

> +int mptcp_register_scheduler(const struct net *net,
> +			     struct mptcp_sched_ops *sched)
> +{
> +	struct sched_pernet *pernet = sched_get_pernet(net);
> +
> +	if (!sched->get_subflow)
> +		return -EINVAL;
> +
> +	if (mptcp_sched_find(net, sched->name))
> +		return -EEXIST;
> +
> +	spin_lock(&pernet->lock);
> +	list_add_tail_rcu(&sched->list, &pernet->sched_list);
> +	spin_unlock(&pernet->lock);

This is racy.  Lock, test, add, unlock, not test, lock, ...

> +	spin_lock(&pernet->lock);
> +	list_del_rcu(&sched->list);
> +	spin_unlock(&pernet->lock);
> +
> +	/* avoid workqueue lockup */
> +	synchronize_rcu();

Hmm.  You mean 'wait until rcu read sides are done'?
But even that makes little sense to me.

There is no free operation done here so this is safe even
without synchronize_rcu()?

Re: [PATCH mptcp-next v7 1/8] mptcp: add struct mptcp_sched_ops
Posted by Geliang Tang 3 years, 10 months ago
On Mon, Mar 28, 2022 at 11:49:19AM +0200, Florian Westphal wrote:
> Geliang Tang <geliang.tang@suse.com> wrote:
> > This patch defines struct mptcp_sched_ops, which has three struct members,
> > name, owner and list, and three function pointers, init, release and
> > get_subflow.
> > 
> > Add the scheduler registering, unregistering and finding functions to add
> > or delete or find a packet scheduler on the pernet sched_list.
> > 
> > Suggested-by: Florian Westphal <fw@strlen.de>
> 
> Huh?  I made some comments on earlier patch but I did not suggest this.
> In fact, I don't like this at all but I guess I've been vetoed wrt.
> pernet design.

Sorry Florian, I didn't use 'Suggested-by' tag correctly here. I'll drop
it in v8. At the weekly meeting last week, everyone did agree to use the
global list instead of the pernet list to implement sched list. I have no
strong intention of which type of list to use. I just think that if we
used pernet local_addr_list in PM netlink, it makes sense for sched list
to use pernet too.

How about switching to use global sched list in v8? I guess Mat won't
object.

> 
> > +struct mptcp_sched_ops *mptcp_sched_find(const struct net *net,
> > +					 const char *name)
> > +{
> > +	struct sched_pernet *pernet = sched_get_pernet(net);
> > +	struct mptcp_sched_ops *sched, *ret = NULL;
> > +
> > +	rcu_read_lock();
> > +	list_for_each_entry_rcu(sched, &pernet->sched_list, list) {
> > +		if (!strcmp(sched->name, name)) {
> > +			ret = sched;
> > +			break;
> > +		}
> > +	}
> > +	rcu_read_unlock();
> > +	return ret;
> 
> This is fishy.  Either caller must already hold rcu read lock,
> or mptcp_sched_ops can be free'd before function returns.
> 
> I suspect its best to force callers to do the rcu_read_lock and
> document that this function is called with rcu read lock held.

Agree, update in v8.

> 
> > +int mptcp_register_scheduler(const struct net *net,
> > +			     struct mptcp_sched_ops *sched)
> > +{
> > +	struct sched_pernet *pernet = sched_get_pernet(net);
> > +
> > +	if (!sched->get_subflow)
> > +		return -EINVAL;
> > +
> > +	if (mptcp_sched_find(net, sched->name))
> > +		return -EEXIST;
> > +
> > +	spin_lock(&pernet->lock);
> > +	list_add_tail_rcu(&sched->list, &pernet->sched_list);
> > +	spin_unlock(&pernet->lock);
> 
> This is racy.  Lock, test, add, unlock, not test, lock, ...

How should I fix this?

> 
> > +	spin_lock(&pernet->lock);
> > +	list_del_rcu(&sched->list);
> > +	spin_unlock(&pernet->lock);
> > +
> > +	/* avoid workqueue lockup */
> > +	synchronize_rcu();
> 
> Hmm.  You mean 'wait until rcu read sides are done'?
> But even that makes little sense to me.
> 
> There is no free operation done here so this is safe even
> without synchronize_rcu()?

To be honest, I don't know if I should use synchronize_rcu() here. But
I've tested it. If remove this synchronize_rcu(), kernel will crash, with
"BUG: workqueue lockup - pool ...!"

Thanks,
-Geliang

> 


Re: [PATCH mptcp-next v7 1/8] mptcp: add struct mptcp_sched_ops
Posted by Florian Westphal 3 years, 10 months ago
Geliang Tang <geliang.tang@suse.com> wrote:
> > > +	if (!sched->get_subflow)
> > > +		return -EINVAL;
> > > +
> > > +	if (mptcp_sched_find(net, sched->name))
> > > +		return -EEXIST;
> > > +
> > > +	spin_lock(&pernet->lock);
> > > +	list_add_tail_rcu(&sched->list, &pernet->sched_list);
> > > +	spin_unlock(&pernet->lock);
> > 
> > This is racy.  Lock, test, add, unlock, not test, lock, ...
> 
> How should I fix this?

spin_lock(&pernet->lock);
if (mptcp_sched_find(net, sched->name)) {
	spin_unlock(&pernet->lock);
	return -EEXIST;
}
list_add_....