[PATCH mptcp-next 09/10] selftests/bpf: Add bpf_burst scheduler

Maintainers: Matthieu Baerts <matthieu.baerts@tessares.net>, Mat Martineau <martineau@kernel.org>, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Alexei Starovoitov <ast@kernel.org>, Daniel Borkmann <daniel@iogearbox.net>, Andrii Nakryiko <andrii@kernel.org>, Martin KaFai Lau <martin.lau@linux.dev>, Song Liu <song@kernel.org>, Yonghong Song <yhs@fb.com>, John Fastabend <john.fastabend@gmail.com>, KP Singh <kpsingh@kernel.org>, Stanislav Fomichev <sdf@google.com>, Hao Luo <haoluo@google.com>, Jiri Olsa <jolsa@kernel.org>, Mykola Lysenko <mykolal@fb.com>, Shuah Khan <shuah@kernel.org>
Posted by Geliang Tang 2 years, 7 months ago
This patch implements the burst BPF MPTCP scheduler, named bpf_burst,
which mirrors the default in-kernel scheduler in protocol.c:
bpf_burst_get_send() uses the same logic as mptcp_subflow_get_send()
and bpf_burst_get_retrans() uses the same logic as
mptcp_subflow_get_retrans().
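
Not part of this patch, but for reference, a userspace caller would
typically attach the struct_ops map and then pick the scheduler by
name. A minimal sketch, assuming a bpftool-generated skeleton for
mptcp_bpf_burst.c and the net.mptcp.scheduler sysctl from the
in-kernel scheduler series:

	#include <stdlib.h>
	#include <bpf/libbpf.h>
	#include "mptcp_bpf_burst.skel.h"	/* assumed bpftool-generated skeleton */

	static int use_bpf_burst(void)
	{
		struct mptcp_bpf_burst *skel;
		struct bpf_link *link;
		int err = -1;

		skel = mptcp_bpf_burst__open_and_load();
		if (!skel)
			return -1;

		/* register the "bpf_burst" struct_ops with the MPTCP scheduler framework */
		link = bpf_map__attach_struct_ops(skel->maps.burst);
		if (!link)
			goto destroy;

		/* make new MPTCP sockets use the BPF scheduler (assumed sysctl name) */
		if (!system("sysctl -qw net.mptcp.scheduler=bpf_burst"))
			err = 0;

		/* ... run traffic over the MPTCP connection here ... */

		bpf_link__destroy(link);
	destroy:
		mptcp_bpf_burst__destroy(skel);
		return err;
	}

The actual selftest plumbing may differ; the sketch only illustrates
the attach and select flow.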

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 tools/testing/selftests/bpf/bpf_tcp_helpers.h |   4 +
 .../selftests/bpf/progs/mptcp_bpf_burst.c     | 205 ++++++++++++++++++
 2 files changed, 209 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c

diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
index c749940c9103..c1d7963c3bc8 100644
--- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
@@ -36,6 +36,7 @@ enum sk_pacing {
 struct sock {
 	struct sock_common	__sk_common;
 #define sk_state		__sk_common.skc_state
+	int			sk_wmem_queued;
 	unsigned long		sk_pacing_rate;
 	__u32			sk_pacing_status; /* see enum sk_pacing */
 } __attribute__((preserve_access_index));
@@ -234,8 +235,10 @@ extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
 #define MPTCP_SUBFLOWS_MAX	8
 
 struct mptcp_subflow_context {
+	unsigned long avg_pacing_rate;
 	__u32	backup : 1,
 		stale : 1;
+	__u8	stale_count;
 	struct	sock *tcp_sock;	    /* tcp sk backpointer */
 } __attribute__((preserve_access_index));
 
@@ -260,6 +263,7 @@ struct mptcp_sched_ops {
 struct mptcp_sock {
 	struct inet_connection_sock	sk;
 
+	__u64		snd_nxt;
 	__u32		token;
 	struct sock	*first;
 	char		ca_name[TCP_CA_NAME_MAX];
diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
new file mode 100644
index 000000000000..1886e2f7aca4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023, SUSE. */
+
+#include <linux/bpf.h>
+#include <limits.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct mptcp_burst_storage {
+	int snd_burst;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct mptcp_burst_storage);
+} mptcp_burst_map SEC(".maps");
+
+#define MPTCP_SEND_BURST_SIZE	65428
+
+struct subflow_send_info {
+	__u8 subflow_id;
+	__u64 linger_time;
+};
+
+static inline __u64 div_u64_rem(__u64 dividend, __u32 divisor, __u32 *remainder)
+{
+	*remainder = dividend % divisor;
+	return dividend / divisor;
+}
+
+static inline __u64 div_u64(__u64 dividend, __u32 divisor)
+{
+	__u32 remainder;
+
+	return div_u64_rem(dividend, divisor, &remainder);
+}
+
+extern bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) __ksym;
+extern void mptcp_set_timeout(struct sock *sk) __ksym;
+extern __u64 mptcp_wnd_end(const struct mptcp_sock *msk) __ksym;
+extern bool bpf_sk_stream_memory_free(const struct sock *sk) __ksym;
+extern bool bpf_tcp_rtx_and_write_queues_empty(const struct sock *sk) __ksym;
+extern void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) __ksym;
+
+#define SSK_MODE_ACTIVE	0
+#define SSK_MODE_BACKUP	1
+#define SSK_MODE_MAX	2
+
+SEC("struct_ops/mptcp_sched_burst_init")
+void BPF_PROG(mptcp_sched_burst_init, struct mptcp_sock *msk)
+{
+}
+
+SEC("struct_ops/mptcp_sched_burst_release")
+void BPF_PROG(mptcp_sched_burst_release, struct mptcp_sock *msk)
+{
+	bpf_sk_storage_delete(&mptcp_burst_map, msk);
+}
+
+void BPF_STRUCT_OPS(bpf_burst_data_init, struct mptcp_sock *msk,
+		    struct mptcp_sched_data *data)
+{
+	mptcp_sched_data_set_contexts(msk, data);
+}
+
+static int bpf_burst_get_send(struct mptcp_sock *msk,
+			      const struct mptcp_sched_data *data)
+{
+	struct subflow_send_info send_info[SSK_MODE_MAX];
+	struct mptcp_subflow_context *subflow;
+	struct sock *sk = (struct sock *)msk;
+	struct mptcp_burst_storage *ptr;
+	__u32 pace, burst, wmem;
+	__u64 linger_time;
+	struct sock *ssk;
+	int i;
+
+	/* pick the subflow with the lower wmem/wspace ratio */
+	for (i = 0; i < SSK_MODE_MAX; ++i) {
+		send_info[i].subflow_id = MPTCP_SUBFLOWS_MAX;
+		send_info[i].linger_time = -1;
+	}
+
+	for (i = 0; i < data->subflows && i < MPTCP_SUBFLOWS_MAX; i++) {
+		subflow = mptcp_subflow_ctx_by_pos(data, i);
+		if (!subflow)
+			break;
+
+		ssk = mptcp_subflow_tcp_sock(subflow);
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
+		pace = subflow->avg_pacing_rate;
+		if (!pace) {
+			/* init pacing rate from socket */
+			subflow->avg_pacing_rate = ssk->sk_pacing_rate;
+			pace = subflow->avg_pacing_rate;
+			if (!pace)
+				continue;
+		}
+
+		linger_time = div_u64((__u64)ssk->sk_wmem_queued << 32, pace);
+		if (linger_time < send_info[subflow->backup].linger_time) {
+			send_info[subflow->backup].subflow_id = i;
+			send_info[subflow->backup].linger_time = linger_time;
+		}
+	}
+	mptcp_set_timeout(sk);
+
+	/* pick the best backup if no other subflow is active */
+	if (send_info[SSK_MODE_ACTIVE].subflow_id == MPTCP_SUBFLOWS_MAX)
+		send_info[SSK_MODE_ACTIVE].subflow_id = send_info[SSK_MODE_BACKUP].subflow_id;
+
+	subflow = mptcp_subflow_ctx_by_pos(data, send_info[SSK_MODE_ACTIVE].subflow_id);
+	if (!subflow)
+		return -1;
+	ssk = mptcp_subflow_tcp_sock(subflow);
+	if (!ssk || !bpf_sk_stream_memory_free(ssk))
+		return -1;
+
+	burst = min(MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
+	wmem = ssk->sk_wmem_queued;
+	if (!burst)
+		goto out;
+
+	subflow->avg_pacing_rate = div_u64((__u64)subflow->avg_pacing_rate * wmem +
+					   ssk->sk_pacing_rate * burst,
+					   burst + wmem);
+	ptr = bpf_sk_storage_get(&mptcp_burst_map, msk, 0,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (ptr)
+		ptr->snd_burst = burst;
+
+out:
+	mptcp_subflow_set_scheduled(subflow, true);
+	return 0;
+}
+
+static int bpf_burst_get_retrans(struct mptcp_sock *msk,
+				 const struct mptcp_sched_data *data)
+{
+	int backup = MPTCP_SUBFLOWS_MAX, pick = MPTCP_SUBFLOWS_MAX, subflow_id;
+	struct mptcp_subflow_context *subflow;
+	int min_stale_count = INT_MAX;
+	struct sock *ssk;
+
+	for (int i = 0; i < data->subflows && i < MPTCP_SUBFLOWS_MAX; i++) {
+		subflow = mptcp_subflow_ctx_by_pos(data, i);
+		if (!subflow)
+			break;
+
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
+		ssk = mptcp_subflow_tcp_sock(subflow);
+		/* still data outstanding at TCP level? skip this */
+		if (!bpf_tcp_rtx_and_write_queues_empty(ssk)) {
+			mptcp_pm_subflow_chk_stale(msk, ssk);
+			min_stale_count = min(min_stale_count, subflow->stale_count);
+			continue;
+		}
+
+		if (subflow->backup) {
+			if (backup == MPTCP_SUBFLOWS_MAX)
+				backup = i;
+			continue;
+		}
+
+		if (pick == MPTCP_SUBFLOWS_MAX)
+			pick = i;
+	}
+
+	if (pick < MPTCP_SUBFLOWS_MAX) {
+		subflow_id = pick;
+		goto out;
+	}
+	subflow_id = min_stale_count > 1 ? backup : MPTCP_SUBFLOWS_MAX;
+
+out:
+	subflow = mptcp_subflow_ctx_by_pos(data, subflow_id);
+	if (!subflow)
+		return -1;
+	mptcp_subflow_set_scheduled(subflow, true);
+	return 0;
+}
+
+int BPF_STRUCT_OPS(bpf_burst_get_subflow, struct mptcp_sock *msk,
+		   const struct mptcp_sched_data *data)
+{
+	if (data->reinject)
+		return bpf_burst_get_retrans(msk, data);
+	return bpf_burst_get_send(msk, data);
+}
+
+SEC(".struct_ops")
+struct mptcp_sched_ops burst = {
+	.init		= (void *)mptcp_sched_burst_init,
+	.release	= (void *)mptcp_sched_burst_release,
+	.data_init	= (void *)bpf_burst_data_init,
+	.get_subflow	= (void *)bpf_burst_get_subflow,
+	.name		= "bpf_burst",
+};
-- 
2.35.3
Re: [PATCH mptcp-next 09/10] selftests/bpf: Add bpf_burst scheduler
Posted by Mat Martineau 2 years, 7 months ago
On Tue, 27 Jun 2023, Geliang Tang wrote:

> [...]
> +static inline __u64 div_u64_rem(__u64 dividend, __u32 divisor, __u32 *remainder)
> +{
> +	*remainder = dividend % divisor;
> +	return dividend / divisor;
> +}
> +
> +static inline __u64 div_u64(__u64 dividend, __u32 divisor)
> +{
> +	__u32 remainder;
> +
> +	return div_u64_rem(dividend, divisor, &remainder);
> +}

Since this is compiling to BPF rather than native code with 
architecture-specific optimizations, I think it's better to remove 
div_u64_rem() and keep div_u64() simple:

static inline __u64 div_u64(__u64 dividend, __u32 divisor)
{
      return dividend / divisor;
}

- Mat
