[v2] tun: Introduce virtio-net hashing feature

[RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH

Posted by Akihiko Odaki 2 years, 3 months ago

This new program type will be used by tun to determine the queues to
deliver packets and the hash values and types reported with virtio-net
headers.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
---
 Documentation/bpf/bpf_prog_run.rst         |  1 +
 Documentation/bpf/libbpf/program_types.rst |  2 ++
 include/linux/bpf_types.h                  |  2 ++
 include/uapi/linux/bpf.h                   |  5 +++++
 kernel/bpf/verifier.c                      |  6 ++++++
 net/core/filter.c                          | 11 +++++++++++
 tools/include/uapi/linux/bpf.h             |  1 +
 tools/lib/bpf/libbpf.c                     |  2 ++
 8 files changed, 30 insertions(+)

diff --git a/Documentation/bpf/bpf_prog_run.rst b/Documentation/bpf/bpf_prog_run.rst
index 4868c909df5c..0d108d867c03 100644
--- a/Documentation/bpf/bpf_prog_run.rst
+++ b/Documentation/bpf/bpf_prog_run.rst
@@ -39,6 +39,7 @@ following types:
 - ``BPF_PROG_TYPE_STRUCT_OPS``
 - ``BPF_PROG_TYPE_RAW_TRACEPOINT``
 - ``BPF_PROG_TYPE_SYSCALL``
+- ``BPF_PROG_TYPE_VNET_HASH``
 
 When using the ``BPF_PROG_RUN`` command, userspace supplies an input context
 object and (for program types operating on network packets) a buffer containing
diff --git a/Documentation/bpf/libbpf/program_types.rst b/Documentation/bpf/libbpf/program_types.rst
index ad4d4d5eecb0..6be53201f91b 100644
--- a/Documentation/bpf/libbpf/program_types.rst
+++ b/Documentation/bpf/libbpf/program_types.rst
@@ -171,6 +171,8 @@ described in more detail in the footnotes.
 +                                           +----------------------------------------+----------------------------------+-----------+
 |                                           | ``BPF_TRACE_RAW_TP``                   | ``tp_btf+`` [#fentry]_           |           |
 +-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_VNET_HASH``               |                                        | ``vnet_hash``                    |           |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
 | ``BPF_PROG_TYPE_XDP``                     | ``BPF_XDP_CPUMAP``                     | ``xdp.frags/cpumap``             |           |
 +                                           +                                        +----------------------------------+-----------+
 |                                           |                                        | ``xdp/cpumap``                   |           |
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c687..dec83d495e82 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -34,6 +34,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg,
 	      struct sk_msg_md, struct sk_msg)
 BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector,
 	      struct __sk_buff, struct bpf_flow_dissector)
+BPF_PROG_TYPE(BPF_PROG_TYPE_VNET_HASH, vnet_hash,
+	      struct __sk_buff, struct sk_buff)
 #endif
 #ifdef CONFIG_BPF_EVENTS
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0448700890f7..298634556fab 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -988,6 +988,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_VNET_HASH,
 };
 
 enum bpf_attach_type {
@@ -6111,6 +6112,10 @@ struct __sk_buff {
 	__u8  tstamp_type;
 	__u32 :24;		/* Padding, future use. */
 	__u64 hwtstamp;
+
+	__u32 vnet_hash_value;
+	__u16 vnet_hash_report;
+	__u16 vnet_rss_queue;
 };
 
 struct bpf_tunnel_key {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb78212fa5b2..fd6d842635d2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14373,6 +14373,7 @@ static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_VNET_HASH:
 		return true;
 	default:
 		return false;
@@ -16973,6 +16974,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 
+		if (prog_type == BPF_PROG_TYPE_VNET_HASH) {
+			verbose(env, "vnet hash progs cannot use bpf_spin_lock yet\n");
+			return -EINVAL;
+		}
+
 		if (is_tracing_prog_type(prog_type)) {
 			verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
 			return -EINVAL;
diff --git a/net/core/filter.c b/net/core/filter.c
index a094694899c9..867edbc628de 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10967,6 +10967,17 @@ const struct bpf_prog_ops flow_dissector_prog_ops = {
 	.test_run		= bpf_prog_test_run_flow_dissector,
 };
 
+const struct bpf_verifier_ops vnet_hash_verifier_ops = {
+	.get_func_proto		= sk_filter_func_proto,
+	.is_valid_access	= sk_filter_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+	.gen_ld_abs		= bpf_gen_ld_abs,
+};
+
+const struct bpf_prog_ops vnet_hash_prog_ops = {
+	.test_run		= bpf_prog_test_run_skb,
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0448700890f7..60976fe86247 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -988,6 +988,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_VNET_HASH,
 };
 
 enum bpf_attach_type {
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 96ff1aa4bf6a..e74d136eae07 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -209,6 +209,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_SK_LOOKUP]		= "sk_lookup",
 	[BPF_PROG_TYPE_SYSCALL]			= "syscall",
 	[BPF_PROG_TYPE_NETFILTER]		= "netfilter",
+	[BPF_PROG_TYPE_VNET_HASH]		= "vnet_hash",
 };
 
 static int __base_pr(enum libbpf_print_level level, const char *format,
@@ -8858,6 +8859,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("struct_ops.s+",	STRUCT_OPS, 0, SEC_SLEEPABLE),
 	SEC_DEF("sk_lookup",		SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
 	SEC_DEF("netfilter",		NETFILTER, BPF_NETFILTER, SEC_NONE),
+	SEC_DEF("vnet_hash",		VNET_HASH, 0, SEC_NONE),
 };
 
 int libbpf_register_prog_handler(const char *sec,
-- 
2.42.0

Re: [RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH

Posted by Alexei Starovoitov 2 years, 3 months ago

On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 0448700890f7..298634556fab 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -988,6 +988,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_SK_LOOKUP,
>         BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
>         BPF_PROG_TYPE_NETFILTER,
> +       BPF_PROG_TYPE_VNET_HASH,

Sorry, we do not add new stable program types anymore.

> @@ -6111,6 +6112,10 @@ struct __sk_buff {
>         __u8  tstamp_type;
>         __u32 :24;              /* Padding, future use. */
>         __u64 hwtstamp;
> +
> +       __u32 vnet_hash_value;
> +       __u16 vnet_hash_report;
> +       __u16 vnet_rss_queue;
>  };

we also do not add anything to uapi __sk_buff.

> +const struct bpf_verifier_ops vnet_hash_verifier_ops = {
> +       .get_func_proto         = sk_filter_func_proto,
> +       .is_valid_access        = sk_filter_is_valid_access,
> +       .convert_ctx_access     = bpf_convert_ctx_access,
> +       .gen_ld_abs             = bpf_gen_ld_abs,
> +};

and we don't do ctx rewrites like this either.

Please see how hid-bpf and cgroup rstat are hooking up bpf
in _unstable_ way.

Re: [RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH

Posted by Akihiko Odaki 2 years, 3 months ago

On 2023/10/16 1:07, Alexei Starovoitov wrote:
> On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 0448700890f7..298634556fab 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -988,6 +988,7 @@ enum bpf_prog_type {
>>          BPF_PROG_TYPE_SK_LOOKUP,
>>          BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
>>          BPF_PROG_TYPE_NETFILTER,
>> +       BPF_PROG_TYPE_VNET_HASH,
> 
> Sorry, we do not add new stable program types anymore.
> 
>> @@ -6111,6 +6112,10 @@ struct __sk_buff {
>>          __u8  tstamp_type;
>>          __u32 :24;              /* Padding, future use. */
>>          __u64 hwtstamp;
>> +
>> +       __u32 vnet_hash_value;
>> +       __u16 vnet_hash_report;
>> +       __u16 vnet_rss_queue;
>>   };
> 
> we also do not add anything to uapi __sk_buff.
> 
>> +const struct bpf_verifier_ops vnet_hash_verifier_ops = {
>> +       .get_func_proto         = sk_filter_func_proto,
>> +       .is_valid_access        = sk_filter_is_valid_access,
>> +       .convert_ctx_access     = bpf_convert_ctx_access,
>> +       .gen_ld_abs             = bpf_gen_ld_abs,
>> +};
> 
> and we don't do ctx rewrites like this either.
> 
> Please see how hid-bpf and cgroup rstat are hooking up bpf
> in _unstable_ way.

Can you describe what "stable" and "unstable" mean here? I'm new to BPF 
and I'm worried if it may mean the interface stability.

Let me describe the context. QEMU bundles an eBPF program that is used 
for the "eBPF steering program" feature of tun. Now I'm proposing to 
extend the feature to allow to return some values to the userspace and 
vhost_net. As such, the extension needs to be done in a way that ensures 
interface stability.

Re: [RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH

Posted by Alexei Starovoitov 2 years, 3 months ago

On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> On 2023/10/16 1:07, Alexei Starovoitov wrote:
> > On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index 0448700890f7..298634556fab 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -988,6 +988,7 @@ enum bpf_prog_type {
> >>          BPF_PROG_TYPE_SK_LOOKUP,
> >>          BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> >>          BPF_PROG_TYPE_NETFILTER,
> >> +       BPF_PROG_TYPE_VNET_HASH,
> >
> > Sorry, we do not add new stable program types anymore.
> >
> >> @@ -6111,6 +6112,10 @@ struct __sk_buff {
> >>          __u8  tstamp_type;
> >>          __u32 :24;              /* Padding, future use. */
> >>          __u64 hwtstamp;
> >> +
> >> +       __u32 vnet_hash_value;
> >> +       __u16 vnet_hash_report;
> >> +       __u16 vnet_rss_queue;
> >>   };
> >
> > we also do not add anything to uapi __sk_buff.
> >
> >> +const struct bpf_verifier_ops vnet_hash_verifier_ops = {
> >> +       .get_func_proto         = sk_filter_func_proto,
> >> +       .is_valid_access        = sk_filter_is_valid_access,
> >> +       .convert_ctx_access     = bpf_convert_ctx_access,
> >> +       .gen_ld_abs             = bpf_gen_ld_abs,
> >> +};
> >
> > and we don't do ctx rewrites like this either.
> >
> > Please see how hid-bpf and cgroup rstat are hooking up bpf
> > in _unstable_ way.
>
> Can you describe what "stable" and "unstable" mean here? I'm new to BPF
> and I'm worried if it may mean the interface stability.
>
> Let me describe the context. QEMU bundles an eBPF program that is used
> for the "eBPF steering program" feature of tun. Now I'm proposing to
> extend the feature to allow to return some values to the userspace and
> vhost_net. As such, the extension needs to be done in a way that ensures
> interface stability.

bpf is not an option then.
we do not add stable bpf program types or hooks any more.
If a kernel subsystem wants to use bpf it needs to accept the fact
that such bpf extensibility will be unstable and subsystem maintainers
can decide to remove such bpf support in the future.

Re: [RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH

Posted by Jason Wang 2 years, 3 months ago

On Tue, Oct 17, 2023 at 7:53 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >
> > On 2023/10/16 1:07, Alexei Starovoitov wrote:
> > > On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> > >>
> > >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > >> index 0448700890f7..298634556fab 100644
> > >> --- a/include/uapi/linux/bpf.h
> > >> +++ b/include/uapi/linux/bpf.h
> > >> @@ -988,6 +988,7 @@ enum bpf_prog_type {
> > >>          BPF_PROG_TYPE_SK_LOOKUP,
> > >>          BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> > >>          BPF_PROG_TYPE_NETFILTER,
> > >> +       BPF_PROG_TYPE_VNET_HASH,
> > >
> > > Sorry, we do not add new stable program types anymore.
> > >
> > >> @@ -6111,6 +6112,10 @@ struct __sk_buff {
> > >>          __u8  tstamp_type;
> > >>          __u32 :24;              /* Padding, future use. */
> > >>          __u64 hwtstamp;
> > >> +
> > >> +       __u32 vnet_hash_value;
> > >> +       __u16 vnet_hash_report;
> > >> +       __u16 vnet_rss_queue;
> > >>   };
> > >
> > > we also do not add anything to uapi __sk_buff.
> > >
> > >> +const struct bpf_verifier_ops vnet_hash_verifier_ops = {
> > >> +       .get_func_proto         = sk_filter_func_proto,
> > >> +       .is_valid_access        = sk_filter_is_valid_access,
> > >> +       .convert_ctx_access     = bpf_convert_ctx_access,
> > >> +       .gen_ld_abs             = bpf_gen_ld_abs,
> > >> +};
> > >
> > > and we don't do ctx rewrites like this either.
> > >
> > > Please see how hid-bpf and cgroup rstat are hooking up bpf
> > > in _unstable_ way.
> >
> > Can you describe what "stable" and "unstable" mean here? I'm new to BPF
> > and I'm worried if it may mean the interface stability.
> >
> > Let me describe the context. QEMU bundles an eBPF program that is used
> > for the "eBPF steering program" feature of tun. Now I'm proposing to
> > extend the feature to allow to return some values to the userspace and
> > vhost_net. As such, the extension needs to be done in a way that ensures
> > interface stability.
>
> bpf is not an option then.
> we do not add stable bpf program types or hooks any more.

Does this mean eBPF could not be used for any new use cases other than
the existing ones?

> If a kernel subsystem wants to use bpf it needs to accept the fact
> that such bpf extensibility will be unstable and subsystem maintainers
> can decide to remove such bpf support in the future.

I don't see how it is different from the existing ones.

Thanks

>

Re: [RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH

Posted by Alexei Starovoitov 2 years, 3 months ago

On Mon, Oct 16, 2023 at 7:38 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Tue, Oct 17, 2023 at 7:53 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> > >
> > > On 2023/10/16 1:07, Alexei Starovoitov wrote:
> > > > On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> > > >>
> > > >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > >> index 0448700890f7..298634556fab 100644
> > > >> --- a/include/uapi/linux/bpf.h
> > > >> +++ b/include/uapi/linux/bpf.h
> > > >> @@ -988,6 +988,7 @@ enum bpf_prog_type {
> > > >>          BPF_PROG_TYPE_SK_LOOKUP,
> > > >>          BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> > > >>          BPF_PROG_TYPE_NETFILTER,
> > > >> +       BPF_PROG_TYPE_VNET_HASH,
> > > >
> > > > Sorry, we do not add new stable program types anymore.
> > > >
> > > >> @@ -6111,6 +6112,10 @@ struct __sk_buff {
> > > >>          __u8  tstamp_type;
> > > >>          __u32 :24;              /* Padding, future use. */
> > > >>          __u64 hwtstamp;
> > > >> +
> > > >> +       __u32 vnet_hash_value;
> > > >> +       __u16 vnet_hash_report;
> > > >> +       __u16 vnet_rss_queue;
> > > >>   };
> > > >
> > > > we also do not add anything to uapi __sk_buff.
> > > >
> > > >> +const struct bpf_verifier_ops vnet_hash_verifier_ops = {
> > > >> +       .get_func_proto         = sk_filter_func_proto,
> > > >> +       .is_valid_access        = sk_filter_is_valid_access,
> > > >> +       .convert_ctx_access     = bpf_convert_ctx_access,
> > > >> +       .gen_ld_abs             = bpf_gen_ld_abs,
> > > >> +};
> > > >
> > > > and we don't do ctx rewrites like this either.
> > > >
> > > > Please see how hid-bpf and cgroup rstat are hooking up bpf
> > > > in _unstable_ way.
> > >
> > > Can you describe what "stable" and "unstable" mean here? I'm new to BPF
> > > and I'm worried if it may mean the interface stability.
> > >
> > > Let me describe the context. QEMU bundles an eBPF program that is used
> > > for the "eBPF steering program" feature of tun. Now I'm proposing to
> > > extend the feature to allow to return some values to the userspace and
> > > vhost_net. As such, the extension needs to be done in a way that ensures
> > > interface stability.
> >
> > bpf is not an option then.
> > we do not add stable bpf program types or hooks any more.
>
> Does this mean eBPF could not be used for any new use cases other than
> the existing ones?

It means that any new use of bpf has to be unstable for the time being.

> > If a kernel subsystem wants to use bpf it needs to accept the fact
> > that such bpf extensibility will be unstable and subsystem maintainers
> > can decide to remove such bpf support in the future.
>
> I don't see how it is different from the existing ones.

Can we remove BPF_CGROUP_RUN_PROG_INET_INGRESS hook along
with BPF_PROG_TYPE_CGROUP_SKB program type?
Obviously not.
We can refactor it. We can move it around, but not remove.
That's the difference in stable vs unstable.

Re: [RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH

Posted by Akihiko Odaki 2 years, 3 months ago

On 2023/10/18 4:03, Alexei Starovoitov wrote:
> On Mon, Oct 16, 2023 at 7:38 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On Tue, Oct 17, 2023 at 7:53 AM Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>>>
>>> On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>
>>>> On 2023/10/16 1:07, Alexei Starovoitov wrote:
>>>>> On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>>>
>>>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>>>> index 0448700890f7..298634556fab 100644
>>>>>> --- a/include/uapi/linux/bpf.h
>>>>>> +++ b/include/uapi/linux/bpf.h
>>>>>> @@ -988,6 +988,7 @@ enum bpf_prog_type {
>>>>>>           BPF_PROG_TYPE_SK_LOOKUP,
>>>>>>           BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
>>>>>>           BPF_PROG_TYPE_NETFILTER,
>>>>>> +       BPF_PROG_TYPE_VNET_HASH,
>>>>>
>>>>> Sorry, we do not add new stable program types anymore.
>>>>>
>>>>>> @@ -6111,6 +6112,10 @@ struct __sk_buff {
>>>>>>           __u8  tstamp_type;
>>>>>>           __u32 :24;              /* Padding, future use. */
>>>>>>           __u64 hwtstamp;
>>>>>> +
>>>>>> +       __u32 vnet_hash_value;
>>>>>> +       __u16 vnet_hash_report;
>>>>>> +       __u16 vnet_rss_queue;
>>>>>>    };
>>>>>
>>>>> we also do not add anything to uapi __sk_buff.
>>>>>
>>>>>> +const struct bpf_verifier_ops vnet_hash_verifier_ops = {
>>>>>> +       .get_func_proto         = sk_filter_func_proto,
>>>>>> +       .is_valid_access        = sk_filter_is_valid_access,
>>>>>> +       .convert_ctx_access     = bpf_convert_ctx_access,
>>>>>> +       .gen_ld_abs             = bpf_gen_ld_abs,
>>>>>> +};
>>>>>
>>>>> and we don't do ctx rewrites like this either.
>>>>>
>>>>> Please see how hid-bpf and cgroup rstat are hooking up bpf
>>>>> in _unstable_ way.
>>>>
>>>> Can you describe what "stable" and "unstable" mean here? I'm new to BPF
>>>> and I'm worried if it may mean the interface stability.
>>>>
>>>> Let me describe the context. QEMU bundles an eBPF program that is used
>>>> for the "eBPF steering program" feature of tun. Now I'm proposing to
>>>> extend the feature to allow to return some values to the userspace and
>>>> vhost_net. As such, the extension needs to be done in a way that ensures
>>>> interface stability.
>>>
>>> bpf is not an option then.
>>> we do not add stable bpf program types or hooks any more.
>>
>> Does this mean eBPF could not be used for any new use cases other than
>> the existing ones?
> 
> It means that any new use of bpf has to be unstable for the time being.

Can you elaborate more about making new use unstable "for the time 
being?" Is it a temporary situation? What is the rationale for that? 
Such information will help devise a solution that is best for both of 
the BPF and network subsystems.

I would also appreciate if you have some documentation or link to 
relevant discussions on the mailing list. That will avoid having same 
discussion you may already have done in the past.

Re: [RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH

Posted by Akihiko Odaki 2 years, 2 months ago

On 2023/10/18 4:19, Akihiko Odaki wrote:
> On 2023/10/18 4:03, Alexei Starovoitov wrote:
>> On Mon, Oct 16, 2023 at 7:38 PM Jason Wang <jasowang@redhat.com> wrote:
>>>
>>> On Tue, Oct 17, 2023 at 7:53 AM Alexei Starovoitov
>>> <alexei.starovoitov@gmail.com> wrote:
>>>>
>>>> On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki 
>>>> <akihiko.odaki@daynix.com> wrote:
>>>>>
>>>>> On 2023/10/16 1:07, Alexei Starovoitov wrote:
>>>>>> On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki 
>>>>>> <akihiko.odaki@daynix.com> wrote:
>>>>>>>
>>>>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>>>>> index 0448700890f7..298634556fab 100644
>>>>>>> --- a/include/uapi/linux/bpf.h
>>>>>>> +++ b/include/uapi/linux/bpf.h
>>>>>>> @@ -988,6 +988,7 @@ enum bpf_prog_type {
>>>>>>>           BPF_PROG_TYPE_SK_LOOKUP,
>>>>>>>           BPF_PROG_TYPE_SYSCALL, /* a program that can execute 
>>>>>>> syscalls */
>>>>>>>           BPF_PROG_TYPE_NETFILTER,
>>>>>>> +       BPF_PROG_TYPE_VNET_HASH,
>>>>>>
>>>>>> Sorry, we do not add new stable program types anymore.
>>>>>>
>>>>>>> @@ -6111,6 +6112,10 @@ struct __sk_buff {
>>>>>>>           __u8  tstamp_type;
>>>>>>>           __u32 :24;              /* Padding, future use. */
>>>>>>>           __u64 hwtstamp;
>>>>>>> +
>>>>>>> +       __u32 vnet_hash_value;
>>>>>>> +       __u16 vnet_hash_report;
>>>>>>> +       __u16 vnet_rss_queue;
>>>>>>>    };
>>>>>>
>>>>>> we also do not add anything to uapi __sk_buff.
>>>>>>
>>>>>>> +const struct bpf_verifier_ops vnet_hash_verifier_ops = {
>>>>>>> +       .get_func_proto         = sk_filter_func_proto,
>>>>>>> +       .is_valid_access        = sk_filter_is_valid_access,
>>>>>>> +       .convert_ctx_access     = bpf_convert_ctx_access,
>>>>>>> +       .gen_ld_abs             = bpf_gen_ld_abs,
>>>>>>> +};
>>>>>>
>>>>>> and we don't do ctx rewrites like this either.
>>>>>>
>>>>>> Please see how hid-bpf and cgroup rstat are hooking up bpf
>>>>>> in _unstable_ way.
>>>>>
>>>>> Can you describe what "stable" and "unstable" mean here? I'm new to 
>>>>> BPF
>>>>> and I'm worried if it may mean the interface stability.
>>>>>
>>>>> Let me describe the context. QEMU bundles an eBPF program that is used
>>>>> for the "eBPF steering program" feature of tun. Now I'm proposing to
>>>>> extend the feature to allow to return some values to the userspace and
>>>>> vhost_net. As such, the extension needs to be done in a way that 
>>>>> ensures
>>>>> interface stability.
>>>>
>>>> bpf is not an option then.
>>>> we do not add stable bpf program types or hooks any more.
>>>
>>> Does this mean eBPF could not be used for any new use cases other than
>>> the existing ones?
>>
>> It means that any new use of bpf has to be unstable for the time being.
> 
> Can you elaborate more about making new use unstable "for the time 
> being?" Is it a temporary situation? What is the rationale for that? 
> Such information will help devise a solution that is best for both of 
> the BPF and network subsystems.
> 
> I would also appreciate if you have some documentation or link to 
> relevant discussions on the mailing list. That will avoid having same 
> discussion you may already have done in the past.

Hi,

The discussion has been stuck for a month, but I'd still like to 
continue figuring out the way best for the whole kernel to implement 
this feature. I summarize the current situation and question that needs 
to be answered before push this forward:

The goal of this RFC is to allow to report hash values calculated with 
eBPF steering program. It's essentially just to report 4 bytes from the 
kernel to the userspace.

Unfortunately, however, it is not acceptable for the BPF subsystem 
because the "stable" BPF is completely fixed these days. The 
"unstable/kfunc" BPF is an alternative, but the eBPF program will be 
shipped with a portable userspace program (QEMU)[1] so the lack of 
interface stability is not tolerable.

Another option is to hardcode the algorithm that was conventionally 
implemented with eBPF steering program in the kernel[2]. It is possible 
because the algorithm strictly follows the virtio-net specification[3]. 
However, there are proposals to add different algorithms to the 
specification[4], and hardcoding the algorithm to the kernel will 
require to add more UAPIs and code each time such a specification change 
happens, which is not good for tuntap.

In short, the proposed feature requires to make either of three compromises:

1. Compromise on the BPF side: Relax the "stable" BPF feature freeze 
once and allow eBPF steering program to report 4 more bytes to the kernel.

2. Compromise on the tuntap side: Implement the algorithm to the kernel, 
and abandon the capability to update the algorithm without changing the 
kernel.

IMHO, I think it's better to make a compromise on the BPF side (option 
1). We should minimize the total UAPI changes in the whole kernel, and 
option 1 is much superior in that sense.

Yet I have to note that such a compromise on the BPF side can risk the 
"stable" BPF feature freeze fragile and let other people complain like 
"you allowed to change stable BPF for this, why do you reject [some 
other request to change stable BPF]?" It is bad for BPF maintainers. (I 
can imagine that introducing and maintaining widely different BPF 
interfaces is too much burden.) And, of course, this requires an 
approval from BPF maintainers.

So I'd like to ask you that which of these compromises you think worse. 
Please also tell me if you have another idea.

Regards,
Akihiko Odaki

[1] https://qemu.readthedocs.io/en/v8.1.0/devel/ebpf_rss.html
[2] 
https://lore.kernel.org/all/20231008052101.144422-1-akihiko.odaki@daynix.com/
[3] 
https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-2400003
[4] 
https://lore.kernel.org/all/CACGkMEuBbGKssxNv5AfpaPpWQfk2BHR83rM5AHXN-YVMf2NvpQ@mail.gmail.com/

Re: [RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH

Posted by Song Liu 2 years, 2 months ago

Hi,

A few rookie questions below.

On Sat, Nov 18, 2023 at 2:39 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> On 2023/10/18 4:19, Akihiko Odaki wrote:
> > On 2023/10/18 4:03, Alexei Starovoitov wrote:
[...]
> >
> > I would also appreciate if you have some documentation or link to
> > relevant discussions on the mailing list. That will avoid having same
> > discussion you may already have done in the past.
>
> Hi,
>
> The discussion has been stuck for a month, but I'd still like to
> continue figuring out the way best for the whole kernel to implement
> this feature. I summarize the current situation and question that needs
> to be answered before push this forward:
>
> The goal of this RFC is to allow to report hash values calculated with
> eBPF steering program. It's essentially just to report 4 bytes from the
> kernel to the userspace.

AFAICT, the proposed design is to have BPF generate some data
(namely hash, but could be anything afaict) and consume it from
user space. Instead of updating __sk_buff, can we have the user
space to fetch the data/hash from a bpf map? If this is an option,
I guess we can implement the same feature with BPF tracing
programs?

>
> Unfortunately, however, it is not acceptable for the BPF subsystem
> because the "stable" BPF is completely fixed these days. The
> "unstable/kfunc" BPF is an alternative, but the eBPF program will be
> shipped with a portable userspace program (QEMU)[1] so the lack of
> interface stability is not tolerable.

bpf kfuncs are as stable as exported symbols. Is exported symbols
like stability enough for the use case? (I would assume yes.)

>
> Another option is to hardcode the algorithm that was conventionally
> implemented with eBPF steering program in the kernel[2]. It is possible
> because the algorithm strictly follows the virtio-net specification[3].
> However, there are proposals to add different algorithms to the
> specification[4], and hardcoding the algorithm to the kernel will
> require to add more UAPIs and code each time such a specification change
> happens, which is not good for tuntap.

The requirement looks similar to hid-bpf. Could you explain why that
model is not enough? HID also requires some stability AFAICT.

Thanks,
Song

>
> In short, the proposed feature requires to make either of three compromises:
>
> 1. Compromise on the BPF side: Relax the "stable" BPF feature freeze
> once and allow eBPF steering program to report 4 more bytes to the kernel.
>
> 2. Compromise on the tuntap side: Implement the algorithm to the kernel,
> and abandon the capability to update the algorithm without changing the
> kernel.
>
> IMHO, I think it's better to make a compromise on the BPF side (option
> 1). We should minimize the total UAPI changes in the whole kernel, and
> option 1 is much superior in that sense.
>
> Yet I have to note that such a compromise on the BPF side can risk the
> "stable" BPF feature freeze fragile and let other people complain like
> "you allowed to change stable BPF for this, why do you reject [some
> other request to change stable BPF]?" It is bad for BPF maintainers. (I
> can imagine that introducing and maintaining widely different BPF
> interfaces is too much burden.) And, of course, this requires an
> approval from BPF maintainers.
>
> So I'd like to ask you that which of these compromises you think worse.
> Please also tell me if you have another idea.
>
> Regards,
> Akihiko Odaki
>
> [1] https://qemu.readthedocs.io/en/v8.1.0/devel/ebpf_rss.html
> [2]
> https://lore.kernel.org/all/20231008052101.144422-1-akihiko.odaki@daynix.com/
> [3]
> https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-2400003
> [4]
> https://lore.kernel.org/all/CACGkMEuBbGKssxNv5AfpaPpWQfk2BHR83rM5AHXN-YVMf2NvpQ@mail.gmail.com/

Re: [RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH

Posted by Willem de Bruijn 2 years, 3 months ago

On Mon, Oct 16, 2023 at 7:53 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Sun, Oct 15, 2023 at 10:10 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >
> > On 2023/10/16 1:07, Alexei Starovoitov wrote:
> > > On Sun, Oct 15, 2023 at 7:17 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> > >>
> > >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > >> index 0448700890f7..298634556fab 100644
> > >> --- a/include/uapi/linux/bpf.h
> > >> +++ b/include/uapi/linux/bpf.h
> > >> @@ -988,6 +988,7 @@ enum bpf_prog_type {
> > >>          BPF_PROG_TYPE_SK_LOOKUP,
> > >>          BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> > >>          BPF_PROG_TYPE_NETFILTER,
> > >> +       BPF_PROG_TYPE_VNET_HASH,
> > >
> > > Sorry, we do not add new stable program types anymore.
> > >
> > >> @@ -6111,6 +6112,10 @@ struct __sk_buff {
> > >>          __u8  tstamp_type;
> > >>          __u32 :24;              /* Padding, future use. */
> > >>          __u64 hwtstamp;
> > >> +
> > >> +       __u32 vnet_hash_value;
> > >> +       __u16 vnet_hash_report;
> > >> +       __u16 vnet_rss_queue;
> > >>   };
> > >
> > > we also do not add anything to uapi __sk_buff.
> > >
> > >> +const struct bpf_verifier_ops vnet_hash_verifier_ops = {
> > >> +       .get_func_proto         = sk_filter_func_proto,
> > >> +       .is_valid_access        = sk_filter_is_valid_access,
> > >> +       .convert_ctx_access     = bpf_convert_ctx_access,
> > >> +       .gen_ld_abs             = bpf_gen_ld_abs,
> > >> +};
> > >
> > > and we don't do ctx rewrites like this either.
> > >
> > > Please see how hid-bpf and cgroup rstat are hooking up bpf
> > > in _unstable_ way.
> >
> > Can you describe what "stable" and "unstable" mean here? I'm new to BPF
> > and I'm worried if it may mean the interface stability.
> >
> > Let me describe the context. QEMU bundles an eBPF program that is used
> > for the "eBPF steering program" feature of tun. Now I'm proposing to
> > extend the feature to allow to return some values to the userspace and
> > vhost_net. As such, the extension needs to be done in a way that ensures
> > interface stability.
>
> bpf is not an option then.
> we do not add stable bpf program types or hooks any more.
> If a kernel subsystem wants to use bpf it needs to accept the fact
> that such bpf extensibility will be unstable and subsystem maintainers
> can decide to remove such bpf support in the future.

Based on hooks for tracepoints and kfuncs, correct?

Perhaps the existing stable flow dissector type is extensible to
optionally compute the hash and report hash and hash type. Else we
probably should revisit the previous version of this series.

[RFC PATCH v2 1/7] bpf: Introduce BPF_PROG_TYPE_VNET_HASH
[RFC PATCH v2 2/7] bpf: Add vnet_hash members to __sk_buff
[RFC PATCH v2 3/7] skbuff: Introduce SKB_EXT_TUN_VNET_HASH
[RFC PATCH v2 4/7] virtio_net: Add virtio_net_hdr_v1_hash_from_skb()
[RFC PATCH v2 5/7] tun: Support BPF_PROG_TYPE_VNET_HASH
[RFC PATCH v2 6/7] selftests/bpf: Test BPF_PROG_TYPE_VNET_HASH
[RFC PATCH v2 7/7] vhost_net: Support VIRTIO_NET_F_HASH_REPORT