This commit adds support for enabling IP defrag using pre-existing
netfilter defrag support. Basically all the flag does is bump a refcnt
while the link is active. Checks are also added to ensure the prog
requesting defrag support is run _after_ netfilter defrag hooks.
We also take care to avoid any issues w.r.t. module unloading -- while
defrag is active on a link, the module is prevented from unloading.
Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
include/uapi/linux/bpf.h | 5 ++
net/netfilter/nf_bpf_link.c | 123 +++++++++++++++++++++++++++++----
tools/include/uapi/linux/bpf.h | 5 ++
3 files changed, 118 insertions(+), 15 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 739c15906a65..12a5480314a2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1187,6 +1187,11 @@ enum bpf_perf_event_type {
*/
#define BPF_F_KPROBE_MULTI_RETURN (1U << 0)
+/* link_create.netfilter.flags used in LINK_CREATE command for
+ * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation.
+ */
+#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* the following extensions:
*
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index c36da56d756f..8fe594bbc7e2 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
#include <linux/netfilter.h>
#include <net/netfilter/nf_bpf_link.h>
@@ -23,8 +25,88 @@ struct bpf_nf_link {
struct nf_hook_ops hook_ops;
struct net *net;
u32 dead;
+ const struct nf_defrag_hook *defrag_hook;
};
+static const struct nf_defrag_hook *
+get_proto_defrag_hook(struct bpf_nf_link *link,
+ const struct nf_defrag_hook __rcu *global_hook,
+ const char *mod)
+{
+ const struct nf_defrag_hook *hook;
+ int err;
+
+ /* RCU protects us from races against module unloading */
+ rcu_read_lock();
+ hook = rcu_dereference(global_hook);
+ if (!hook) {
+ rcu_read_unlock();
+ err = request_module(mod);
+ if (err)
+ return ERR_PTR(err < 0 ? err : -EINVAL);
+
+ rcu_read_lock();
+ hook = rcu_dereference(global_hook);
+ }
+
+ if (hook && try_module_get(hook->owner)) {
+ /* Once we have a refcnt on the module, we no longer need RCU */
+ hook = rcu_pointer_handoff(hook);
+ } else {
+ WARN_ONCE(!hook, "%s has bad registration", mod);
+ hook = ERR_PTR(-ENOENT);
+ }
+ rcu_read_unlock();
+
+ if (!IS_ERR(hook)) {
+ err = hook->enable(link->net);
+ if (err) {
+ module_put(hook->owner);
+ hook = ERR_PTR(err);
+ }
+ }
+
+ return hook;
+}
+
+static int bpf_nf_enable_defrag(struct bpf_nf_link *link)
+{
+ const struct nf_defrag_hook __maybe_unused *hook;
+
+ switch (link->hook_ops.pf) {
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+ case NFPROTO_IPV4:
+ hook = get_proto_defrag_hook(link, nf_defrag_v4_hook, "nf_defrag_ipv4");
+ if (IS_ERR(hook))
+ return PTR_ERR(hook);
+
+ link->defrag_hook = hook;
+ return 0;
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ case NFPROTO_IPV6:
+ hook = get_proto_defrag_hook(link, nf_defrag_v6_hook, "nf_defrag_ipv6");
+ if (IS_ERR(hook))
+ return PTR_ERR(hook);
+
+ link->defrag_hook = hook;
+ return 0;
+#endif
+ default:
+ return -EAFNOSUPPORT;
+ }
+}
+
+static void bpf_nf_disable_defrag(struct bpf_nf_link *link)
+{
+ const struct nf_defrag_hook *hook = link->defrag_hook;
+
+ if (!hook)
+ return;
+ hook->disable(link->net);
+ module_put(hook->owner);
+}
+
static void bpf_nf_link_release(struct bpf_link *link)
{
struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
@@ -32,11 +114,11 @@ static void bpf_nf_link_release(struct bpf_link *link)
if (nf_link->dead)
return;
- /* prevent hook-not-found warning splat from netfilter core when
- * .detach was already called
- */
- if (!cmpxchg(&nf_link->dead, 0, 1))
+ /* do not double release in case .detach was already called */
+ if (!cmpxchg(&nf_link->dead, 0, 1)) {
nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
+ bpf_nf_disable_defrag(nf_link);
+ }
}
static void bpf_nf_link_dealloc(struct bpf_link *link)
@@ -92,6 +174,8 @@ static const struct bpf_link_ops bpf_nf_link_lops = {
static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr)
{
+ int prio;
+
switch (attr->link_create.netfilter.pf) {
case NFPROTO_IPV4:
case NFPROTO_IPV6:
@@ -102,19 +186,18 @@ static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr)
return -EAFNOSUPPORT;
}
- if (attr->link_create.netfilter.flags)
+ if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG)
return -EOPNOTSUPP;
- /* make sure conntrack confirm is always last.
- *
- * In the future, if userspace can e.g. request defrag, then
- * "defrag_requested && prio before NF_IP_PRI_CONNTRACK_DEFRAG"
- * should fail.
- */
- switch (attr->link_create.netfilter.priority) {
- case NF_IP_PRI_FIRST: return -ERANGE; /* sabotage_in and other warts */
- case NF_IP_PRI_LAST: return -ERANGE; /* e.g. conntrack confirm */
- }
+ /* make sure conntrack confirm is always last */
+ prio = attr->link_create.netfilter.priority;
+ if (prio == NF_IP_PRI_FIRST)
+ return -ERANGE; /* sabotage_in and other warts */
+ else if (prio == NF_IP_PRI_LAST)
+ return -ERANGE; /* e.g. conntrack confirm */
+ else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) &&
+ prio <= NF_IP_PRI_CONNTRACK_DEFRAG)
+ return -ERANGE; /* cannot use defrag if prog runs before nf_defrag */
return 0;
}
@@ -149,6 +232,7 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
link->net = net;
link->dead = false;
+ link->defrag_hook = NULL;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
@@ -156,8 +240,17 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
return err;
}
+ if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) {
+ err = bpf_nf_enable_defrag(link);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ }
+
err = nf_register_net_hook(net, &link->hook_ops);
if (err) {
+ bpf_nf_disable_defrag(link);
bpf_link_cleanup(&link_primer);
return err;
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 739c15906a65..12a5480314a2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1187,6 +1187,11 @@ enum bpf_perf_event_type {
*/
#define BPF_F_KPROBE_MULTI_RETURN (1U << 0)
+/* link_create.netfilter.flags used in LINK_CREATE command for
+ * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation.
+ */
+#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* the following extensions:
*
--
2.41.0
Daniel Xu <dxu@dxuuu.xyz> wrote: > This commit adds support for enabling IP defrag using pre-existing > netfilter defrag support. Basically all the flag does is bump a refcnt > while the link the active. Checks are also added to ensure the prog > requesting defrag support is run _after_ netfilter defrag hooks. > > We also take care to avoid any issues w.r.t. module unloading -- while > defrag is active on a link, the module is prevented from unloading. Reviewed-by: Florian Westphal <fw@strlen.de>
On Fri, Jul 21, 2023 at 02:22:46PM -0600, Daniel Xu wrote: > This commit adds support for enabling IP defrag using pre-existing > netfilter defrag support. Basically all the flag does is bump a refcnt > while the link the active. Checks are also added to ensure the prog > requesting defrag support is run _after_ netfilter defrag hooks. > > We also take care to avoid any issues w.r.t. module unloading -- while > defrag is active on a link, the module is prevented from unloading. > > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> > --- > include/uapi/linux/bpf.h | 5 ++ > net/netfilter/nf_bpf_link.c | 123 +++++++++++++++++++++++++++++---- > tools/include/uapi/linux/bpf.h | 5 ++ > 3 files changed, 118 insertions(+), 15 deletions(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 739c15906a65..12a5480314a2 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -1187,6 +1187,11 @@ enum bpf_perf_event_type { > */ > #define BPF_F_KPROBE_MULTI_RETURN (1U << 0) > > +/* link_create.netfilter.flags used in LINK_CREATE command for > + * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. 
> + */ > +#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0) > + > /* When BPF ldimm64's insn[0].src_reg != 0 then this can have > * the following extensions: > * > diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c > index c36da56d756f..8fe594bbc7e2 100644 > --- a/net/netfilter/nf_bpf_link.c > +++ b/net/netfilter/nf_bpf_link.c > @@ -1,6 +1,8 @@ > // SPDX-License-Identifier: GPL-2.0 > #include <linux/bpf.h> > #include <linux/filter.h> > +#include <linux/kmod.h> > +#include <linux/module.h> > #include <linux/netfilter.h> > > #include <net/netfilter/nf_bpf_link.h> > @@ -23,8 +25,88 @@ struct bpf_nf_link { > struct nf_hook_ops hook_ops; > struct net *net; > u32 dead; > + const struct nf_defrag_hook *defrag_hook; > }; > > +static const struct nf_defrag_hook * > +get_proto_defrag_hook(struct bpf_nf_link *link, > + const struct nf_defrag_hook __rcu *global_hook, > + const char *mod) > +{ > + const struct nf_defrag_hook *hook; > + int err; > + > + /* RCU protects us from races against module unloading */ > + rcu_read_lock(); > + hook = rcu_dereference(global_hook); > + if (!hook) { > + rcu_read_unlock(); > + err = request_module(mod); > + if (err) > + return ERR_PTR(err < 0 ? err : -EINVAL); > + > + rcu_read_lock(); > + hook = rcu_dereference(global_hook); > + } > + > + if (hook && try_module_get(hook->owner)) { > + /* Once we have a refcnt on the module, we no longer need RCU */ > + hook = rcu_pointer_handoff(hook); > + } else { > + WARN_ONCE(!hook, "%s has bad registration", mod); > + hook = ERR_PTR(-ENOENT); > + } > + rcu_read_unlock(); > + > + if (!IS_ERR(hook)) { > + err = hook->enable(link->net); > + if (err) { > + module_put(hook->owner); > + hook = ERR_PTR(err); > + } > + } > + > + return hook; The rcu + module_get logic looks correct to me, but you've dropped all Florian's acks. What's going on? We need explicit acks to merge this through bpf-next.
Hi Alexei, On Thu, Jul 27, 2023 at 06:16:20PM -0700, Alexei Starovoitov wrote: > On Fri, Jul 21, 2023 at 02:22:46PM -0600, Daniel Xu wrote: > > This commit adds support for enabling IP defrag using pre-existing > > netfilter defrag support. Basically all the flag does is bump a refcnt > > while the link the active. Checks are also added to ensure the prog > > requesting defrag support is run _after_ netfilter defrag hooks. > > > > We also take care to avoid any issues w.r.t. module unloading -- while > > defrag is active on a link, the module is prevented from unloading. > > > > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> > > --- > > include/uapi/linux/bpf.h | 5 ++ > > net/netfilter/nf_bpf_link.c | 123 +++++++++++++++++++++++++++++---- > > tools/include/uapi/linux/bpf.h | 5 ++ > > 3 files changed, 118 insertions(+), 15 deletions(-) > > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index 739c15906a65..12a5480314a2 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -1187,6 +1187,11 @@ enum bpf_perf_event_type { > > */ > > #define BPF_F_KPROBE_MULTI_RETURN (1U << 0) > > > > +/* link_create.netfilter.flags used in LINK_CREATE command for > > + * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. 
> > + */ > > +#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0) > > + > > /* When BPF ldimm64's insn[0].src_reg != 0 then this can have > > * the following extensions: > > * > > diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c > > index c36da56d756f..8fe594bbc7e2 100644 > > --- a/net/netfilter/nf_bpf_link.c > > +++ b/net/netfilter/nf_bpf_link.c > > @@ -1,6 +1,8 @@ > > // SPDX-License-Identifier: GPL-2.0 > > #include <linux/bpf.h> > > #include <linux/filter.h> > > +#include <linux/kmod.h> > > +#include <linux/module.h> > > #include <linux/netfilter.h> > > > > #include <net/netfilter/nf_bpf_link.h> > > @@ -23,8 +25,88 @@ struct bpf_nf_link { > > struct nf_hook_ops hook_ops; > > struct net *net; > > u32 dead; > > + const struct nf_defrag_hook *defrag_hook; > > }; > > > > +static const struct nf_defrag_hook * > > +get_proto_defrag_hook(struct bpf_nf_link *link, > > + const struct nf_defrag_hook __rcu *global_hook, > > + const char *mod) > > +{ > > + const struct nf_defrag_hook *hook; > > + int err; > > + > > + /* RCU protects us from races against module unloading */ > > + rcu_read_lock(); > > + hook = rcu_dereference(global_hook); > > + if (!hook) { > > + rcu_read_unlock(); > > + err = request_module(mod); > > + if (err) > > + return ERR_PTR(err < 0 ? err : -EINVAL); > > + > > + rcu_read_lock(); > > + hook = rcu_dereference(global_hook); > > + } > > + > > + if (hook && try_module_get(hook->owner)) { > > + /* Once we have a refcnt on the module, we no longer need RCU */ > > + hook = rcu_pointer_handoff(hook); > > + } else { > > + WARN_ONCE(!hook, "%s has bad registration", mod); > > + hook = ERR_PTR(-ENOENT); > > + } > > + rcu_read_unlock(); > > + > > + if (!IS_ERR(hook)) { > > + err = hook->enable(link->net); > > + if (err) { > > + module_put(hook->owner); > > + hook = ERR_PTR(err); > > + } > > + } > > + > > + return hook; > > The rcu + module_get logic looks correct to me, but you've dropped all Florian's acks. > What's going on? 
> > We need explicit acks to merge this through bpf-next. I understood acked-by tags to be a lighter form of reviewed-by tag. So b/c the patches changed so much I dropped the tag. It sounds like maybe I misunderstand -- I'll keep it in mind the next time around. Thanks, Daniel
© 2016 - 2025 Red Hat, Inc.