[RFC PATCH bpf-next v3 10/12] mm/bpf: Add BPF_F_ALLOW_OVERRIDE support for memcg_bpf_ops

Hui Zhu posted 12 patches 2 weeks, 3 days ago
There is a newer version of this series
[RFC PATCH bpf-next v3 10/12] mm/bpf: Add BPF_F_ALLOW_OVERRIDE support for memcg_bpf_ops
Posted by Hui Zhu 2 weeks, 3 days ago
From: Hui Zhu <zhuhui@kylinos.cn>

To allow for more flexible attachment policies in nested cgroup
hierarchies, this patch introduces support for the
`BPF_F_ALLOW_OVERRIDE` flag for `memcg_bpf_ops`.

When a `memcg_bpf_ops` is attached to a cgroup with this flag, it
permits child cgroups to attach their own, different `memcg_bpf_ops`,
overriding the parent's inherited program. Without this flag,
attaching a BPF program to a cgroup that already has one (either
directly or via inheritance) will fail.

The implementation involves:
- Adding a `bpf_ops_flags` field to `struct mem_cgroup`.
- During registration (`bpf_memcg_ops_reg`), checking for existing
  programs and the `BPF_F_ALLOW_OVERRIDE` flag.
- During unregistration (`bpf_memcg_ops_unreg`), correctly restoring
  the parent's BPF program to the cgroup hierarchy.
- Ensuring flags are inherited by child cgroups during online events.

This change enables complex, multi-level policy enforcement where
different subtrees of the cgroup hierarchy can have distinct memory
management BPF programs.

Signed-off-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Hui Zhu <zhuhui@kylinos.cn>
---
 include/linux/memcontrol.h |  1 +
 mm/bpf_memcontrol.c        | 77 ++++++++++++++++++++++++--------------
 2 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d71e86b85ba7..a37b78d3853d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -354,6 +354,7 @@ struct mem_cgroup {
 
 #ifdef CONFIG_BPF_SYSCALL
 	struct memcg_bpf_ops *bpf_ops;
+	u32 bpf_ops_flags;
 #endif
 
 	struct mem_cgroup_per_node *nodeinfo[];
diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
index 3eae1af49519..d6126b94f521 100644
--- a/mm/bpf_memcontrol.c
+++ b/mm/bpf_memcontrol.c
@@ -213,6 +213,7 @@ void memcontrol_bpf_online(struct mem_cgroup *memcg)
 		goto out;
 
 	WRITE_ONCE(memcg->bpf_ops, ops);
+	memcg->bpf_ops_flags = parent_memcg->bpf_ops_flags;
 
 	/*
 	 * If the BPF program implements it, call the online handler to
@@ -340,29 +341,6 @@ static int bpf_memcg_ops_init_member(const struct btf_type *t,
 	return 0;
 }
 
-/**
- * clean_memcg_bpf_ops - Detach BPF programs from a cgroup hierarchy.
- * @memcg: The root of the cgroup hierarchy to clean.
- * @ops:   The specific ops struct to detach. If NULL, detach any ops.
- *
- * Iterates through all descendant cgroups of @memcg (including itself)
- * and clears their bpf_ops pointer. This is used when a BPF program
- * is detached or if attachment fails midway.
- */
-static void clean_memcg_bpf_ops(struct mem_cgroup *memcg,
-				struct memcg_bpf_ops *ops)
-{
-	struct mem_cgroup *iter = NULL;
-
-	while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
-		if (ops) {
-			if (!WARN_ON(READ_ONCE(memcg->bpf_ops) != ops))
-				WRITE_ONCE(memcg->bpf_ops, NULL);
-		} else
-			WRITE_ONCE(iter->bpf_ops, NULL);
-	}
-}
-
 static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link)
 {
 	struct bpf_struct_ops_link *ops_link
@@ -371,21 +349,44 @@ static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link)
 	struct mem_cgroup *memcg, *iter = NULL;
 	int err = 0;
 
+	if (ops_link->flags & ~BPF_F_ALLOW_OVERRIDE) {
+		pr_err("attach only support BPF_F_ALLOW_OVERRIDE\n");
+		return -EOPNOTSUPP;
+	}
+
 	memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
 	if (IS_ERR_OR_NULL(memcg))
 		return PTR_ERR(memcg);
 
 	cgroup_lock();
+
+	if (READ_ONCE(memcg->bpf_ops)) {
+		/* Check if bpf_ops of the parent is BPF_F_ALLOW_OVERRIDE. */
+		if (memcg->bpf_ops_flags & BPF_F_ALLOW_OVERRIDE) {
+			iter = parent_mem_cgroup(memcg);
+
+			if (!iter)
+				goto busy_out;
+			if (READ_ONCE(iter->bpf_ops) !=
+			    READ_ONCE(memcg->bpf_ops))
+				goto busy_out;
+		} else {
+busy_out:
+			err = -EBUSY;
+			goto unlock_out;
+		}
+	}
+
 	while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
 		if (READ_ONCE(iter->bpf_ops)) {
-			mem_cgroup_iter_break(memcg, iter);
-			err = -EBUSY;
-			break;
+			/* cannot override existing bpf_ops of sub-cgroup. */
+			continue;
 		}
 		WRITE_ONCE(iter->bpf_ops, ops);
+		iter->bpf_ops_flags = ops_link->flags;
 	}
-	if (err)
-		clean_memcg_bpf_ops(memcg, NULL);
+
+unlock_out:
 	cgroup_unlock();
 
 	mem_cgroup_put(memcg);
@@ -399,13 +400,31 @@ static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link)
 		= container_of(link, struct bpf_struct_ops_link, link);
 	struct memcg_bpf_ops *ops = kdata;
 	struct mem_cgroup *memcg;
+	struct mem_cgroup *iter;
+	struct memcg_bpf_ops *parent_bpf_ops = NULL;
+	u32 parent_bpf_ops_flags = 0;
 
 	memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
 	if (IS_ERR_OR_NULL(memcg))
 		goto out;
 
 	cgroup_lock();
-	clean_memcg_bpf_ops(memcg, ops);
+
+	/* Get the parent bpf_ops and bpf_ops_flags */
+	iter = parent_mem_cgroup(memcg);
+	if (iter) {
+		parent_bpf_ops = READ_ONCE(iter->bpf_ops);
+		parent_bpf_ops_flags = iter->bpf_ops_flags;
+	}
+
+	iter = NULL;
+	while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
+		if (READ_ONCE(iter->bpf_ops) == ops) {
+			WRITE_ONCE(iter->bpf_ops, parent_bpf_ops);
+			iter->bpf_ops_flags = parent_bpf_ops_flags;
+		}
+	}
+
 	cgroup_unlock();
 
 	mem_cgroup_put(memcg);
-- 
2.43.0
Re: [RFC PATCH bpf-next v3 10/12] mm/bpf: Add BPF_F_ALLOW_OVERRIDE support for memcg_bpf_ops
Posted by bot+bpf-ci@kernel.org 2 weeks, 3 days ago
> diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
> index 415e3c59b6a6..0ded1fdd0de8 100644
> --- a/mm/bpf_memcontrol.c
> +++ b/mm/bpf_memcontrol.c

[ ... ]

> static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link)
> {
> 	struct bpf_struct_ops_link *ops_link
> 		= container_of(link, struct bpf_struct_ops_link, link);
> 	struct memcg_bpf_ops *ops = kdata;
> 	struct mem_cgroup *memcg, *iter = NULL;
> 	int err = 0;
>
> +	if (ops_link->flags & ~BPF_F_ALLOW_OVERRIDE) {
> +		pr_err("attach only support BPF_F_ALLOW_OVERRIDE\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> 	memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
> 	if (IS_ERR_OR_NULL(memcg))
> 		return PTR_ERR(memcg);
>
> 	cgroup_lock();
> +
> +	if (READ_ONCE(memcg->bpf_ops)) {
> +		/* Check if bpf_ops of the parent is BPF_F_ALLOW_OVERRIDE. */
> +		if (memcg->bpf_ops_flags & BPF_F_ALLOW_OVERRIDE) {
> +			iter = parent_mem_cgroup(memcg);
> +
> +			if (!iter)
> +				goto busy_out;
> +			if (READ_ONCE(iter->bpf_ops) !=
> +			    READ_ONCE(memcg->bpf_ops))
> +				goto busy_out;
> +		} else {
> +busy_out:
> +			err = -EBUSY;
> +			goto unlock_out;
> +		}
> +	}
> +
> 	while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
                                      ^^^^

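When the override check passes (lines 363-377 of the patched file, i.e.
memcg already has bpf_ops, its bpf_ops_flags contain BPF_F_ALLOW_OVERRIDE,
it has a parent, and the parent's bpf_ops equals memcg's bpf_ops), iter is
left pointing at parent_mem_cgroup(memcg) and is never reset to NULL
before this loop. Should iter be reset to NULL here?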

The mem_cgroup_iter() function expects prev (passed here as iter) to be
either NULL or a previous result from iterating within memcg's subtree.
When iter points to memcg's parent, the iteration will not correctly
traverse memcg's subtree, because the parent is outside that subtree.

For comparison, bpf_memcg_ops_unreg() correctly resets iter to NULL before
its loop:

> +	iter = NULL;
> +	while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {

> 		if (READ_ONCE(iter->bpf_ops)) {
> -			mem_cgroup_iter_break(memcg, iter);
> -			err = -EBUSY;
> -			break;
> +			/* cannot override existing bpf_ops of sub-cgroup. */
> +			continue;
> 		}
> 		WRITE_ONCE(iter->bpf_ops, ops);
> +		iter->bpf_ops_flags = ops_link->flags;
> 	}

[ ... ]


---
AI reviewed your patch. Please fix the bug, or reply by email explaining why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/21280790825