bpf: add a few hooks for sandboxing

[PATCH 2/4] cgroup: add bpf hook for attach

Posted by Christian Brauner 1 month, 1 week ago

Add a hook to manage attaching tasks to cgroup. I'm in the process of
adding various "universal truth" bpf programs to systemd that will make
use of this.

This has been a long-standing request (cf. [1] and [2]). It will allow us to
enforce cgroup migrations and ensure that services can never escape their
cgroups. This is just one of many use-cases.

Link: https://github.com/systemd/systemd/issues/6356 [1]
Link: https://github.com/systemd/systemd/issues/22874 [2]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/bpf_lsm.h | 15 +++++++++++++++
 kernel/bpf/bpf_lsm.c    | 12 ++++++++++++
 kernel/cgroup/cgroup.c  | 18 +++++++++++-------
 3 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 5ae438fdf567..bc1d35b271f5 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -12,8 +12,11 @@
 #include <linux/bpf_verifier.h>
 #include <linux/lsm_hooks.h>
 
+struct cgroup;
+struct cgroup_namespace;
 struct ns_common;
 struct nsset;
+struct super_block;
 
 #ifdef CONFIG_BPF_LSM
 
@@ -55,6 +58,9 @@ int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
 int bpf_lsm_namespace_alloc(struct ns_common *ns);
 void bpf_lsm_namespace_free(struct ns_common *ns);
 int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
+int bpf_lsm_cgroup_attach(struct task_struct *task, struct cgroup *src_cgrp,
+			   struct cgroup *dst_cgrp, struct super_block *sb,
+			   bool threadgroup, struct cgroup_namespace *ns);
 
 int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
 				const struct bpf_dynptr *value_p, int flags);
@@ -125,6 +131,15 @@ static inline int bpf_lsm_namespace_install(struct nsset *nsset,
 {
 	return 0;
 }
+static inline int bpf_lsm_cgroup_attach(struct task_struct *task,
+					 struct cgroup *src_cgrp,
+					 struct cgroup *dst_cgrp,
+					 struct super_block *sb,
+					 bool threadgroup,
+					 struct cgroup_namespace *ns)
+{
+	return 0;
+}
 #endif /* CONFIG_BPF_LSM */
 
 #endif /* _LINUX_BPF_LSM_H */
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index f6378db46220..1da5585082fa 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -47,6 +47,16 @@ __weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
 	return 0;
 }
 
+__weak noinline int bpf_lsm_cgroup_attach(struct task_struct *task,
+					   struct cgroup *src_cgrp,
+					   struct cgroup *dst_cgrp,
+					   struct super_block *sb,
+					   bool threadgroup,
+					   struct cgroup_namespace *ns)
+{
+	return 0;
+}
+
 __bpf_hook_end();
 
 #define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
@@ -56,6 +66,7 @@ BTF_SET_START(bpf_lsm_hooks)
 BTF_ID(func, bpf_lsm_namespace_alloc)
 BTF_ID(func, bpf_lsm_namespace_free)
 BTF_ID(func, bpf_lsm_namespace_install)
+BTF_ID(func, bpf_lsm_cgroup_attach)
 BTF_SET_END(bpf_lsm_hooks)
 
 BTF_SET_START(bpf_lsm_disabled_hooks)
@@ -407,6 +418,7 @@ BTF_ID(func, bpf_lsm_task_to_inode)
 BTF_ID(func, bpf_lsm_userns_create)
 BTF_ID(func, bpf_lsm_namespace_alloc)
 BTF_ID(func, bpf_lsm_namespace_install)
+BTF_ID(func, bpf_lsm_cgroup_attach)
 BTF_SET_END(sleepable_lsm_hooks)
 
 BTF_SET_START(untrusted_lsm_hooks)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8af4351536cf..16535349b22f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,6 +28,7 @@
 #include "cgroup-internal.h"
 
 #include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
 #include <linux/cred.h>
 #include <linux/errno.h>
 #include <linux/init_task.h>
@@ -5334,7 +5335,8 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
 	return 0;
 }
 
-static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+static int cgroup_attach_permissions(struct task_struct *task,
+				     struct cgroup *src_cgrp,
 				     struct cgroup *dst_cgrp,
 				     struct super_block *sb, bool threadgroup,
 				     struct cgroup_namespace *ns)
@@ -5350,9 +5352,9 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
 		return ret;
 
 	if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
-		ret = -EOPNOTSUPP;
+		return -EOPNOTSUPP;
 
-	return ret;
+	return bpf_lsm_cgroup_attach(task, src_cgrp, dst_cgrp, sb, threadgroup, ns);
 }
 
 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
@@ -5384,7 +5386,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	 * inherited fd attacks.
 	 */
 	scoped_with_creds(of->file->f_cred)
-		ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+		ret = cgroup_attach_permissions(task, src_cgrp, dst_cgrp,
 						of->file->f_path.dentry->d_sb,
 						threadgroup, ctx->ns);
 	if (ret)
@@ -6669,6 +6671,7 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
 
 /**
  * cgroup_css_set_fork - find or create a css_set for a child process
+ * @task: the task to be attached
  * @kargs: the arguments passed to create the child process
  *
  * This functions finds or creates a new css_set which the child
@@ -6683,7 +6686,8 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
  * before grabbing cgroup_threadgroup_rwsem and will hold a reference
  * to the target cgroup.
  */
-static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+static int cgroup_css_set_fork(struct task_struct *task,
+			       struct kernel_clone_args *kargs)
 	__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
 {
 	int ret;
@@ -6752,7 +6756,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 	 * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
 	 * to always use the caller's credentials.
 	 */
-	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+	ret = cgroup_attach_permissions(task, cset->dfl_cgrp, dst_cgrp, sb,
 					!(kargs->flags & CLONE_THREAD),
 					current->nsproxy->cgroup_ns);
 	if (ret)
@@ -6824,7 +6828,7 @@ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
 	struct cgroup_subsys *ss;
 	int i, j, ret;
 
-	ret = cgroup_css_set_fork(kargs);
+	ret = cgroup_css_set_fork(child, kargs);
 	if (ret)
 		return ret;
 

-- 
2.47.3

Re: [PATCH 2/4] cgroup: add bpf hook for attach

Posted by Michal Koutný 1 month, 1 week ago

Hi.

On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner <brauner@kernel.org> wrote:
> Add a hook to manage attaching tasks to cgroup. I'm in the process of
> adding various "universal truth" bpf programs to systemd that will make
> use of this.
> 
> This has been a long-standing request (cf. [1] and [2]). It will allow us to
> enforce cgroup migrations and ensure that services can never escape their
> cgroups. This is just one of many use-cases.
> 
> Link: https://github.com/systemd/systemd/issues/6356 [1]
> Link: https://github.com/systemd/systemd/issues/22874 [2]

These two issues are misconfigured/misunderstood PAM configs. I don't
think those warrant introduction of another permissions mechanism,
furthermore they're relatively old and I estimate many of such configs
must have been fixed in the course of time.

As for services escaping their cgroups -- they needn't run as root, do
they? And if you seek a mechanism how to prevent even root from
migrations, there are cgroupnses for that. (BTW what would prevent a
root detaching/disabling these hook progs anyway?)

I think that the cgroup file permissions are sufficient for many use
cases and this BPF hook is too tempting in unnecessary cases (like
masking other issues).
Could you please expand more about some other reasonable use cases not
covered by those?

(BTW I notice there's already a very similar BPF hook in sched_ext's
cgroup_prep_move. It'd be nicer to have only one generic approach to
these checks.)

Regards,
Michal

Re: [PATCH 2/4] cgroup: add bpf hook for attach

Posted by Christian Brauner 1 month ago

On Mon, Feb 23, 2026 at 04:47:11PM +0100, Michal Koutný wrote:
> Hi.
> 
> On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner <brauner@kernel.org> wrote:
> > Add a hook to manage attaching tasks to cgroup. I'm in the process of
> > adding various "universal truth" bpf programs to systemd that will make
> > use of this.
> > 
> > This has been a long-standing request (cf. [1] and [2]). It will allow us to
> > enforce cgroup migrations and ensure that services can never escape their
> > cgroups. This is just one of many use-cases.
> > 
> > Link: https://github.com/systemd/systemd/issues/6356 [1]
> > Link: https://github.com/systemd/systemd/issues/22874 [2]
> 
> These two issues are misconfigured/misunderstood PAM configs. I don't
> think those warrant introduction of another permissions mechanism,
> furthermore they're relatively old and I estimate many of such configs
> must have been fixed in the course of time.

logind has to allow cgroup migrations but for say Docker this shouldn't
be allowed. So calling this misconfiguration is like taking a shortcut
by simply pointing to a different destination. But fine, let's say you
insist on this not being valid.

> As for services escaping their cgroups -- they needn't run as root, do
> they? And if you seek a mechanism how to prevent even root from
> migrations, there are cgroupnses for that. (BTW what would prevent a

A bunch of tools that do cgroup migrations don't use cgroup namespaces
and there's no requirement or way to enforce that they do. Plus, there's
no requirement to only do cgroup management via systemd or its APIs.

Frankly, I can't even blame userspace for not having widely adopted
cgroup namespaces. The implementation atop of a single superblock like
cgroupfs is questionable.

But in general the point is that there's no mechanism to enforce cgroup
tree policy currently in a sufficiently flexible manner.

> root detaching/disabling these hook progs anyway?)

I cannot help but read this as you asking me "What if you're too dumb to
write a security policy that isn't self-defeating?" :)

bpf has security hooks for itself including security_bpf(). First thing
that comes to mind is to have security.bpf.* or trusted.* xattrs on
selected processes like PID 1 that mark it as eligible for modifying BPF
state or BPF LSM programs supervising link/prog detach, update etc and
then designating only PID 1 as handing out those magical xattrs. Can be
as fine-grained as needed and that tells everyone else to go away and do
something else.

There's more fine-grained mechanisms to deal with this. IOW, it's a
solvable problem.

> I think that the cgroup file permissions are sufficient for many use
> cases and this BPF hook is too tempting in unnecessary cases (like
> masking other issues).
> Could you please expand more about some other reasonable use cases not
> covered by those?

systemd will gain the ability to implement policy to control cgroup tree
modifications in as much details as it needs without having the kernel
in need to be aware of it. This can take various forms by marking only
select processes as being eligible for managing cgroup migrations or
even just locking down specific cgroups.

The policy needs to be flexible so it can be live-updated, switched into
auditing mode, and losened, tightened on-demand as needed.

> (BTW I notice there's already a very similar BPF hook in sched_ext's
> cgroup_prep_move. It'd be nicer to have only one generic approach to
> these checks.)

This feels a bit like a wild goose chase. But fine, I'll look at it.
/me goes off

Ok, let's start with cgroup_can_fork(). The sched ext hook isn't a
generic permission check. It's called way after
cgroup_attach_permissions() and is a per cgroup controller check that is
only called for some cgroup controllers. So effectively useless to pull
up (Notice also, how some controllers like cpuset call additional
security hooks already.).

The same problem applies to writes for cgroup.procs and for subtree
control. The sched ext hook are per cgroup controller not generically
called.

And they happen to be called in cgroup_migrate_execute() which is way
deep in the callchain. When cgroup_attach_permissions() fails it's
effectively free. If migrate_execute() fails it must put/free css sets,
it must splice back task on mg_tasks, it must call cancel_attach()
callbacks, thus it must call the sched-ext cancel callbacks for each
already prepped task, it must uncharge pids for each already prepped
task, it needs to unlock a bunch of stuff.

On top of that this looks like a category mistake imho. The callbacks
are a dac-like permission mechanism whereas the hooks is actual mac
permission checking. I'm not sure lumping this together with
per-cgroup-controller migration preparations will be very clean. I think
it will end up looking rather confusing. But that's best left to you
cgroup maintainers, I think.

Re: [PATCH 2/4] cgroup: add bpf hook for attach

Posted by Michal Koutný 3 weeks, 3 days ago

Hello.

On Fri, Feb 27, 2026 at 02:44:27PM +0100, Christian Brauner <brauner@kernel.org> wrote:
> So calling this misconfiguration is like taking a shortcut
> by simply pointing to a different destination. But fine, let's say you
> insist on this not being valid.

I understand this in analogy with filesystem organization -- there's the
package manager that ensures files are put in right places,
non-conflicting and trackable.  Subtrees may be delegated (e.g.
/usr/local).  If root (or whoever has perms for it), decides to
manipulate the files, its up to them what they end up with.

> The implementation atop of a single superblock like cgroupfs is
> questionable.

(This is an interesting topic which I'd like to discuss some other
time not to diverge here.)

> But in general the point is that there's no mechanism to enforce cgroup
> tree policy currently in a sufficiently flexible manner.
> 
> > root detaching/disabling these hook progs anyway?)
> 
> I cannot help but read this as you asking me "What if you're too dumb to
> write a security policy that isn't self-defeating?" :)

I was just trying to express that there's only one level of root (user).
(Cautionary example are "containers" that executed as (host) root.
Lockdown neglected.)

> bpf has security hooks for itself including security_bpf(). First thing
> that comes to mind is to have security.bpf.* or trusted.* xattrs on
> selected processes like PID 1 that mark it as eligible for modifying BPF
> state or BPF LSM programs supervising link/prog detach, update etc and
> then designating only PID 1 as handing out those magical xattrs. Can be
> as fine-grained as needed and that tells everyone else to go away and do
> something else.

(These are too many new concepts for me, I must skip it now. I may catch
up after more study.)

> systemd will gain the ability to implement policy to control cgroup tree
> modifications in as much details as it needs without having the kernel
> in need to be aware of it. This can take various forms by marking only
> select processes as being eligible for managing cgroup migrations or
> even just locking down specific cgroups.

This is how I understand the goal could be expressed in current terms:

  a) allowlisting processes that can do migrations
     # common ancestor of all + access to each dst
     chown -R :grA $root_cgroup/cgroup.procs
     chmod -R g+w $root_cgroup/cgroup.procs

     # static:
       usermod -G grA user_of_pid
       (re)start pid
     # or in spawner:
       fork
       setgroups([grA])
       exec

  b) rules that are specific to cgroup (subtree)
    # applying same like above but to a $lower_group

    $ setfacl -R -m g:grB:w $lower_group/cgroup.procs
    setfacl: cgroup.procs: Operation not supported
    # here I notice why current impl isn't sufficient

Also, if I understand this correctly you semm to move from the semantics
where users (UIDs) are subjects to a different one where it's bound to
processes (PIDs).

> The policy needs to be flexible so it can be live-updated, switched into
> auditing mode, and losened, tightened on-demand as needed.

OK.
(I'd add that policy should be also easily debuggable/troubleshootable.)

> Ok, let's start with cgroup_can_fork(). The sched ext hook isn't a
> generic permission check. It's called way after
> cgroup_attach_permissions() and is a per cgroup controller check that is
> only called for some cgroup controllers. So effectively useless to pull
> up (Notice also, how some controllers like cpuset call additional
> security hooks already.).

There could be one BPF predicate (on the cgroup level) and potentially
pass per-controller data, so that function could employ (or not) those.
It's true that semantics would be a bit different because of implicit
migrations happening with controller en-/disablement.

What I don't like about the multiple hooks is that there'd be several
places to check when one is trying to figure out why a migration failed.

> On top of that this looks like a category mistake imho. The callbacks
> are a dac-like permission mechanism whereas the hooks is actual mac
> permission checking. I'm not sure lumping this together with
> per-cgroup-controller migration preparations will be very clean. I think
> it will end up looking rather confusing. But that's best left to you
> cgroup maintainers, I think.

This paragraph hinted me at (yet) another mechanism in the kernel (and
you also mentioned it with cpuset) -- the LSM hooks. Namely, if this was
security_cgroup_attach() hook, the logic could be expressed with other
existing modules, IIUC, one of them is BPF. Would that fulfil the
behaviors you're missing?

(I'm proposing this as potentially less confusing/known "evil" approach
to the scenarios considered above.)

Thanks,
Michal

Re: [PATCH 2/4] cgroup: add bpf hook for attach

Posted by Tejun Heo 1 month, 1 week ago

Hello,

On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner wrote:
> Add a hook to manage attaching tasks to cgroup. I'm in the process of
> adding various "universal truth" bpf programs to systemd that will make
> use of this.
> 
> This has been a long-standing request (cf. [1] and [2]). It will allow us to
> enforce cgroup migrations and ensure that services can never escape their
> cgroups. This is just one of many use-cases.

From cgroup POV, this looks fine to me but I'm curious whether something
dumber would also work. With CLONE_INTO_CGROUP, cgroup migration isn't
necessary at all. Would something dumber like a mount option disabling
cgroup migrations completely work too or would that be too restrictive?

Thanks.

-- 
tejun

Re: [PATCH 2/4] cgroup: add bpf hook for attach

Posted by Christian Brauner 1 month, 1 week ago

On Fri, Feb 20, 2026 at 05:16:13AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner wrote:
> > Add a hook to manage attaching tasks to cgroup. I'm in the process of
> > adding various "universal truth" bpf programs to systemd that will make
> > use of this.
> > 
> > This has been a long-standing request (cf. [1] and [2]). It will allow us to
> > enforce cgroup migrations and ensure that services can never escape their
> > cgroups. This is just one of many use-cases.
> 
> >From cgroup POV, this looks fine to me but I'm curious whether something
> dumber would also work. With CLONE_INTO_CGROUP, cgroup migration isn't
> necessary at all. Would something dumber like a mount option disabling
> cgroup migrations completely work too or would that be too restrictive?

It would be too restrictive. I've played with various policies. For
example, a small set of tasks (like PID 1 or the session manager) are
allowed to move processes between cgroups (detectable via e.g., xattrs).
No other task is allowd. But that's already too restrictive because it
fscks over delegated subcgroups were tasks need to be moved around
(container managers etc.). IOW, any policy must be quite modular and
dynamic so a simple mount option wouldn't cover it.

As a sidenote, there would be other mount options that would be useful
but that currently aren't that easy to support/implement because of the
way cgroupfs (for historical reasons ofc) is architected where it shares
a single superblock.

I have a series (from quite some time ago) that makes cgroupfs truly
multi-instance. It would effectively behave just like tmpfs does. A new
mount gets you a new superblock. But once you have that you can e.g.,
simplify cgroup namespaces as well. I've done that work originally to
support idmapped mounts with cgroupfs but I can't find that branch
anymore.

[PATCH 1/4] ns: add bpf hooks
[PATCH 2/4] cgroup: add bpf hook for attach
[PATCH 3/4] selftests/bpf: add ns hook selftest
[PATCH 4/4] selftests/bpf: add cgroup attach selftests