Add a hook to manage attaching tasks to cgroup. I'm in the process of
adding various "universal truth" bpf programs to systemd that will make
use of this.
This has been a long-standing request (cf. [1] and [2]). It will allow us to
enforce cgroup migrations and ensure that services can never escape their
cgroups. This is just one of many use-cases.
Link: https://github.com/systemd/systemd/issues/6356 [1]
Link: https://github.com/systemd/systemd/issues/22874 [2]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
include/linux/bpf_lsm.h | 15 +++++++++++++++
kernel/bpf/bpf_lsm.c | 12 ++++++++++++
kernel/cgroup/cgroup.c | 18 +++++++++++-------
3 files changed, 38 insertions(+), 7 deletions(-)
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 5ae438fdf567..bc1d35b271f5 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -12,8 +12,11 @@
#include <linux/bpf_verifier.h>
#include <linux/lsm_hooks.h>
+struct cgroup;
+struct cgroup_namespace;
struct ns_common;
struct nsset;
+struct super_block;
#ifdef CONFIG_BPF_LSM
@@ -55,6 +58,9 @@ int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
int bpf_lsm_namespace_alloc(struct ns_common *ns);
void bpf_lsm_namespace_free(struct ns_common *ns);
int bpf_lsm_namespace_install(struct nsset *nsset, struct ns_common *ns);
+int bpf_lsm_cgroup_attach(struct task_struct *task, struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp, struct super_block *sb,
+ bool threadgroup, struct cgroup_namespace *ns);
int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
const struct bpf_dynptr *value_p, int flags);
@@ -125,6 +131,15 @@ static inline int bpf_lsm_namespace_install(struct nsset *nsset,
{
return 0;
}
+static inline int bpf_lsm_cgroup_attach(struct task_struct *task,
+ struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb,
+ bool threadgroup,
+ struct cgroup_namespace *ns)
+{
+ return 0;
+}
#endif /* CONFIG_BPF_LSM */
#endif /* _LINUX_BPF_LSM_H */
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index f6378db46220..1da5585082fa 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -47,6 +47,16 @@ __weak noinline int bpf_lsm_namespace_install(struct nsset *nsset,
return 0;
}
+__weak noinline int bpf_lsm_cgroup_attach(struct task_struct *task,
+ struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb,
+ bool threadgroup,
+ struct cgroup_namespace *ns)
+{
+ return 0;
+}
+
__bpf_hook_end();
#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
@@ -56,6 +66,7 @@ BTF_SET_START(bpf_lsm_hooks)
BTF_ID(func, bpf_lsm_namespace_alloc)
BTF_ID(func, bpf_lsm_namespace_free)
BTF_ID(func, bpf_lsm_namespace_install)
+BTF_ID(func, bpf_lsm_cgroup_attach)
BTF_SET_END(bpf_lsm_hooks)
BTF_SET_START(bpf_lsm_disabled_hooks)
@@ -407,6 +418,7 @@ BTF_ID(func, bpf_lsm_task_to_inode)
BTF_ID(func, bpf_lsm_userns_create)
BTF_ID(func, bpf_lsm_namespace_alloc)
BTF_ID(func, bpf_lsm_namespace_install)
+BTF_ID(func, bpf_lsm_cgroup_attach)
BTF_SET_END(sleepable_lsm_hooks)
BTF_SET_START(untrusted_lsm_hooks)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8af4351536cf..16535349b22f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,6 +28,7 @@
#include "cgroup-internal.h"
#include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
@@ -5334,7 +5335,8 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
return 0;
}
-static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+static int cgroup_attach_permissions(struct task_struct *task,
+ struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
struct super_block *sb, bool threadgroup,
struct cgroup_namespace *ns)
@@ -5350,9 +5352,9 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
return ret;
if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
- ret = -EOPNOTSUPP;
+ return -EOPNOTSUPP;
- return ret;
+ return bpf_lsm_cgroup_attach(task, src_cgrp, dst_cgrp, sb, threadgroup, ns);
}
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
@@ -5384,7 +5386,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* inherited fd attacks.
*/
scoped_with_creds(of->file->f_cred)
- ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ ret = cgroup_attach_permissions(task, src_cgrp, dst_cgrp,
of->file->f_path.dentry->d_sb,
threadgroup, ctx->ns);
if (ret)
@@ -6669,6 +6671,7 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
/**
* cgroup_css_set_fork - find or create a css_set for a child process
+ * @task: the task to be attached
* @kargs: the arguments passed to create the child process
*
* This functions finds or creates a new css_set which the child
@@ -6683,7 +6686,8 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
* before grabbing cgroup_threadgroup_rwsem and will hold a reference
* to the target cgroup.
*/
-static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+static int cgroup_css_set_fork(struct task_struct *task,
+ struct kernel_clone_args *kargs)
__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
int ret;
@@ -6752,7 +6756,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
* cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
* to always use the caller's credentials.
*/
- ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+ ret = cgroup_attach_permissions(task, cset->dfl_cgrp, dst_cgrp, sb,
!(kargs->flags & CLONE_THREAD),
current->nsproxy->cgroup_ns);
if (ret)
@@ -6824,7 +6828,7 @@ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
struct cgroup_subsys *ss;
int i, j, ret;
- ret = cgroup_css_set_fork(kargs);
+ ret = cgroup_css_set_fork(child, kargs);
if (ret)
return ret;
--
2.47.3
Hi. On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner <brauner@kernel.org> wrote: > Add a hook to manage attaching tasks to cgroup. I'm in the process of > adding various "universal truth" bpf programs to systemd that will make > use of this. > > This has been a long-standing request (cf. [1] and [2]). It will allow us to > enforce cgroup migrations and ensure that services can never escape their > cgroups. This is just one of many use-cases. > > Link: https://github.com/systemd/systemd/issues/6356 [1] > Link: https://github.com/systemd/systemd/issues/22874 [2] These two issues are misconfigured/misunderstood PAM configs. I don't think those warrant introduction of another permissions mechanism, furthermore they're relatively old and I estimate many of such configs must have been fixed in the course of time. As for services escaping their cgroups -- they needn't run as root, do they? And if you seek a mechanism how to prevent even root from migrations, there are cgroupnses for that. (BTW what would prevent a root detaching/disabling these hook progs anyway?) I think that the cgroup file permissions are sufficient for many use cases and this BPF hook is too tempting in unnecessary cases (like masking other issues). Could you please expand more about some other reasonable use cases not covered by those? (BTW I notice there's already a very similar BPF hook in sched_ext's cgroup_prep_move. It'd be nicer to have only one generic approach to these checks.) Regards, Michal
On Mon, Feb 23, 2026 at 04:47:11PM +0100, Michal Koutný wrote: > Hi. > > On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner <brauner@kernel.org> wrote: > > Add a hook to manage attaching tasks to cgroup. I'm in the process of > > adding various "universal truth" bpf programs to systemd that will make > > use of this. > > > > This has been a long-standing request (cf. [1] and [2]). It will allow us to > > enforce cgroup migrations and ensure that services can never escape their > > cgroups. This is just one of many use-cases. > > > > Link: https://github.com/systemd/systemd/issues/6356 [1] > > Link: https://github.com/systemd/systemd/issues/22874 [2] > > These two issues are misconfigured/misunderstood PAM configs. I don't > think those warrant introduction of another permissions mechanism, > furthermore they're relatively old and I estimate many of such configs > must have been fixed in the course of time. logind has to allow cgroup migrations but for say Docker this shouldn't be allowed. So calling this misconfiguration is like taking a shortcut by simply pointing to a different destination. But fine, let's say you insist on this not being valid. > As for services escaping their cgroups -- they needn't run as root, do > they? And if you seek a mechanism how to prevent even root from > migrations, there are cgroupnses for that. (BTW what would prevent a A bunch of tools that do cgroup migrations don't use cgroup namespaces and there's no requirement or way to enforce that they do. Plus, there's no requirement to only do cgroup management via systemd or its APIs. Frankly, I can't even blame userspace for not having widely adopted cgroup namespaces. The implementation atop of a single superblock like cgroupfs is questionable. But in general the point is that there's no mechanism to enforce cgroup tree policy currently in a sufficiently flexible manner. > root detaching/disabling these hook progs anyway?) I cannot help but read this as you asking me "What if you're too dumb to write a security policy that isn't self-defeating?" :) bpf has security hooks for itself including security_bpf(). First thing that comes to mind is to have security.bpf.* or trusted.* xattrs on selected processes like PID 1 that mark it as eligible for modifying BPF state or BPF LSM programs supervising link/prog detach, update etc and then designating only PID 1 as handing out those magical xattrs. Can be as fine-grained as needed and that tells everyone else to go away and do something else. There's more fine-grained mechanisms to deal with this. IOW, it's a solvable problem. > I think that the cgroup file permissions are sufficient for many use > cases and this BPF hook is too tempting in unnecessary cases (like > masking other issues). > Could you please expand more about some other reasonable use cases not > covered by those? systemd will gain the ability to implement policy to control cgroup tree modifications in as much details as it needs without having the kernel in need to be aware of it. This can take various forms by marking only select processes as being eligible for managing cgroup migrations or even just locking down specific cgroups. The policy needs to be flexible so it can be live-updated, switched into auditing mode, and losened, tightened on-demand as needed. > (BTW I notice there's already a very similar BPF hook in sched_ext's > cgroup_prep_move. It'd be nicer to have only one generic approach to > these checks.) This feels a bit like a wild goose chase. But fine, I'll look at it. /me goes off Ok, let's start with cgroup_can_fork(). The sched ext hook isn't a generic permission check. It's called way after cgroup_attach_permissions() and is a per cgroup controller check that is only called for some cgroup controllers. So effectively useless to pull up (Notice also, how some controllers like cpuset call additional security hooks already.). The same problem applies to writes for cgroup.procs and for subtree control. The sched ext hook are per cgroup controller not generically called. And they happen to be called in cgroup_migrate_execute() which is way deep in the callchain. When cgroup_attach_permissions() fails it's effectively free. If migrate_execute() fails it must put/free css sets, it must splice back task on mg_tasks, it must call cancel_attach() callbacks, thus it must call the sched-ext cancel callbacks for each already prepped task, it must uncharge pids for each already prepped task, it needs to unlock a bunch of stuff. On top of that this looks like a category mistake imho. The callbacks are a dac-like permission mechanism whereas the hooks is actual mac permission checking. I'm not sure lumping this together with per-cgroup-controller migration preparations will be very clean. I think it will end up looking rather confusing. But that's best left to you cgroup maintainers, I think.
Hello.
On Fri, Feb 27, 2026 at 02:44:27PM +0100, Christian Brauner <brauner@kernel.org> wrote:
> So calling this misconfiguration is like taking a shortcut
> by simply pointing to a different destination. But fine, let's say you
> insist on this not being valid.
I understand this in analogy with filesystem organization -- there's the
package manager that ensures files are put in right places,
non-conflicting and trackable. Subtrees may be delegated (e.g.
/usr/local). If root (or whoever has perms for it), decides to
manipulate the files, its up to them what they end up with.
> The implementation atop of a single superblock like cgroupfs is
> questionable.
(This is an interesting topic which I'd like to discuss some other
time not to diverge here.)
> But in general the point is that there's no mechanism to enforce cgroup
> tree policy currently in a sufficiently flexible manner.
>
> > root detaching/disabling these hook progs anyway?)
>
> I cannot help but read this as you asking me "What if you're too dumb to
> write a security policy that isn't self-defeating?" :)
I was just trying to express that there's only one level of root (user).
(Cautionary example are "containers" that executed as (host) root.
Lockdown neglected.)
> bpf has security hooks for itself including security_bpf(). First thing
> that comes to mind is to have security.bpf.* or trusted.* xattrs on
> selected processes like PID 1 that mark it as eligible for modifying BPF
> state or BPF LSM programs supervising link/prog detach, update etc and
> then designating only PID 1 as handing out those magical xattrs. Can be
> as fine-grained as needed and that tells everyone else to go away and do
> something else.
(These are too many new concepts for me, I must skip it now. I may catch
up after more study.)
> systemd will gain the ability to implement policy to control cgroup tree
> modifications in as much details as it needs without having the kernel
> in need to be aware of it. This can take various forms by marking only
> select processes as being eligible for managing cgroup migrations or
> even just locking down specific cgroups.
This is how I understand the goal could be expressed in current terms:
a) allowlisting processes that can do migrations
# common ancestor of all + access to each dst
chown -R :grA $root_cgroup/cgroup.procs
chmod -R g+w $root_cgroup/cgroup.procs
# static:
usermod -G grA user_of_pid
(re)start pid
# or in spawner:
fork
setgroups([grA])
exec
b) rules that are specific to cgroup (subtree)
# applying same like above but to a $lower_group
$ setfacl -R -m g:grB:w $lower_group/cgroup.procs
setfacl: cgroup.procs: Operation not supported
# here I notice why current impl isn't sufficient
Also, if I understand this correctly you semm to move from the semantics
where users (UIDs) are subjects to a different one where it's bound to
processes (PIDs).
> The policy needs to be flexible so it can be live-updated, switched into
> auditing mode, and losened, tightened on-demand as needed.
OK.
(I'd add that policy should be also easily debuggable/troubleshootable.)
> Ok, let's start with cgroup_can_fork(). The sched ext hook isn't a
> generic permission check. It's called way after
> cgroup_attach_permissions() and is a per cgroup controller check that is
> only called for some cgroup controllers. So effectively useless to pull
> up (Notice also, how some controllers like cpuset call additional
> security hooks already.).
There could be one BPF predicate (on the cgroup level) and potentially
pass per-controller data, so that function could employ (or not) those.
It's true that semantics would be a bit different because of implicit
migrations happening with controller en-/disablement.
What I don't like about the multiple hooks is that there'd be several
places to check when one is trying to figure out why a migration failed.
> On top of that this looks like a category mistake imho. The callbacks
> are a dac-like permission mechanism whereas the hooks is actual mac
> permission checking. I'm not sure lumping this together with
> per-cgroup-controller migration preparations will be very clean. I think
> it will end up looking rather confusing. But that's best left to you
> cgroup maintainers, I think.
This paragraph hinted me at (yet) another mechanism in the kernel (and
you also mentioned it with cpuset) -- the LSM hooks. Namely, if this was
security_cgroup_attach() hook, the logic could be expressed with other
existing modules, IIUC, one of them is BPF. Would that fulfil the
behaviors you're missing?
(I'm proposing this as potentially less confusing/known "evil" approach
to the scenarios considered above.)
Thanks,
Michal
Hello, On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner wrote: > Add a hook to manage attaching tasks to cgroup. I'm in the process of > adding various "universal truth" bpf programs to systemd that will make > use of this. > > This has been a long-standing request (cf. [1] and [2]). It will allow us to > enforce cgroup migrations and ensure that services can never escape their > cgroups. This is just one of many use-cases. From cgroup POV, this looks fine to me but I'm curious whether something dumber would also work. With CLONE_INTO_CGROUP, cgroup migration isn't necessary at all. Would something dumber like a mount option disabling cgroup migrations completely work too or would that be too restrictive? Thanks. -- tejun
On Fri, Feb 20, 2026 at 05:16:13AM -1000, Tejun Heo wrote: > Hello, > > On Fri, Feb 20, 2026 at 01:38:30AM +0100, Christian Brauner wrote: > > Add a hook to manage attaching tasks to cgroup. I'm in the process of > > adding various "universal truth" bpf programs to systemd that will make > > use of this. > > > > This has been a long-standing request (cf. [1] and [2]). It will allow us to > > enforce cgroup migrations and ensure that services can never escape their > > cgroups. This is just one of many use-cases. > > >From cgroup POV, this looks fine to me but I'm curious whether something > dumber would also work. With CLONE_INTO_CGROUP, cgroup migration isn't > necessary at all. Would something dumber like a mount option disabling > cgroup migrations completely work too or would that be too restrictive? It would be too restrictive. I've played with various policies. For example, a small set of tasks (like PID 1 or the session manager) are allowed to move processes between cgroups (detectable via e.g., xattrs). No other task is allowd. But that's already too restrictive because it fscks over delegated subcgroups were tasks need to be moved around (container managers etc.). IOW, any policy must be quite modular and dynamic so a simple mount option wouldn't cover it. As a sidenote, there would be other mount options that would be useful but that currently aren't that easy to support/implement because of the way cgroupfs (for historical reasons ofc) is architected where it shares a single superblock. I have a series (from quite some time ago) that makes cgroupfs truly multi-instance. It would effectively behave just like tmpfs does. A new mount gets you a new superblock. But once you have that you can e.g., simplify cgroup namespaces as well. I've done that work originally to support idmapped mounts with cgroupfs but I can't find that branch anymore.
© 2016 - 2026 Red Hat, Inc.