Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's
lifetime to the pidfd returned from clone3(). When the last reference to
the struct file created by clone3() is closed the kernel sends SIGKILL
to the child. A pidfd obtained via pidfd_open() for the same process
does not keep the child alive and does not trigger autokill - only the
specific struct file from clone3() has this property.
This is useful for container runtimes, service managers, and sandboxed
subprocess execution - any scenario where the child must die if the
parent crashes or abandons the pidfd.
CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying
lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no
one to reap it would become a zombie). CLONE_THREAD is rejected because
autokill targets a process not a thread.
The clone3 pidfd is identified by storing a pointer to the struct file in
signal_struct.autokill_pidfd. The pidfs .release handler compares the
file being closed against this pointer and sends SIGKILL via
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...) only on match. Files
from pidfd_open() or open_by_handle_at() are distinct struct files and
will never match. dup()/fork() share the same struct file so they extend
the child's lifetime until the last reference drops.
Unlike pdeath_signal, autokill isn't disarmed on exec or on credential
changes that cross privilege boundaries. Disarming it would defeat the
purpose of this whole endeavour.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/pidfs.c | 16 ++++++++++++++++
include/linux/sched/signal.h | 3 +++
include/uapi/linux/sched.h | 1 +
kernel/fork.c | 16 ++++++++++++++--
4 files changed, 34 insertions(+), 2 deletions(-)
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 318253344b5c..b3891b2097eb 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -8,6 +8,8 @@
#include <linux/mount.h>
#include <linux/pid.h>
#include <linux/pidfs.h>
+#include <linux/sched/signal.h>
+#include <linux/signal.h>
#include <linux/pid_namespace.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
@@ -637,7 +639,21 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return open_namespace(ns_common);
}
+static int pidfs_file_release(struct inode *inode, struct file *file)
+{
+ struct pid *pid = inode->i_private;
+ struct task_struct *task;
+
+ guard(rcu)();
+ task = pid_task(pid, PIDTYPE_TGID);
+ if (task && READ_ONCE(task->signal->autokill_pidfd) == file)
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID);
+
+ return 0;
+}
+
static const struct file_operations pidfs_file_operations = {
+ .release = pidfs_file_release,
.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo,
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index f842c86b806f..85a3de5c4030 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -134,6 +134,9 @@ struct signal_struct {
unsigned int has_child_subreaper:1;
unsigned int autoreap:1;
+ /* pidfd that triggers SIGKILL on close, or NULL */
+ const struct file *autokill_pidfd;
+
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 8a22ea640817..b1aea8a86e2f 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -37,6 +37,7 @@
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
#define CLONE_AUTOREAP 0x400000000ULL /* Auto-reap child on exit. */
+#define CLONE_PIDFD_AUTOKILL 0x800000000ULL /* Kill child when clone pidfd closes. */
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
diff --git a/kernel/fork.c b/kernel/fork.c
index bc27dc10c309..7bcdba54c9a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2035,6 +2035,15 @@ __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ if (clone_flags & CLONE_PIDFD_AUTOKILL) {
+ if (!(clone_flags & CLONE_PIDFD))
+ return ERR_PTR(-EINVAL);
+ if (!(clone_flags & CLONE_AUTOREAP))
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -2470,8 +2479,11 @@ __latent_entropy struct task_struct *copy_process(
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
- if (pidfile)
+ if (pidfile) {
+ if (clone_flags & CLONE_PIDFD_AUTOKILL)
+ p->signal->autokill_pidfd = pidfile;
fd_install(pidfd, pidfile);
+ }
proc_fork_connector(p);
sched_post_fork(p);
@@ -2909,7 +2921,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
- CLONE_AUTOREAP))
+ CLONE_AUTOREAP | CLONE_PIDFD_AUTOKILL))
return false;
/*
--
2.47.3
On Tue, 17 Feb 2026 at 14:36, Christian Brauner <brauner@kernel.org> wrote:
>
> Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's
> lifetime to the pidfd returned from clone3(). When the last reference to
> the struct file created by clone3() is closed the kernel sends SIGKILL
> to the child.
Did I read this right? You can now basically kill suid binaries that
you started but don't have rights to kill any other way.
If I'm right, this is completely broken. Please explain.
Linus
On Wed, Feb 18, 2026 at 12:18 AM Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Tue, 17 Feb 2026 at 14:36, Christian Brauner <brauner@kernel.org> wrote: > > > > Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's > > lifetime to the pidfd returned from clone3(). When the last reference to > > the struct file created by clone3() is closed the kernel sends SIGKILL > > to the child. > > Did I read this right? You can now basically kill suid binaries that > you started but don't have rights to kill any other way. > > If I'm right, this is completely broken. Please explain. You can already send SIGHUP to such binaries through things like job control, right? Do we know if there are setuid binaries out there that change their ruid and suid to prevent being killable via kill_ok_by_cred(), then set SIGHUP to SIG_IGN to not be killable via job control, and then do some work that shouldn't be interrupted? Also, on a Linux system with systemd, I believe a normal user, when running in the context of a user session (but not when running in the context of a system service), can already SIGKILL anything they launch by launching it in a systemd user service, then doing something like "echo 1 > /sys/fs/cgroup/user.slice/user-$UID.slice/user@$UID.service/app.slice/<servicename>.scope/cgroup.kill" because systemd delegates cgroups for anything a user runs to that user; and cgroup.kill goes through the codepath cgroup_kill_write -> cgroup_kill -> __cgroup_kill -> send_sig(SIGKILL, task, 0) -> send_sig_info -> do_send_sig_info which, as far as I know, bypasses the normal signal sending permission checks. (For comparison, group_send_sig_info() first calls check_kill_permission(), then do_send_sig_info().) I agree that this would be a change to the security model, but I'm not sure if it would be that big a change. 
I guess an alternative might be to instead gate the clone() flag on a `task_no_new_privs(current) || ns_capable()` check like in seccomp, but that might be too restrictive for the usecases Christian has in mind...
On Tue, 17 Feb 2026 at 15:38, Jann Horn <jannh@google.com> wrote:
>
> You can already send SIGHUP to such binaries through things like job
> control, right?
But at least those can be blocked, and people can disassociate
themselves from a tty if they care etc.
This seems like it can't be blocked any way, although I guess you can
just do the double fork dance to distance yourself from your parent.
> Also, on a Linux system with systemd, I believe a normal user, when
> running in the context of a user session (but not when running in the
> context of a system service), can already SIGKILL anything they launch
> by launching it in a systemd user service, then doing something [...]
Ugh. But at least it's not the kernel that does it, and we have rules
for sending signals.
> I agree that this would be a change to the security model, but I'm not
> sure if it would be that big a change.
I would expect most normal binaries to expect to be killed with ^C etc
anyway, so in that sense this is indeed likely not a big deal. But at
least those are well-known and traditional ways of getting signals
that people kind of expect.
But it does seem to violate all the normal 'kill()' checks, and it
smells horribly bad.
Linus
On Tue, Feb 17, 2026 at 03:44:52PM -0800, Linus Torvalds wrote: > On Tue, 17 Feb 2026 at 15:38, Jann Horn <jannh@google.com> wrote: > > > > You can already send SIGHUP to such binaries through things like job > > control, right? > > But at least those can be blocked, and people can disassociate > themselves from a tty if they care etc. > > This seems like it can't be blocked any way, although I guess you can > just do the double fork dance to distance yourself from your parent. > > > Also, on a Linux system with systemd, I believe a normal user, when > > running in the context of a user session (but not when running in the > > context of a system service), can already SIGKILL anything they launch > > by launching it in a systemd user service, then doing something [...] > > Ugh. But at least it's not the kernel that does it, and we have rules > for sending signals. > > > I agree that this would be a change to the security model, but I'm not > > sure if it would be that big a change. > > I would expect most normal binaries to expect to be killed with ^C etc > anyway, so in that sense this is indeed likely not a big deal. But at > least those are well-known and traditional ways of getting signals > that people kind of expecy. I think you missed the message that I sent as a reply right away. I'm very aware that as written this will allow users to kill setuid binaries. I explicitly wrote the first RFC so autokill isn't reset during bprm->secureexec nor during commit_creds() - in contrast to pdeath signal. I'm very aware of all of this and am calling it out in the commit message as well. The kill-on-close contract cannot be flaunted no matter what gets executed very much in contrast to pdeath_signal which is annoying because it magically gets unset and then userspace needs to know when it got unset and then needs to reset it again. My ideal model for kill-on-close is to just ruthlessly enforce that the kernel murders anything once the file is released. 
I would value input under what circumstances we could make this work without having the kernel magically unset it under magical circumstances that are completely opaque to userspace.
On Wed, Feb 18, 2026 at 09:18:49AM +0100, Christian Brauner wrote:
> The kill-on-close contract cannot be flaunted no matter what gets
> executed very much in contrast to pdeath_signal which is annoying
> because it magically gets unset and then userspace needs to know when it
> got unset and then needs to reset it again.
I think you mean "violated", not "flaunted", above.
If a process can do the double-fork dance to avoid getting killed, is
that a problem with your use case?
What if we give the process time to exit before we bring down the
hammer, as I suggested in another message on this thread?
> My ideal model for kill-on-close is to just ruthlessly enforce that the
> kernel murders anything once the file is released. I would value input
> under what circumstances we could make this work without having the
> kernel magically unset it under magical circumstances that are
> completely opaque to userspace.
I don't think this proposal would fly, but what if an exec of a setuid
binary fails with an error if the AUTOKILL flag is set? :-)
- Ted
On Tue, Feb 17, 2026 at 03:44:52PM -0800, Linus Torvalds wrote: > On Tue, 17 Feb 2026 at 15:38, Jann Horn <jannh@google.com> wrote: > > > > You can already send SIGHUP to such binaries through things like job > > control, right? > > But at least those can be blocked, and people can disassociate > themselves from a tty if they care etc. Does CLONE_PIDFD_AUTOKILL need to send a SIGKILL? Could it be something that could be trapped/blocked, like SIGHUP or SIGTERM? Or maybe we could do the SIGHUP, wait 30 seconds (+/- a random delay), if it hasn't exited, send SIGTERM, wait another 30 seconds (+/- a random delay) if it hasn't exited send a SIGKILL. That's still a change in the security model, but it's less likely to cause problems if the goal is to try to catch a setuid program while it is in the middle of editing some critical file such as /etc/sudo.conf or /etc/passwd or some such. I bet we'll still see some zero days coming out of this, but we can at least mitigate likelihood of security breach. - Ted
On Wed, Feb 18, 2026 at 12:38:02AM +0100, Jann Horn wrote:
> On Wed, Feb 18, 2026 at 12:18 AM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> > On Tue, 17 Feb 2026 at 14:36, Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's
> > > lifetime to the pidfd returned from clone3(). When the last reference to
> > > the struct file created by clone3() is closed the kernel sends SIGKILL
> > > to the child.
> >
> > Did I read this right? You can now basically kill suid binaries that
> > you started but don't have rights to kill any other way.
> >
> > If I'm right, this is completely broken. Please explain.
>
> You can already send SIGHUP to such binaries through things like job
> control, right?
> Do we know if there are setuid binaries out there that change their
> ruid and suid to prevent being killable via kill_ok_by_cred(), then
> set SIGHUP to SIG_IGN to not be killable via job control, and then do
> some work that shouldn't be interrupted?
>
> Also, on a Linux system with systemd, I believe a normal user, when
> running in the context of a user session (but not when running in the
> context of a system service), can already SIGKILL anything they launch
> by launching it in a systemd user service, then doing something like
> "echo 1 > /sys/fs/cgroup/user.slice/user-$UID.slice/user@$UID.service/app.slice/<servicename>.scope/cgroup.kill"
> because systemd delegates cgroups for anything a user runs to that
> user; and cgroup.kill goes through the codepath
> cgroup_kill_write -> cgroup_kill -> __cgroup_kill -> send_sig(SIGKILL,
> task, 0) -> send_sig_info -> do_send_sig_info
> which, as far as I know, bypasses the normal signal sending permission
> checks. (For comparison, group_send_sig_info() first calls
> check_kill_permission(), then do_send_sig_info().)
>
> I agree that this would be a change to the security model, but I'm not
> sure if it would be that big a change. I guess an alternative might be
> to instead gate the clone() flag on a `task_no_new_privs(current) ||
> ns_capable()` check like in seccomp, but that might be too restrictive
> for the usecases Christian has in mind...
So I'm going to briefly reiterate what I wrote in my other replies because
I really don't want to give anyone the impression that I don't understand
that this is a change in the security model - It's what I explicitly
wanted to discuss:
I'm very aware that as written this will allow users to kill setuid
binaries. I explicitly wrote the first RFC so autokill isn't reset during
bprm->secureexec nor during commit_creds() - in contrast to pdeath
signal.
I did indeed think of simply using the seccomp model. I have a long
document about all of the different implications for all of this.
Ideally we'd not have to use the seccomp model but if we have to I'm
fine with it. There are two problems I would want to avoid though. Right
now pdeath_signal is reset on _any_ set*id() transition via
commit_creds(). Which makes it really useless.
For example, if you setup a container the child sets pdeath_signal so it
gets auto-killed when the container setup process dies. But as soon as
the child uses set*id() calls to become privileged over the container's
namespaces pdeath_signal magically gets reset. So all container runtimes
have this annoying code in some form:
static int do_start(void *data) /* container workload that gets setup */
{
<snip>
/* This prctl must be before the synchro, so if the parent dies before
* we set the parent death signal, we will detect its death with the
* synchro right after, otherwise we have a window where the parent can
* exit before we set the pdeath signal leading to a unsupervized
* container.
*/
ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid, status_fd);
if (ret < 0) {
SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
goto out_warn_father;
}
<snip>
/* If we are in a new user namespace, become root there to have
* privilege over our namespace.
*/
if (!list_empty(&handler->conf->id_map)) {
<snip>
/* Drop groups only after we switched to a valid gid in the new
* user namespace.
*/
if (!lxc_drop_groups() &&
(handler->am_root || errno != EPERM))
goto out_warn_father;
if (!lxc_switch_uid_gid(nsuid, nsgid))
goto out_warn_father;
ret = prctl(PR_SET_DUMPABLE, prctl_arg(1), prctl_arg(0),
prctl_arg(0), prctl_arg(0));
if (ret < 0)
goto out_warn_father;
/* set{g,u}id() clears deathsignal */
ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid, status_fd);
if (ret < 0) {
SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
goto out_warn_father;
}
<snip>
I can't stress how useless this often makes pdeath_signal. Let alone
that the child must set it so there's always a race with the parent
dying while the child is setting it. And obviously it isn't just
containers. It's anything that deprivileges itself including some
services.
If we require the seccomp task_no_new_privs() thing I really really
would like to not have to reset autokill during commit_creds().
Because then it is at least consistent for task_no_new_privs() without
magic resets.
TL;DR as long as we can come up with a model where there are no magical
resets of the property by the kernel this is useful.
On Tue, Feb 17, 2026 at 11:36 PM Christian Brauner <brauner@kernel.org> wrote:
> Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's
> lifetime to the pidfd returned from clone3(). When the last reference to
> the struct file created by clone3() is closed the kernel sends SIGKILL
> to the child. A pidfd obtained via pidfd_open() for the same process
> does not keep the child alive and does not trigger autokill - only the
> specific struct file from clone3() has this property.
>
> This is useful for container runtimes, service managers, and sandboxed
> subprocess execution - any scenario where the child must die if the
> parent crashes or abandons the pidfd.
Idle thought, feel free to ignore:
In those scenarios, I guess what you'd ideally want would be a way to
kill the entire process hierarchy, not just the one process that was
spawned? Unless the process is anyway PID 1 of its own pid namespace.
But that would probably be more invasive and kind of an orthogonal
feature...
[...]
> +static int pidfs_file_release(struct inode *inode, struct file *file)
> +{
> + struct pid *pid = inode->i_private;
> + struct task_struct *task;
> +
> + guard(rcu)();
> + task = pid_task(pid, PIDTYPE_TGID);
> + if (task && READ_ONCE(task->signal->autokill_pidfd) == file)
Can you maybe also clear out the task->signal->autokill_pidfd pointer
here? It should be fine in practice either way, but theoretically,
with the current code, this equality check could wrongly match if the
actual autokill file has been released and a new pidfd file has been
reallocated at the same address... Of course, at worst that would kill
a task that has already been killed, so it wouldn't be particularly
bad, but still it's ugly.
> + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID);
> +
> + return 0;
> +}
[...]
> @@ -2470,8 +2479,11 @@ __latent_entropy struct task_struct *copy_process(
> syscall_tracepoint_update(p);
> write_unlock_irq(&tasklist_lock);
>
> - if (pidfile)
> + if (pidfile) {
> + if (clone_flags & CLONE_PIDFD_AUTOKILL)
> + p->signal->autokill_pidfd = pidfile;
WRITE_ONCE() to match the READ_ONCE() in pidfs_file_release()?
> fd_install(pidfd, pidfile);
> + }
>
> proc_fork_connector(p);
> sched_post_fork(p);
On Wed, Feb 18, 2026 at 12:43:59AM +0100, Jann Horn wrote:
> On Tue, Feb 17, 2026 at 11:36 PM Christian Brauner <brauner@kernel.org> wrote:
> > Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's
> > lifetime to the pidfd returned from clone3(). When the last reference to
> > the struct file created by clone3() is closed the kernel sends SIGKILL
> > to the child. A pidfd obtained via pidfd_open() for the same process
> > does not keep the child alive and does not trigger autokill - only the
> > specific struct file from clone3() has this property.
> >
> > This is useful for container runtimes, service managers, and sandboxed
> > subprocess execution - any scenario where the child must die if the
> > parent crashes or abandons the pidfd.
>
> Idle thought, feel free to ignore:
> In those scenarios, I guess what you'd ideally want would be a way to
> kill the entire process hierarchy, not just the one process that was
> spawned? Unless the process is anyway PID 1 of its own pid namespace.
> But that would probably be more invasive and kind of an orthogonal
> feature...
It's something that I have as an exploration item on a ToDo. :)
>
> [...]
> > +static int pidfs_file_release(struct inode *inode, struct file *file)
> > +{
> > + struct pid *pid = inode->i_private;
> > + struct task_struct *task;
> > +
> > + guard(rcu)();
> > + task = pid_task(pid, PIDTYPE_TGID);
> > + if (task && READ_ONCE(task->signal->autokill_pidfd) == file)
>
> Can you maybe also clear out the task->signal->autokill_pidfd pointer
> here? It should be fine in practice either way, but theoretically,
Yes, of course.
> with the current code, this equality check could wrongly match if the
> actual autokill file has been released and a new pidfd file has been
> reallocated at the same address... Of course, at worst that would kill
> a task that has already been killed, so it wouldn't be particularly
> bad, but still it's ugly.
>
> > + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID);
> > +
> > + return 0;
> > +}
> [...]
> > @@ -2470,8 +2479,11 @@ __latent_entropy struct task_struct *copy_process(
> > syscall_tracepoint_update(p);
> > write_unlock_irq(&tasklist_lock);
> >
> > - if (pidfile)
> > + if (pidfile) {
> > + if (clone_flags & CLONE_PIDFD_AUTOKILL)
> > + p->signal->autokill_pidfd = pidfile;
>
> WRITE_ONCE() to match the READ_ONCE() in pidfs_file_release()?
Agreed.
On 02/17, Christian Brauner wrote:
>
> @@ -2470,8 +2479,11 @@ __latent_entropy struct task_struct *copy_process(
> syscall_tracepoint_update(p);
> write_unlock_irq(&tasklist_lock);
>
> - if (pidfile)
> + if (pidfile) {
> + if (clone_flags & CLONE_PIDFD_AUTOKILL)
> + p->signal->autokill_pidfd = pidfile;
> fd_install(pidfd, pidfile);
Just curious... Instead of adding signal->autokill_pidfd, can't we
add another "not fcntl" PIDFD_AUTOKILL flag that lives in ->f_flags ?
Oleg.
On Wed, Feb 18, 2026 at 12:50:41PM +0100, Oleg Nesterov wrote:
> On 02/17, Christian Brauner wrote:
> >
> > @@ -2470,8 +2479,11 @@ __latent_entropy struct task_struct *copy_process(
> > syscall_tracepoint_update(p);
> > write_unlock_irq(&tasklist_lock);
> >
> > - if (pidfile)
> > + if (pidfile) {
> > + if (clone_flags & CLONE_PIDFD_AUTOKILL)
> > + p->signal->autokill_pidfd = pidfile;
> > fd_install(pidfd, pidfile);
>
> Just curious... Instead of adding signal->autokill_pidfd, can't we
> add another "not fcntl" PIDFD_AUTOKILL flag that lives in ->f_flags ?
This is a version I had as well and yes, that works too!
© 2016 - 2026 Red Hat, Inc.