Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's
lifetime to the pidfd returned from clone3(). When the last reference to
the struct file created by clone3() is closed the kernel sends SIGKILL
to the child. A pidfd obtained via pidfd_open() for the same process
does not keep the child alive and does not trigger autokill - only the
specific struct file from clone3() has this property.
This is useful for container runtimes, service managers, and sandboxed
subprocess execution - any scenario where the child must die if the
parent crashes or abandons the pidfd.
CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying
lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no
one to reap it would become a zombie). CLONE_THREAD is rejected because
autokill targets a process not a thread.
The clone3 pidfd is identified by the PIDFD_AUTOKILL file flag set on
the struct file at clone3() time. The pidfs .release handler checks this
flag and sends SIGKILL via do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...)
only when it is set. Files from pidfd_open() or open_by_handle_at() are
distinct struct files that do not carry this flag. dup()/fork() share the
same struct file so they extend the child's lifetime until the last
reference drops.
CLONE_PIDFD_AUTOKILL automatically sets no_new_privs on the child
process. This ensures the child cannot escalate privileges beyond the
parent's credential level via setuid/setgid exec. Because the child can
never outprivilege the parent the autokill SIGKILL is always within the
parent's natural authority.
This is a deliberate departure from the pdeath_signal model, which is
reset during secureexec and commit_creds(), rendering it useless for
container runtimes that need to deprivilege themselves. Setting
no_new_privs on the child avoids the need for any such magical resets:
the kill-on-close contract is absolute.
The no_new_privs restriction only affects the child. The parent retains
its full privileges and can continue to execute setuid binaries.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/pidfs.c | 38 ++++++++++++++++++++++++++++++++------
include/uapi/linux/pidfd.h | 1 +
include/uapi/linux/sched.h | 1 +
kernel/fork.c | 22 +++++++++++++++++++---
4 files changed, 53 insertions(+), 9 deletions(-)
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 318253344b5c..a8d1bca0395d 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -8,6 +8,8 @@
#include <linux/mount.h>
#include <linux/pid.h>
#include <linux/pidfs.h>
+#include <linux/sched/signal.h>
+#include <linux/signal.h>
#include <linux/pid_namespace.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
@@ -637,7 +639,28 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return open_namespace(ns_common);
}
+static int pidfs_file_release(struct inode *inode, struct file *file)
+{
+ struct pid *pid = inode->i_private;
+ struct task_struct *task;
+
+ if (!(file->f_flags & PIDFD_AUTOKILL))
+ return 0;
+
+ guard(rcu)();
+ task = pid_task(pid, PIDTYPE_TGID);
+ if (!task)
+ return 0;
+
+ /* Not available for kthreads or user workers for now. */
+ if (WARN_ON_ONCE(task->flags & (PF_KTHREAD | PF_USER_WORKER)))
+ return 0;
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID);
+ return 0;
+}
+
static const struct file_operations pidfs_file_operations = {
+ .release = pidfs_file_release,
.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo,
@@ -1093,11 +1116,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
int ret;
/*
- * Ensure that PIDFD_STALE can be passed as a flag without
- * overloading other uapi pidfd flags.
+ * Ensure that internal pidfd flags don't overlap with each
+ * other or with uapi pidfd flags.
*/
- BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
- BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
+ BUILD_BUG_ON(hweight32(PIDFD_THREAD | PIDFD_NONBLOCK |
+ PIDFD_STALE | PIDFD_AUTOKILL) != 4);
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
if (ret < 0)
@@ -1108,9 +1131,12 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
flags &= ~PIDFD_STALE;
flags |= O_RDWR;
pidfd_file = dentry_open(&path, flags, current_cred());
- /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
+ /*
+ * Raise PIDFD_THREAD and PIDFD_AUTOKILL explicitly as
+ * do_dentry_open() strips O_EXCL and O_TRUNC.
+ */
if (!IS_ERR(pidfd_file))
- pidfd_file->f_flags |= (flags & PIDFD_THREAD);
+ pidfd_file->f_flags |= (flags & (PIDFD_THREAD | PIDFD_AUTOKILL));
return pidfd_file;
}
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index ea9a6811fc76..9281956a9f32 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -13,6 +13,7 @@
#ifdef __KERNEL__
#include <linux/sched.h>
#define PIDFD_STALE CLONE_PIDFD
+#define PIDFD_AUTOKILL O_TRUNC
#endif
/* Flags for pidfd_send_signal(). */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 8a22ea640817..b1aea8a86e2f 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -37,6 +37,7 @@
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
#define CLONE_AUTOREAP 0x400000000ULL /* Auto-reap child on exit. */
+#define CLONE_PIDFD_AUTOKILL 0x800000000ULL /* Kill child when clone pidfd closes. */
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
diff --git a/kernel/fork.c b/kernel/fork.c
index 0dedf2999f0c..778aed24e01d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2037,6 +2037,15 @@ __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ if (clone_flags & CLONE_PIDFD_AUTOKILL) {
+ if (!(clone_flags & CLONE_PIDFD))
+ return ERR_PTR(-EINVAL);
+ if (!(clone_flags & CLONE_AUTOREAP))
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -2259,13 +2268,20 @@ __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+ unsigned flags = PIDFD_STALE;
+
+ if (clone_flags & CLONE_THREAD)
+ flags |= PIDFD_THREAD;
+ if (clone_flags & CLONE_PIDFD_AUTOKILL) {
+ task_set_no_new_privs(p);
+ flags |= PIDFD_AUTOKILL;
+ }
/*
* Note that no task has been attached to @pid yet indicate
* that via CLONE_PIDFD.
*/
- retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
+ retval = pidfd_prepare(pid, flags, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
@@ -2909,7 +2925,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
- CLONE_AUTOREAP))
+ CLONE_AUTOREAP | CLONE_PIDFD_AUTOKILL))
return false;
/*
--
2.47.3
On 02/23, Christian Brauner wrote:
>
> @@ -2259,13 +2268,20 @@ __latent_entropy struct task_struct *copy_process(
> * if the fd table isn't shared).
> */
> if (clone_flags & CLONE_PIDFD) {
> - int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
> + unsigned flags = PIDFD_STALE;
> +
> + if (clone_flags & CLONE_THREAD)
> + flags |= PIDFD_THREAD;
> + if (clone_flags & CLONE_PIDFD_AUTOKILL) {
> + task_set_no_new_privs(p);
> + flags |= PIDFD_AUTOKILL;
> + }
>
> /*
> * Note that no task has been attached to @pid yet indicate
> * that via CLONE_PIDFD.
> */
> - retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
> + retval = pidfd_prepare(pid, flags, &pidfile);
Confused... I think you also need to change pidfs_alloc_file() to restore
O_TRUNC after do_dentry_open() clears this flag? Just like it currently does
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
?
Oleg.
On 02/23, Oleg Nesterov wrote:
>
> On 02/23, Christian Brauner wrote:
> >
> > @@ -2259,13 +2268,20 @@ __latent_entropy struct task_struct *copy_process(
> > * if the fd table isn't shared).
> > */
> > if (clone_flags & CLONE_PIDFD) {
> > - int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
> > + unsigned flags = PIDFD_STALE;
> > +
> > + if (clone_flags & CLONE_THREAD)
> > + flags |= PIDFD_THREAD;
> > + if (clone_flags & CLONE_PIDFD_AUTOKILL) {
> > + task_set_no_new_privs(p);
> > + flags |= PIDFD_AUTOKILL;
> > + }
> >
> > /*
> > * Note that no task has been attached to @pid yet indicate
> > * that via CLONE_PIDFD.
> > */
> > - retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
> > + retval = pidfd_prepare(pid, flags, &pidfile);
>
> Confused... I think you also need to change pidfs_alloc_file() to restore
> O_TRUNC after do_dentry_open() clears this flag? Just like it currently does
>
> pidfd_file->f_flags |= (flags & PIDFD_THREAD);
Aah! please ignore me. Somehow I missed exactly this change in your patch.
Sorry for noise!
Oleg.
On 02/23, Oleg Nesterov wrote:
>
> Sorry for noise!
Yes, but let me add more (off-topic) noise to this thread...
pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
makes no sense because pidfs_alloc_file() itself does
flags |= O_RDWR;
I was going to send the trivial cleanup, but why a pidfs file needs
O_RDWR/FMODE_WRITE ?
Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
users, for example signalfd.c.
Can you explain just for my education?
Oleg.
On 02/23, Oleg Nesterov wrote:
>
> pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> makes no sense because pidfs_alloc_file() itself does
>
> flags |= O_RDWR;
>
> I was going to send the trivial cleanup, but why a pidfs file needs
> O_RDWR/FMODE_WRITE ?
>
> Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
> users, for example signalfd.c.
perhaps an accidental legacy from 628ff7c1d8d8 ("anonfd: Allow making anon
files read-only") ?
Oleg.
On Mon, Feb 23, 2026 at 08:21:02PM +0100, Oleg Nesterov wrote:
> On 02/23, Oleg Nesterov wrote:
> >
> > pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> > makes no sense because pidfs_alloc_file() itself does
> >
> > flags |= O_RDWR;
> >
> > I was going to send the trivial cleanup, but why a pidfs file needs
> > O_RDWR/FMODE_WRITE ?
> >
> > Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
> > users, for example signalfd.c.
>
> perhaps an accidental legacy from 628ff7c1d8d8 ("anonfd: Allow making anon
> files read-only") ?
It was always a possibility that we would support some form of
write-like operation eventually. And we have support for setting trusted
extended attributes on pidfds for some time now (trusted xattrs require
global cap_sys_admin).
On 02/23, Christian Brauner wrote:
>
> On Mon, Feb 23, 2026 at 08:21:02PM +0100, Oleg Nesterov wrote:
> > On 02/23, Oleg Nesterov wrote:
> > >
> > > pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> > > makes no sense because pidfs_alloc_file() itself does
> > >
> > > flags |= O_RDWR;
> > >
> > > I was going to send the trivial cleanup, but why a pidfs file needs
> > > O_RDWR/FMODE_WRITE ?
> > >
> > > Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
> > > users, for example signalfd.c.
> >
> > perhaps an accidental legacy from 628ff7c1d8d8 ("anonfd: Allow making anon
> > files read-only") ?
>
> It was always a possibility that we would support some form of
> write-like operation eventually. And we have support for setting trusted
> extended attributes on pidfds for some time now (trusted xattrs require
> global cap_sys_admin).
But why do we need O_RDWR right now? That was my question.
I can be easily wrong, but I think that pidfs_xattr_handlers logic doesn't
need it...
OK, I won't pretend I understand fs, I'll send the trivial cleanup which just
removes the unnecessary "flags | O_RDWR" in pidfd_prepare().
Oleg.
On Tue, Feb 24, 2026 at 11:17:43AM +0100, Oleg Nesterov wrote:
> On 02/23, Christian Brauner wrote:
> >
> > On Mon, Feb 23, 2026 at 08:21:02PM +0100, Oleg Nesterov wrote:
> > > On 02/23, Oleg Nesterov wrote:
> > > >
> > > > pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> > > > makes no sense because pidfs_alloc_file() itself does
> > > >
> > > > flags |= O_RDWR;
> > > >
> > > > I was going to send the trivial cleanup, but why a pidfs file needs
> > > > O_RDWR/FMODE_WRITE ?
> > > >
> > > > Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
> > > > users, for example signalfd.c.
> > >
> > > perhaps an accidental legacy from 628ff7c1d8d8 ("anonfd: Allow making anon
> > > files read-only") ?
> >
> > It was always a possibility that we would support some form of
> > write-like operation eventually. And we have support for setting trusted
> > extended attributes on pidfds for some time now (trusted xattrs require
> > global cap_sys_admin).
>
> But why do we need O_RDWR right now? That was my question.
>
> I can be easily wrong, but I think that pidfs_xattr_handlers logic doesn't
> need it...
>
> OK, I won't pretend I understand fs, I'll send the trivial cleanup which just
> removes the unnecessary "flags | O_RDWR" in pidfd_prepare().
xattrs don't need FMODE_WRITE. You can use O_RDONLY fds with the
justification that it's metadata (most likely). Although I always found
that rather weird. Sending signals is technically also equivalent to
writing and I think that was the original reason this was done. If you
want to remove it then be my guest.
On Mon, 23 Feb 2026 22:39:22 +0100
Christian Brauner <brauner@kernel.org> wrote:
> On Mon, Feb 23, 2026 at 08:21:02PM +0100, Oleg Nesterov wrote:
> > On 02/23, Oleg Nesterov wrote:
> > >
> > > pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> > > makes no sense because pidfs_alloc_file() itself does
> > >
> > > flags |= O_RDWR;
> > >
> > > I was going to send the trivial cleanup, but why a pidfs file needs
> > > O_RDWR/FMODE_WRITE ?
> > >
> > > Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
> > > users, for example signalfd.c.
> >
> > perhaps an accidental legacy from 628ff7c1d8d8 ("anonfd: Allow making anon
> > files read-only") ?
>
> It was always a possibility that we would support some form of
> write-like operation eventually. And we have support for setting trusted
> extended attributes on pidfds for some time now (trusted xattrs require
> global cap_sys_admin).
>
Isn't 'sending a signal' a write-like operation?
David
On Mon, 23 Feb 2026 18:05:44 +0100
Oleg Nesterov <oleg@redhat.com> wrote:
> On 02/23, Oleg Nesterov wrote:
> >
> > Sorry for noise!
>
> Yes, but let me add more (off-topic) noise to this thread...
>
> pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> makes no sense because pidfs_alloc_file() itself does
>
> flags |= O_RDWR;
>
> I was going to send the trivial cleanup, but why a pidfs file needs
> O_RDWR/FMODE_WRITE ?
Or why any program that gets that far through the code 'wins' write access.
David
© 2016 - 2026 Red Hat, Inc.