pidfd: add CLONE_AUTOREAP, CLONE_NNP, and CLONE_PIDFD_AUTOKILL

[PATCH v5 3/6] pidfd: add CLONE_PIDFD_AUTOKILL

Posted by Christian Brauner 1 month, 1 week ago

Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's
lifetime to the pidfd returned from clone3(). When the last reference to
the struct file created by clone3() is closed the kernel sends SIGKILL
to the child. A pidfd obtained via pidfd_open() for the same process
does not keep the child alive and does not trigger autokill - only the
specific struct file from clone3() has this property.

This is useful for container runtimes, service managers, and sandboxed
subprocess execution - any scenario where the child must die if the
parent crashes or abandons the pidfd.

CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying
lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no
one to reap it would become a zombie). CLONE_THREAD is rejected because
autokill targets a process not a thread.

The clone3 pidfd is identified by the PIDFD_AUTOKILL file flag set on
the struct file at clone3() time. The pidfs .release handler checks this
flag and sends SIGKILL via do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...)
only when it is set. Files from pidfd_open() or open_by_handle_at() are
distinct struct files that do not carry this flag. dup()/fork() share the
same struct file so they extend the child's lifetime until the last
reference drops.

CLONE_PIDFD_AUTOKILL uses a privilege model based on CLONE_NNP: without
CLONE_NNP the child could escalate privileges via setuid/setgid exec
after being spawned, so the caller must have CAP_SYS_ADMIN in its user
namespace. With CLONE_NNP the child can never gain new privileges so
unprivileged usage is allowed. This is a deliberate departure from the
pdeath_signal model which is reset during secureexec and commit_creds()
rendering it useless for container runtimes that need to deprivilege
themselves.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c                 | 38 ++++++++++++++++++++++++++++++++------
 include/uapi/linux/pidfd.h |  1 +
 include/uapi/linux/sched.h |  1 +
 kernel/fork.c              | 29 ++++++++++++++++++++++++++---
 4 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 318253344b5c..a8d1bca0395d 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -8,6 +8,8 @@
 #include <linux/mount.h>
 #include <linux/pid.h>
 #include <linux/pidfs.h>
+#include <linux/sched/signal.h>
+#include <linux/signal.h>
 #include <linux/pid_namespace.h>
 #include <linux/poll.h>
 #include <linux/proc_fs.h>
@@ -637,7 +639,28 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return open_namespace(ns_common);
 }
 
+static int pidfs_file_release(struct inode *inode, struct file *file)
+{
+	struct pid *pid = inode->i_private;
+	struct task_struct *task;
+
+	if (!(file->f_flags & PIDFD_AUTOKILL))
+		return 0;
+
+	guard(rcu)();
+	task = pid_task(pid, PIDTYPE_TGID);
+	if (!task)
+		return 0;
+
+	/* Not available for kthreads or user workers for now. */
+	if (WARN_ON_ONCE(task->flags & (PF_KTHREAD | PF_USER_WORKER)))
+		return 0;
+	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID);
+	return 0;
+}
+
 static const struct file_operations pidfs_file_operations = {
+	.release	= pidfs_file_release,
 	.poll		= pidfd_poll,
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= pidfd_show_fdinfo,
@@ -1093,11 +1116,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	int ret;
 
 	/*
-	 * Ensure that PIDFD_STALE can be passed as a flag without
-	 * overloading other uapi pidfd flags.
+	 * Ensure that internal pidfd flags don't overlap with each
+	 * other or with uapi pidfd flags.
 	 */
-	BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
-	BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
+	BUILD_BUG_ON(hweight32(PIDFD_THREAD | PIDFD_NONBLOCK |
+				PIDFD_STALE | PIDFD_AUTOKILL) != 4);
 
 	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
 	if (ret < 0)
@@ -1108,9 +1131,12 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	flags &= ~PIDFD_STALE;
 	flags |= O_RDWR;
 	pidfd_file = dentry_open(&path, flags, current_cred());
-	/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
+	/*
+	 * Raise PIDFD_THREAD and PIDFD_AUTOKILL explicitly as
+	 * do_dentry_open() strips O_EXCL and O_TRUNC.
+	 */
 	if (!IS_ERR(pidfd_file))
-		pidfd_file->f_flags |= (flags & PIDFD_THREAD);
+		pidfd_file->f_flags |= (flags & (PIDFD_THREAD | PIDFD_AUTOKILL));
 
 	return pidfd_file;
 }
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index ea9a6811fc76..9281956a9f32 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -13,6 +13,7 @@
 #ifdef __KERNEL__
 #include <linux/sched.h>
 #define PIDFD_STALE CLONE_PIDFD
+#define PIDFD_AUTOKILL O_TRUNC
 #endif
 
 /* Flags for pidfd_send_signal(). */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 7b1b87473ebb..0aafb4652afc 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -37,6 +37,7 @@
 #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
 #define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
 #define CLONE_AUTOREAP 0x400000000ULL /* Auto-reap child on exit. */
+#define CLONE_PIDFD_AUTOKILL 0x800000000ULL /* Kill child when clone pidfd closes. */
 #define CLONE_NNP 0x1000000000ULL /* Set no_new_privs on child. */
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index a3202ee278d8..0f4944ce378d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2042,6 +2042,24 @@ __latent_entropy struct task_struct *copy_process(
 			return ERR_PTR(-EINVAL);
 	}
 
+	if (clone_flags & CLONE_PIDFD_AUTOKILL) {
+		if (!(clone_flags & CLONE_PIDFD))
+			return ERR_PTR(-EINVAL);
+		if (!(clone_flags & CLONE_AUTOREAP))
+			return ERR_PTR(-EINVAL);
+		if (clone_flags & CLONE_THREAD)
+			return ERR_PTR(-EINVAL);
+		/*
+		 * Without CLONE_NNP the child could escalate privileges
+		 * after being spawned, so require CAP_SYS_ADMIN.
+		 * With CLONE_NNP the child can't gain new privileges,
+		 * so allow unprivileged usage.
+		 */
+		if (!(clone_flags & CLONE_NNP) &&
+		    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+			return ERR_PTR(-EPERM);
+	}
+
 	/*
 	 * Force any signals received before this point to be delivered
 	 * before the fork happens.  Collect up signals sent to multiple
@@ -2264,13 +2282,18 @@ __latent_entropy struct task_struct *copy_process(
 	 * if the fd table isn't shared).
 	 */
 	if (clone_flags & CLONE_PIDFD) {
-		int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+		unsigned flags = PIDFD_STALE;
+
+		if (clone_flags & CLONE_THREAD)
+			flags |= PIDFD_THREAD;
+		if (clone_flags & CLONE_PIDFD_AUTOKILL)
+			flags |= PIDFD_AUTOKILL;
 
 		/*
 		 * Note that no task has been attached to @pid yet indicate
 		 * that via CLONE_PIDFD.
 		 */
-		retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
+		retval = pidfd_prepare(pid, flags, &pidfile);
 		if (retval < 0)
 			goto bad_fork_free_pid;
 		pidfd = retval;
@@ -2917,7 +2940,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
 	/* Verify that no unknown flags are passed along. */
 	if (kargs->flags &
 	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
-	      CLONE_AUTOREAP | CLONE_NNP))
+	      CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL))
 		return false;
 
 	/*

-- 
2.47.3

Re: [PATCH v5 3/6] pidfd: add CLONE_PIDFD_AUTOKILL

Posted by Jann Horn 1 month ago

On Thu, Feb 26, 2026 at 2:51 PM Christian Brauner <brauner@kernel.org> wrote:
> Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's
> lifetime to the pidfd returned from clone3(). When the last reference to
> the struct file created by clone3() is closed the kernel sends SIGKILL
> to the child. A pidfd obtained via pidfd_open() for the same process
> does not keep the child alive and does not trigger autokill - only the
> specific struct file from clone3() has this property.
>
> This is useful for container runtimes, service managers, and sandboxed
> subprocess execution - any scenario where the child must die if the
> parent crashes or abandons the pidfd.
>
> CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying
> lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no
> one to reap it would become a zombie). CLONE_THREAD is rejected because
> autokill targets a process not a thread.
>
> The clone3 pidfd is identified by the PIDFD_AUTOKILL file flag set on
> the struct file at clone3() time. The pidfs .release handler checks this
> flag and sends SIGKILL via do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...)
> only when it is set. Files from pidfd_open() or open_by_handle_at() are
> distinct struct files that do not carry this flag. dup()/fork() share the
> same struct file so they extend the child's lifetime until the last
> reference drops.
>
> CLONE_PIDFD_AUTOKILL uses a privilege model based on CLONE_NNP: without
> CLONE_NNP the child could escalate privileges via setuid/setgid exec
> after being spawned, so the caller must have CAP_SYS_ADMIN in its user
> namespace. With CLONE_NNP the child can never gain new privileges so
> unprivileged usage is allowed. This is a deliberate departure from the
> pdeath_signal model which is reset during secureexec and commit_creds()
> rendering it useless for container runtimes that need to deprivilege
> themselves.
[...]
> diff --git a/kernel/fork.c b/kernel/fork.c
> index a3202ee278d8..0f4944ce378d 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2042,6 +2042,24 @@ __latent_entropy struct task_struct *copy_process(
>                         return ERR_PTR(-EINVAL);
>         }
>
> +       if (clone_flags & CLONE_PIDFD_AUTOKILL) {
> +               if (!(clone_flags & CLONE_PIDFD))
> +                       return ERR_PTR(-EINVAL);
> +               if (!(clone_flags & CLONE_AUTOREAP))
> +                       return ERR_PTR(-EINVAL);
> +               if (clone_flags & CLONE_THREAD)
> +                       return ERR_PTR(-EINVAL);
> +               /*
> +                * Without CLONE_NNP the child could escalate privileges
> +                * after being spawned, so require CAP_SYS_ADMIN.
> +                * With CLONE_NNP the child can't gain new privileges,
> +                * so allow unprivileged usage.
> +                */
> +               if (!(clone_flags & CLONE_NNP) &&
> +                   !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> +                       return ERR_PTR(-EPERM);
> +       }

That security model looks good to me.

[PATCH v5 1/6] clone: add CLONE_AUTOREAP
[PATCH v5 2/6] clone: add CLONE_NNP
[PATCH v5 3/6] pidfd: add CLONE_PIDFD_AUTOKILL
[PATCH v5 4/6] selftests/pidfd: add CLONE_AUTOREAP tests
[PATCH v5 5/6] selftests/pidfd: add CLONE_NNP tests
[PATCH v5 6/6] selftests/pidfd: add CLONE_PIDFD_AUTOKILL tests