[PATCH v7 5/9] pidfs, coredump: add PIDFD_INFO_COREDUMP

Christian Brauner posted 9 patches 7 months, 1 week ago
There is a newer version of this series
[PATCH v7 5/9] pidfs, coredump: add PIDFD_INFO_COREDUMP
Posted by Christian Brauner 7 months, 1 week ago
Extend the PIDFD_INFO_COREDUMP ioctl() with the new PIDFD_INFO_COREDUMP
mask flag. This adds the fields @coredump_mask and @coredump_cookie to
struct pidfd_info.

When a task coredumps the kernel will provide the following information
to userspace in @coredump_mask:

* PIDFD_COREDUMPED is raised if the task did actually coredump.
* PIDFD_COREDUMP_SKIP is raised if the task skipped coredumping (e.g.,
  undumpable).
* PIDFD_COREDUMP_USER is raised if this is a regular coredump and
  doesn't need special care by the coredump server.
* PIDFD_COREDUMP_ROOT is raised if the generated coredump should be
  treated as sensitive and the coredump server should restrict to the
  generated coredump to sufficiently privileged users.

If userspace uses the coredump socket to process coredumps it needs to
be able to discern connection from the kernel from connects from
userspace (e.g., Python generating it's own coredumps and forwarding
them to systemd). The @coredump_cookie extension uses the SO_COOKIE of
the new connection. This allows userspace to validate that the
connection has been made from the kernel by a crashing task:

   fd_coredump = accept4(fd_socket, NULL, NULL, SOCK_CLOEXEC);
   getsockopt(fd_coredump, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, &fd_peer_pidfd_len);

   struct pidfd_info info = {
           info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP,
   };

   ioctl(pidfd, PIDFD_GET_INFO, &info);
   /* Refuse connections that aren't from a crashing task. */
   if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED) )
           close(fd_coredump);

   /*
    * Make sure that the coredump cookie matches the connection cookie.
    * If they don't it's not the coredump connection from the kernel.
    * We'll get another connection request in a bit.
    */
   getsocketop(fd_coredump, SOL_SOCKET, SO_COOKIE, &peer_cookie, &peer_cookie_len);
   if (!info.coredump_cookie || (info.coredump_cookie != peer_cookie))
           close(fd_coredump);

The kernel guarantees that by the time the connection is made the all
PIDFD_INFO_COREDUMP info is available.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/coredump.c              | 34 ++++++++++++++++++++
 fs/pidfs.c                 | 79 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pidfs.h      | 10 ++++++
 include/uapi/linux/pidfd.h | 22 +++++++++++++
 net/unix/af_unix.c         |  7 ++++
 5 files changed, 152 insertions(+)

diff --git a/fs/coredump.c b/fs/coredump.c
index e1256ebb89c1..bfc4a32f737c 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -46,7 +46,9 @@
 #include <linux/pidfs.h>
 #include <linux/net.h>
 #include <linux/socket.h>
+#include <net/af_unix.h>
 #include <net/net_namespace.h>
+#include <net/sock.h>
 #include <uapi/linux/pidfd.h>
 #include <uapi/linux/un.h>
 
@@ -598,6 +600,8 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
 		if (IS_ERR(pidfs_file))
 			return PTR_ERR(pidfs_file);
 
+		pidfs_coredump(cp);
+
 		/*
 		 * Usermode helpers are childen of either
 		 * system_unbound_wq or of kthreadd. So we know that
@@ -876,8 +880,34 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 			goto close_fail;
 		}
 
+		/*
+		 * Set the thread-group leader pid which is used for the
+		 * peer credentials during connect() below. Then
+		 * immediately register it in pidfs...
+		 */
+		cprm.pid = task_tgid(current);
+		retval = pidfs_register_pid(cprm.pid);
+		if (retval) {
+			sock_release(socket);
+			goto close_fail;
+		}
+
+		/*
+		 * ... and set the coredump information so userspace
+		 * has it available after connect()...
+		 */
+		pidfs_coredump(&cprm);
+
+		/*
+		 * ... On connect() the peer credentials are recorded
+		 * and @cprm.pid registered in pidfs...
+		 */
 		retval = kernel_connect(socket, (struct sockaddr *)(&addr),
 					addr_len, O_NONBLOCK | SOCK_COREDUMP);
+
+		/* ... So we can safely put our pidfs reference now... */
+		pidfs_put_pid(cprm.pid);
+
 		if (retval) {
 			if (retval == -EAGAIN)
 				coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
@@ -886,6 +916,10 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 			goto close_fail;
 		}
 
+		/* ... and validate that @sk_peer_pid matches @cprm.pid. */
+		if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm.pid))
+			goto close_fail;
+
 		cprm.limit = RLIM_INFINITY;
 		cprm.file = no_free_ptr(file);
 #else
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 3b39e471840b..d7b9a0dd2db6 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -20,6 +20,7 @@
 #include <linux/time_namespace.h>
 #include <linux/utsname.h>
 #include <net/net_namespace.h>
+#include <linux/coredump.h>
 
 #include "internal.h"
 #include "mount.h"
@@ -33,6 +34,8 @@ static struct kmem_cache *pidfs_cachep __ro_after_init;
 struct pidfs_exit_info {
 	__u64 cgroupid;
 	__s32 exit_code;
+	__u32 coredump_mask;
+	__u64 coredump_cookie;
 };
 
 struct pidfs_inode {
@@ -240,6 +243,22 @@ static inline bool pid_in_current_pidns(const struct pid *pid)
 	return false;
 }
 
+static __u32 pidfs_coredump_mask(unsigned long mm_flags)
+{
+	switch (__get_dumpable(mm_flags)) {
+	case SUID_DUMP_USER:
+		return PIDFD_COREDUMP_USER;
+	case SUID_DUMP_ROOT:
+		return PIDFD_COREDUMP_ROOT;
+	case SUID_DUMP_DISABLE:
+		return PIDFD_COREDUMP_SKIP;
+	default:
+		WARN_ON_ONCE(true);
+	}
+
+	return 0;
+}
+
 static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
@@ -280,6 +299,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 		}
 	}
 
+	if (mask & PIDFD_INFO_COREDUMP) {
+		kinfo.mask |= PIDFD_INFO_COREDUMP;
+		smp_rmb();
+		kinfo.coredump_cookie = READ_ONCE(pidfs_i(inode)->__pei.coredump_cookie);
+		kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+	}
+
 	task = get_pid_task(pid, PIDTYPE_PID);
 	if (!task) {
 		/*
@@ -296,6 +322,16 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 	if (!c)
 		return -ESRCH;
 
+	if (!(kinfo.mask & PIDFD_INFO_COREDUMP)) {
+		task_lock(task);
+		if (task->mm) {
+			smp_rmb();
+			kinfo.coredump_cookie = READ_ONCE(pidfs_i(inode)->__pei.coredump_cookie);
+			kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags);
+		}
+		task_unlock(task);
+	}
+
 	/* Unconditionally return identifiers and credentials, the rest only on request */
 
 	user_ns = current_user_ns();
@@ -559,6 +595,49 @@ void pidfs_exit(struct task_struct *tsk)
 	}
 }
 
+#if defined(CONFIG_COREDUMP) && defined(CONFIG_UNIX)
+void pidfs_coredump_cookie(struct pid *pid, u64 coredump_cookie)
+{
+	struct pidfs_exit_info *exit_info;
+	struct dentry *dentry = pid->stashed;
+	struct inode *inode;
+
+	if (WARN_ON_ONCE(!dentry))
+		return;
+
+	inode = d_inode(dentry);
+	exit_info = &pidfs_i(inode)->__pei;
+	/* Can't use smp_store_release() because of 32bit. */
+	smp_wmb();
+	WRITE_ONCE(exit_info->coredump_cookie, coredump_cookie);
+}
+#endif
+
+#ifdef CONFIG_COREDUMP
+void pidfs_coredump(const struct coredump_params *cprm)
+{
+	struct pid *pid = cprm->pid;
+	struct pidfs_exit_info *exit_info;
+	struct dentry *dentry;
+	struct inode *inode;
+	__u32 coredump_mask = 0;
+
+	dentry = pid->stashed;
+	if (WARN_ON_ONCE(!dentry))
+		return;
+
+	inode = d_inode(dentry);
+	exit_info = &pidfs_i(inode)->__pei;
+	/* Note how we were coredumped. */
+	coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
+	/* Note that we actually did coredump. */
+	coredump_mask |= PIDFD_COREDUMPED;
+	/* If coredumping is set to skip we should never end up here. */
+	VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
+	smp_store_release(&exit_info->coredump_mask, coredump_mask);
+}
+#endif
+
 static struct vfsmount *pidfs_mnt __ro_after_init;
 
 /*
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 2676890c4d0d..497997bc5e34 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -2,11 +2,21 @@
 #ifndef _LINUX_PID_FS_H
 #define _LINUX_PID_FS_H
 
+struct coredump_params;
+
 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
 void __init pidfs_init(void);
 void pidfs_add_pid(struct pid *pid);
 void pidfs_remove_pid(struct pid *pid);
 void pidfs_exit(struct task_struct *tsk);
+#ifdef CONFIG_COREDUMP
+void pidfs_coredump(const struct coredump_params *cprm);
+#endif
+#if defined(CONFIG_COREDUMP) && defined(CONFIG_UNIX)
+void pidfs_coredump_cookie(struct pid *pid, u64 coredump_cookie);
+#elif defined(CONFIG_UNIX)
+static inline void pidfs_coredump_cookie(struct pid *pid, u64 coredump_cookie) { }
+#endif
 extern const struct dentry_operations pidfs_dentry_operations;
 int pidfs_register_pid(struct pid *pid);
 void pidfs_get_pid(struct pid *pid);
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 8c1511edd0e9..69267c5ae6d0 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -25,9 +25,28 @@
 #define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
 #define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
 #define PIDFD_INFO_EXIT			(1UL << 3) /* Only returned if requested. */
+#define PIDFD_INFO_COREDUMP		(1UL << 4) /* Only returned if requested. */
 
 #define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
 
+/*
+ * Values for @coredump_mask in pidfd_info.
+ * Only valid if PIDFD_INFO_COREDUMP is set in @mask.
+ *
+ * Note, the @PIDFD_COREDUMP_ROOT flag indicates that the generated
+ * coredump should be treated as sensitive and access should only be
+ * granted to privileged users.
+ *
+ * If the coredump AF_UNIX socket is used for processing coredumps
+ * @coredump_cookie will be set to the socket SO_COOKIE of the receivers
+ * client socket. This allows the coredump handler to detect whether an
+ * incoming coredump connection was initiated from the crashing task.
+ */
+#define PIDFD_COREDUMPED	(1U << 0) /* Did crash and... */
+#define PIDFD_COREDUMP_SKIP	(1U << 1) /* coredumping generation was skipped. */
+#define PIDFD_COREDUMP_USER	(1U << 2) /* coredump was done as the user. */
+#define PIDFD_COREDUMP_ROOT	(1U << 3) /* coredump was done as root. */
+
 /*
  * The concept of process and threads in userland and the kernel is a confusing
  * one - within the kernel every thread is a 'task' with its own individual PID,
@@ -92,6 +111,9 @@ struct pidfd_info {
 	__u32 fsuid;
 	__u32 fsgid;
 	__s32 exit_code;
+	__u32 coredump_mask;
+	__u32 __spare1;
+	__u64 coredump_cookie;
 };
 
 #define PIDFS_IOCTL_MAGIC 0xFF
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index a9d1c9ba2961..053d2e48e918 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -99,6 +99,7 @@
 #include <linux/seq_file.h>
 #include <linux/skbuff.h>
 #include <linux/slab.h>
+#include <linux/sock_diag.h>
 #include <linux/socket.h>
 #include <linux/splice.h>
 #include <linux/string.h>
@@ -742,6 +743,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
 
 struct unix_peercred {
 	struct pid *peer_pid;
+	u64 cookie;
 	const struct cred *peer_cred;
 };
 
@@ -777,6 +779,8 @@ static void drop_peercred(struct unix_peercred *peercred)
 static inline void init_peercred(struct sock *sk,
 				 const struct unix_peercred *peercred)
 {
+	if (peercred->cookie)
+		pidfs_coredump_cookie(peercred->peer_pid, peercred->cookie);
 	sk->sk_peer_pid = peercred->peer_pid;
 	sk->sk_peer_cred = peercred->peer_cred;
 }
@@ -1713,6 +1717,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	unix_peer(newsk)	= sk;
 	newsk->sk_state		= TCP_ESTABLISHED;
 	newsk->sk_type		= sk->sk_type;
+	/* Prepare a new socket cookie for the receiver. */
+	if (flags & SOCK_COREDUMP)
+		peercred.cookie = sock_gen_cookie(newsk);
 	init_peercred(newsk, &peercred);
 	newu = unix_sk(newsk);
 	newu->listener = other;

-- 
2.47.2
Re: [PATCH v7 5/9] pidfs, coredump: add PIDFD_INFO_COREDUMP
Posted by Jann Horn 7 months, 1 week ago
On Thu, May 15, 2025 at 12:04 AM Christian Brauner <brauner@kernel.org> wrote:
> Extend the PIDFD_INFO_COREDUMP ioctl() with the new PIDFD_INFO_COREDUMP
> mask flag. This adds the fields @coredump_mask and @coredump_cookie to
> struct pidfd_info.

FWIW, now that you're using path-based sockets and override_creds(),
one option may be to drop this patch and say "if you don't want
untrusted processes to directly connect to the coredumping socket,
just set the listening socket to mode 0000 or mode 0600"...

> Signed-off-by: Christian Brauner <brauner@kernel.org>
[...]
> diff --git a/fs/coredump.c b/fs/coredump.c
> index e1256ebb89c1..bfc4a32f737c 100644
> --- a/fs/coredump.c
> +++ b/fs/coredump.c
[...]
> @@ -876,8 +880,34 @@ void do_coredump(const kernel_siginfo_t *siginfo)
>                         goto close_fail;
>                 }
>
> +               /*
> +                * Set the thread-group leader pid which is used for the
> +                * peer credentials during connect() below. Then
> +                * immediately register it in pidfs...
> +                */
> +               cprm.pid = task_tgid(current);
> +               retval = pidfs_register_pid(cprm.pid);
> +               if (retval) {
> +                       sock_release(socket);
> +                       goto close_fail;
> +               }
> +
> +               /*
> +                * ... and set the coredump information so userspace
> +                * has it available after connect()...
> +                */
> +               pidfs_coredump(&cprm);
> +
> +               /*
> +                * ... On connect() the peer credentials are recorded
> +                * and @cprm.pid registered in pidfs...

I don't understand this comment. Wasn't "@cprm.pid registered in
pidfs" above with the explicit `pidfs_register_pid(cprm.pid)`?

> +                */
>                 retval = kernel_connect(socket, (struct sockaddr *)(&addr),
>                                         addr_len, O_NONBLOCK | SOCK_COREDUMP);
> +
> +               /* ... So we can safely put our pidfs reference now... */
> +               pidfs_put_pid(cprm.pid);

Why can we safely put the pidfs reference now but couldn't do it
before the kernel_connect()? Does the kernel_connect() look up this
pidfs entry by calling something like pidfs_alloc_file()? Or does that
only happen later on, when the peer does getsockopt(SO_PEERPIDFD)?

>                 if (retval) {
>                         if (retval == -EAGAIN)
>                                 coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
[...]
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 3b39e471840b..d7b9a0dd2db6 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
[...]
> @@ -280,6 +299,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
>                 }
>         }
>
> +       if (mask & PIDFD_INFO_COREDUMP) {
> +               kinfo.mask |= PIDFD_INFO_COREDUMP;
> +               smp_rmb();

I assume I would regret it if I asked what these barriers are for,
because the answer is something terrifying about how we otherwise
don't have a guarantee that memory accesses can't be reordered between
multiple subsequent syscalls or something like that?

checkpatch complains about the lack of comments on these memory barriers.

> +               kinfo.coredump_cookie = READ_ONCE(pidfs_i(inode)->__pei.coredump_cookie);
> +               kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
> +       }
> +
>         task = get_pid_task(pid, PIDTYPE_PID);
>         if (!task) {
>                 /*
[...]
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index a9d1c9ba2961..053d2e48e918 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
[...]
> @@ -742,6 +743,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
>
>  struct unix_peercred {
>         struct pid *peer_pid;
> +       u64 cookie;

Maybe add a comment here documenting that for now, this is assumed to
be used exclusively for coredump sockets.


>         const struct cred *peer_cred;
>  };
>
Re: [PATCH v7 5/9] pidfs, coredump: add PIDFD_INFO_COREDUMP
Posted by Christian Brauner 7 months, 1 week ago
On Thu, May 15, 2025 at 10:56:26PM +0200, Jann Horn wrote:
> On Thu, May 15, 2025 at 12:04 AM Christian Brauner <brauner@kernel.org> wrote:
> > Extend the PIDFD_INFO_COREDUMP ioctl() with the new PIDFD_INFO_COREDUMP
> > mask flag. This adds the fields @coredump_mask and @coredump_cookie to
> > struct pidfd_info.
> 
> FWIW, now that you're using path-based sockets and override_creds(),
> one option may be to drop this patch and say "if you don't want
> untrusted processes to directly connect to the coredumping socket,
> just set the listening socket to mode 0000 or mode 0600"...
> 
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> [...]
> > diff --git a/fs/coredump.c b/fs/coredump.c
> > index e1256ebb89c1..bfc4a32f737c 100644
> > --- a/fs/coredump.c
> > +++ b/fs/coredump.c
> [...]
> > @@ -876,8 +880,34 @@ void do_coredump(const kernel_siginfo_t *siginfo)
> >                         goto close_fail;
> >                 }
> >
> > +               /*
> > +                * Set the thread-group leader pid which is used for the
> > +                * peer credentials during connect() below. Then
> > +                * immediately register it in pidfs...
> > +                */
> > +               cprm.pid = task_tgid(current);
> > +               retval = pidfs_register_pid(cprm.pid);
> > +               if (retval) {
> > +                       sock_release(socket);
> > +                       goto close_fail;
> > +               }
> > +
> > +               /*
> > +                * ... and set the coredump information so userspace
> > +                * has it available after connect()...
> > +                */
> > +               pidfs_coredump(&cprm);
> > +
> > +               /*
> > +                * ... On connect() the peer credentials are recorded
> > +                * and @cprm.pid registered in pidfs...
> 
> I don't understand this comment. Wasn't "@cprm.pid registered in
> pidfs" above with the explicit `pidfs_register_pid(cprm.pid)`?

I'll answer both questions in one go below...

> 
> > +                */
> >                 retval = kernel_connect(socket, (struct sockaddr *)(&addr),
> >                                         addr_len, O_NONBLOCK | SOCK_COREDUMP);
> > +
> > +               /* ... So we can safely put our pidfs reference now... */
> > +               pidfs_put_pid(cprm.pid);
> 
> Why can we safely put the pidfs reference now but couldn't do it
> before the kernel_connect()? Does the kernel_connect() look up this
> pidfs entry by calling something like pidfs_alloc_file()? Or does that
> only happen later on, when the peer does getsockopt(SO_PEERPIDFD)?

AF_UNIX sockets support SO_PEERPIDFD as you know. Users such as dbus or
systemd want to be able to retrieve a pidfd for the peer even if the
peer has already been reaped. To support this AF_UNIX ensures that when
the peer credentials are set up (connect(), listen()) the corresponding
@pid will also be registered in pidfs. This ensures that exit
information is stored in the inode if we hand out a pidfd for a reaped
task. IOW, we only hand out pidfds for reaped task if at the time of
reaping a pidfs entry existed for it.

Since we're setting coredump information on the pidfd here we're calling
pidfs_register_pid() even before connect() sets up the peer credentials
so we're sure that the coredump information is stored in the inode.

Then we delay our pidfs_put_pid() call until the connect() took it's own
reference and thus continues pinning the inode. IOW, connect() will also
call pidfs_register_pid() but it will ofc just increment the reference
count ensuring that our pidfs_put_pid() doesn't drop the inode.

If we immediately did a pidfs_put_pid() before connect() we'd loose the
coredump information.

> 
> >                 if (retval) {
> >                         if (retval == -EAGAIN)
> >                                 coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
> [...]
> > diff --git a/fs/pidfs.c b/fs/pidfs.c
> > index 3b39e471840b..d7b9a0dd2db6 100644
> > --- a/fs/pidfs.c
> > +++ b/fs/pidfs.c
> [...]
> > @@ -280,6 +299,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
> >                 }
> >         }
> >
> > +       if (mask & PIDFD_INFO_COREDUMP) {
> > +               kinfo.mask |= PIDFD_INFO_COREDUMP;
> > +               smp_rmb();
> 
> I assume I would regret it if I asked what these barriers are for,
> because the answer is something terrifying about how we otherwise
> don't have a guarantee that memory accesses can't be reordered between
> multiple subsequent syscalls or something like that?

No, not really. It's just so that when someone calls PIDFD_GET_INFO with
PIDFD_INFO_COREDUMP but one gotten from the coredump socket that they
don't see half-initialized information. I can just use WRITE_ONCE() for
that.

> 
> checkpatch complains about the lack of comments on these memory barriers.

I'll just use WRITE_ONCE().

> 
> > +               kinfo.coredump_cookie = READ_ONCE(pidfs_i(inode)->__pei.coredump_cookie);
> > +               kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
> > +       }
> > +
> >         task = get_pid_task(pid, PIDTYPE_PID);
> >         if (!task) {
> >                 /*
> [...]
> > diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> > index a9d1c9ba2961..053d2e48e918 100644
> > --- a/net/unix/af_unix.c
> > +++ b/net/unix/af_unix.c
> [...]
> > @@ -742,6 +743,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
> >
> >  struct unix_peercred {
> >         struct pid *peer_pid;
> > +       u64 cookie;
> 
> Maybe add a comment here documenting that for now, this is assumed to
> be used exclusively for coredump sockets.

I think we should just drop it.
Re: [PATCH v7 5/9] pidfs, coredump: add PIDFD_INFO_COREDUMP
Posted by Jann Horn 7 months, 1 week ago
On Fri, May 16, 2025 at 12:34 PM Christian Brauner <brauner@kernel.org> wrote:
> On Thu, May 15, 2025 at 10:56:26PM +0200, Jann Horn wrote:
> > Why can we safely put the pidfs reference now but couldn't do it
> > before the kernel_connect()? Does the kernel_connect() look up this
> > pidfs entry by calling something like pidfs_alloc_file()? Or does that
> > only happen later on, when the peer does getsockopt(SO_PEERPIDFD)?
>
> AF_UNIX sockets support SO_PEERPIDFD as you know. Users such as dbus or
> systemd want to be able to retrieve a pidfd for the peer even if the
> peer has already been reaped. To support this AF_UNIX ensures that when
> the peer credentials are set up (connect(), listen()) the corresponding
> @pid will also be registered in pidfs. This ensures that exit
> information is stored in the inode if we hand out a pidfd for a reaped
> task. IOW, we only hand out pidfds for reaped task if at the time of
> reaping a pidfs entry existed for it.
>
> Since we're setting coredump information on the pidfd here we're calling
> pidfs_register_pid() even before connect() sets up the peer credentials
> so we're sure that the coredump information is stored in the inode.
>
> Then we delay our pidfs_put_pid() call until the connect() took it's own
> reference and thus continues pinning the inode. IOW, connect() will also
> call pidfs_register_pid() but it will ofc just increment the reference
> count ensuring that our pidfs_put_pid() doesn't drop the inode.

Aah, so the call graph looks like this:

unix_stream_connect
  prepare_peercred
    pidfs_register_pid
      [pidfs reference taken]
  [point of no return]
  init_peercred
    [copies creds to socket, moving ref ownership]
  copy_peercred
    [copies creds from socket to peer socket, taking refs]

Thanks for explaining!
Re: [PATCH v7 5/9] pidfs, coredump: add PIDFD_INFO_COREDUMP
Posted by Jann Horn 7 months, 1 week ago
On Thu, May 15, 2025 at 10:56 PM Jann Horn <jannh@google.com> wrote:
> On Thu, May 15, 2025 at 12:04 AM Christian Brauner <brauner@kernel.org> wrote:
> > Extend the PIDFD_INFO_COREDUMP ioctl() with the new PIDFD_INFO_COREDUMP
> > mask flag. This adds the fields @coredump_mask and @coredump_cookie to
> > struct pidfd_info.
>
> FWIW, now that you're using path-based sockets and override_creds(),
> one option may be to drop this patch and say "if you don't want
> untrusted processes to directly connect to the coredumping socket,
> just set the listening socket to mode 0000 or mode 0600"...

Er, forget I said that, of course we'd still want to have at least the
@coredump_mask.

> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> [...]
> > diff --git a/fs/coredump.c b/fs/coredump.c
> > index e1256ebb89c1..bfc4a32f737c 100644
> > --- a/fs/coredump.c
> > +++ b/fs/coredump.c
> [...]
> > @@ -876,8 +880,34 @@ void do_coredump(const kernel_siginfo_t *siginfo)
> >                         goto close_fail;
> >                 }
> >
> > +               /*
> > +                * Set the thread-group leader pid which is used for the
> > +                * peer credentials during connect() below. Then
> > +                * immediately register it in pidfs...
> > +                */
> > +               cprm.pid = task_tgid(current);
> > +               retval = pidfs_register_pid(cprm.pid);
> > +               if (retval) {
> > +                       sock_release(socket);
> > +                       goto close_fail;
> > +               }
> > +
> > +               /*
> > +                * ... and set the coredump information so userspace
> > +                * has it available after connect()...
> > +                */
> > +               pidfs_coredump(&cprm);
> > +
> > +               /*
> > +                * ... On connect() the peer credentials are recorded
> > +                * and @cprm.pid registered in pidfs...
>
> I don't understand this comment. Wasn't "@cprm.pid registered in
> pidfs" above with the explicit `pidfs_register_pid(cprm.pid)`?
>
> > +                */
> >                 retval = kernel_connect(socket, (struct sockaddr *)(&addr),
> >                                         addr_len, O_NONBLOCK | SOCK_COREDUMP);
> > +
> > +               /* ... So we can safely put our pidfs reference now... */
> > +               pidfs_put_pid(cprm.pid);
>
> Why can we safely put the pidfs reference now but couldn't do it
> before the kernel_connect()? Does the kernel_connect() look up this
> pidfs entry by calling something like pidfs_alloc_file()? Or does that
> only happen later on, when the peer does getsockopt(SO_PEERPIDFD)?
>
> >                 if (retval) {
> >                         if (retval == -EAGAIN)
> >                                 coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
> [...]
> > diff --git a/fs/pidfs.c b/fs/pidfs.c
> > index 3b39e471840b..d7b9a0dd2db6 100644
> > --- a/fs/pidfs.c
> > +++ b/fs/pidfs.c
> [...]
> > @@ -280,6 +299,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
> >                 }
> >         }
> >
> > +       if (mask & PIDFD_INFO_COREDUMP) {
> > +               kinfo.mask |= PIDFD_INFO_COREDUMP;
> > +               smp_rmb();
>
> I assume I would regret it if I asked what these barriers are for,
> because the answer is something terrifying about how we otherwise
> don't have a guarantee that memory accesses can't be reordered between
> multiple subsequent syscalls or something like that?
>
> checkpatch complains about the lack of comments on these memory barriers.
>
> > +               kinfo.coredump_cookie = READ_ONCE(pidfs_i(inode)->__pei.coredump_cookie);
> > +               kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
> > +       }
> > +
> >         task = get_pid_task(pid, PIDTYPE_PID);
> >         if (!task) {
> >                 /*
> [...]
> > diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> > index a9d1c9ba2961..053d2e48e918 100644
> > --- a/net/unix/af_unix.c
> > +++ b/net/unix/af_unix.c
> [...]
> > @@ -742,6 +743,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
> >
> >  struct unix_peercred {
> >         struct pid *peer_pid;
> > +       u64 cookie;
>
> Maybe add a comment here documenting that for now, this is assumed to
> be used exclusively for coredump sockets.
>
>
> >         const struct cred *peer_cred;
> >  };
> >
Re: [PATCH v7 5/9] pidfs, coredump: add PIDFD_INFO_COREDUMP
Posted by Alexander Mikhalitsyn 7 months, 1 week ago
Am Do., 15. Mai 2025 um 00:04 Uhr schrieb Christian Brauner
<brauner@kernel.org>:
>
> Extend the PIDFD_INFO_COREDUMP ioctl() with the new PIDFD_INFO_COREDUMP
> mask flag. This adds the fields @coredump_mask and @coredump_cookie to
> struct pidfd_info.
>
> When a task coredumps the kernel will provide the following information
> to userspace in @coredump_mask:
>
> * PIDFD_COREDUMPED is raised if the task did actually coredump.
> * PIDFD_COREDUMP_SKIP is raised if the task skipped coredumping (e.g.,
>   undumpable).
> * PIDFD_COREDUMP_USER is raised if this is a regular coredump and
>   doesn't need special care by the coredump server.
> * PIDFD_COREDUMP_ROOT is raised if the generated coredump should be
>   treated as sensitive and the coredump server should restrict to the
>   generated coredump to sufficiently privileged users.
>
> If userspace uses the coredump socket to process coredumps it needs to
> be able to discern connection from the kernel from connects from
> userspace (e.g., Python generating it's own coredumps and forwarding
> them to systemd). The @coredump_cookie extension uses the SO_COOKIE of
> the new connection. This allows userspace to validate that the
> connection has been made from the kernel by a crashing task:
>
>    fd_coredump = accept4(fd_socket, NULL, NULL, SOCK_CLOEXEC);
>    getsockopt(fd_coredump, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, &fd_peer_pidfd_len);
>
>    struct pidfd_info info = {
>            info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP,
>    };
>
>    ioctl(pidfd, PIDFD_GET_INFO, &info);
>    /* Refuse connections that aren't from a crashing task. */
>    if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED) )
>            close(fd_coredump);
>
>    /*
>     * Make sure that the coredump cookie matches the connection cookie.
>     * If they don't it's not the coredump connection from the kernel.
>     * We'll get another connection request in a bit.
>     */
>    getsocketop(fd_coredump, SOL_SOCKET, SO_COOKIE, &peer_cookie, &peer_cookie_len);
>    if (!info.coredump_cookie || (info.coredump_cookie != peer_cookie))
>            close(fd_coredump);
>
> The kernel guarantees that by the time the connection is made the all
> PIDFD_INFO_COREDUMP info is available.
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>

Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>

> ---
>  fs/coredump.c              | 34 ++++++++++++++++++++
>  fs/pidfs.c                 | 79 ++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/pidfs.h      | 10 ++++++
>  include/uapi/linux/pidfd.h | 22 +++++++++++++
>  net/unix/af_unix.c         |  7 ++++
>  5 files changed, 152 insertions(+)
>
> diff --git a/fs/coredump.c b/fs/coredump.c
> index e1256ebb89c1..bfc4a32f737c 100644
> --- a/fs/coredump.c
> +++ b/fs/coredump.c
> @@ -46,7 +46,9 @@
>  #include <linux/pidfs.h>
>  #include <linux/net.h>
>  #include <linux/socket.h>
> +#include <net/af_unix.h>
>  #include <net/net_namespace.h>
> +#include <net/sock.h>
>  #include <uapi/linux/pidfd.h>
>  #include <uapi/linux/un.h>
>
> @@ -598,6 +600,8 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
>                 if (IS_ERR(pidfs_file))
>                         return PTR_ERR(pidfs_file);
>
> +               pidfs_coredump(cp);
> +
>                 /*
>                  * Usermode helpers are childen of either
>                  * system_unbound_wq or of kthreadd. So we know that
> @@ -876,8 +880,34 @@ void do_coredump(const kernel_siginfo_t *siginfo)
>                         goto close_fail;
>                 }
>
> +               /*
> +                * Set the thread-group leader pid which is used for the
> +                * peer credentials during connect() below. Then
> +                * immediately register it in pidfs...
> +                */
> +               cprm.pid = task_tgid(current);
> +               retval = pidfs_register_pid(cprm.pid);
> +               if (retval) {
> +                       sock_release(socket);
> +                       goto close_fail;
> +               }
> +
> +               /*
> +                * ... and set the coredump information so userspace
> +                * has it available after connect()...
> +                */
> +               pidfs_coredump(&cprm);
> +
> +               /*
> +                * ... On connect() the peer credentials are recorded
> +                * and @cprm.pid registered in pidfs...
> +                */
>                 retval = kernel_connect(socket, (struct sockaddr *)(&addr),
>                                         addr_len, O_NONBLOCK | SOCK_COREDUMP);
> +
> +               /* ... So we can safely put our pidfs reference now... */
> +               pidfs_put_pid(cprm.pid);
> +
>                 if (retval) {
>                         if (retval == -EAGAIN)
>                                 coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
> @@ -886,6 +916,10 @@ void do_coredump(const kernel_siginfo_t *siginfo)
>                         goto close_fail;
>                 }
>
> +               /* ... and validate that @sk_peer_pid matches @cprm.pid. */
> +               if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm.pid))
> +                       goto close_fail;
> +
>                 cprm.limit = RLIM_INFINITY;
>                 cprm.file = no_free_ptr(file);
>  #else
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 3b39e471840b..d7b9a0dd2db6 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -20,6 +20,7 @@
>  #include <linux/time_namespace.h>
>  #include <linux/utsname.h>
>  #include <net/net_namespace.h>
> +#include <linux/coredump.h>
>
>  #include "internal.h"
>  #include "mount.h"
> @@ -33,6 +34,8 @@ static struct kmem_cache *pidfs_cachep __ro_after_init;
>  struct pidfs_exit_info {
>         __u64 cgroupid;
>         __s32 exit_code;
> +       __u32 coredump_mask;
> +       __u64 coredump_cookie;
>  };
>
>  struct pidfs_inode {
> @@ -240,6 +243,22 @@ static inline bool pid_in_current_pidns(const struct pid *pid)
>         return false;
>  }
>
> +static __u32 pidfs_coredump_mask(unsigned long mm_flags)
> +{
> +       switch (__get_dumpable(mm_flags)) {
> +       case SUID_DUMP_USER:
> +               return PIDFD_COREDUMP_USER;
> +       case SUID_DUMP_ROOT:
> +               return PIDFD_COREDUMP_ROOT;
> +       case SUID_DUMP_DISABLE:
> +               return PIDFD_COREDUMP_SKIP;
> +       default:
> +               WARN_ON_ONCE(true);
> +       }
> +
> +       return 0;
> +}
> +
>  static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
>  {
>         struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
> @@ -280,6 +299,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
>                 }
>         }
>
> +       if (mask & PIDFD_INFO_COREDUMP) {
> +               kinfo.mask |= PIDFD_INFO_COREDUMP;
> +               smp_rmb();
> +               kinfo.coredump_cookie = READ_ONCE(pidfs_i(inode)->__pei.coredump_cookie);
> +               kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
> +       }
> +
>         task = get_pid_task(pid, PIDTYPE_PID);
>         if (!task) {
>                 /*
> @@ -296,6 +322,16 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
>         if (!c)
>                 return -ESRCH;
>
> +       if (!(kinfo.mask & PIDFD_INFO_COREDUMP)) {
> +               task_lock(task);
> +               if (task->mm) {
> +                       smp_rmb();
> +                       kinfo.coredump_cookie = READ_ONCE(pidfs_i(inode)->__pei.coredump_cookie);
> +                       kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags);
> +               }
> +               task_unlock(task);
> +       }
> +
>         /* Unconditionally return identifiers and credentials, the rest only on request */
>
>         user_ns = current_user_ns();
> @@ -559,6 +595,49 @@ void pidfs_exit(struct task_struct *tsk)
>         }
>  }
>
> +#if defined(CONFIG_COREDUMP) && defined(CONFIG_UNIX)
> +void pidfs_coredump_cookie(struct pid *pid, u64 coredump_cookie)
> +{
> +       struct pidfs_exit_info *exit_info;
> +       struct dentry *dentry = pid->stashed;
> +       struct inode *inode;
> +
> +       if (WARN_ON_ONCE(!dentry))
> +               return;
> +
> +       inode = d_inode(dentry);
> +       exit_info = &pidfs_i(inode)->__pei;
> +       /* Can't use smp_store_release() because of 32bit. */
> +       smp_wmb();
> +       WRITE_ONCE(exit_info->coredump_cookie, coredump_cookie);
> +}
> +#endif
> +
> +#ifdef CONFIG_COREDUMP
> +void pidfs_coredump(const struct coredump_params *cprm)
> +{
> +       struct pid *pid = cprm->pid;
> +       struct pidfs_exit_info *exit_info;
> +       struct dentry *dentry;
> +       struct inode *inode;
> +       __u32 coredump_mask = 0;
> +
> +       dentry = pid->stashed;
> +       if (WARN_ON_ONCE(!dentry))
> +               return;
> +
> +       inode = d_inode(dentry);
> +       exit_info = &pidfs_i(inode)->__pei;
> +       /* Note how we were coredumped. */
> +       coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
> +       /* Note that we actually did coredump. */
> +       coredump_mask |= PIDFD_COREDUMPED;
> +       /* If coredumping is set to skip we should never end up here. */
> +       VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
> +       smp_store_release(&exit_info->coredump_mask, coredump_mask);
> +}
> +#endif
> +
>  static struct vfsmount *pidfs_mnt __ro_after_init;
>
>  /*
> diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
> index 2676890c4d0d..497997bc5e34 100644
> --- a/include/linux/pidfs.h
> +++ b/include/linux/pidfs.h
> @@ -2,11 +2,21 @@
>  #ifndef _LINUX_PID_FS_H
>  #define _LINUX_PID_FS_H
>
> +struct coredump_params;
> +
>  struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
>  void __init pidfs_init(void);
>  void pidfs_add_pid(struct pid *pid);
>  void pidfs_remove_pid(struct pid *pid);
>  void pidfs_exit(struct task_struct *tsk);
> +#ifdef CONFIG_COREDUMP
> +void pidfs_coredump(const struct coredump_params *cprm);
> +#endif
> +#if defined(CONFIG_COREDUMP) && defined(CONFIG_UNIX)
> +void pidfs_coredump_cookie(struct pid *pid, u64 coredump_cookie);
> +#elif defined(CONFIG_UNIX)
> +static inline void pidfs_coredump_cookie(struct pid *pid, u64 coredump_cookie) { }
> +#endif
>  extern const struct dentry_operations pidfs_dentry_operations;
>  int pidfs_register_pid(struct pid *pid);
>  void pidfs_get_pid(struct pid *pid);
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> index 8c1511edd0e9..69267c5ae6d0 100644
> --- a/include/uapi/linux/pidfd.h
> +++ b/include/uapi/linux/pidfd.h
> @@ -25,9 +25,28 @@
>  #define PIDFD_INFO_CREDS               (1UL << 1) /* Always returned, even if not requested */
>  #define PIDFD_INFO_CGROUPID            (1UL << 2) /* Always returned if available, even if not requested */
>  #define PIDFD_INFO_EXIT                        (1UL << 3) /* Only returned if requested. */
> +#define PIDFD_INFO_COREDUMP            (1UL << 4) /* Only returned if requested. */
>
>  #define PIDFD_INFO_SIZE_VER0           64 /* sizeof first published struct */
>
> +/*
> + * Values for @coredump_mask in pidfd_info.
> + * Only valid if PIDFD_INFO_COREDUMP is set in @mask.
> + *
> + * Note, the @PIDFD_COREDUMP_ROOT flag indicates that the generated
> + * coredump should be treated as sensitive and access should only be
> + * granted to privileged users.
> + *
> + * If the coredump AF_UNIX socket is used for processing coredumps
> + * @coredump_cookie will be set to the socket SO_COOKIE of the receivers
> + * client socket. This allows the coredump handler to detect whether an
> + * incoming coredump connection was initiated from the crashing task.
> + */
> +#define PIDFD_COREDUMPED       (1U << 0) /* Did crash and... */
> +#define PIDFD_COREDUMP_SKIP    (1U << 1) /* coredumping generation was skipped. */
> +#define PIDFD_COREDUMP_USER    (1U << 2) /* coredump was done as the user. */
> +#define PIDFD_COREDUMP_ROOT    (1U << 3) /* coredump was done as root. */
> +
>  /*
>   * The concept of process and threads in userland and the kernel is a confusing
>   * one - within the kernel every thread is a 'task' with its own individual PID,
> @@ -92,6 +111,9 @@ struct pidfd_info {
>         __u32 fsuid;
>         __u32 fsgid;
>         __s32 exit_code;
> +       __u32 coredump_mask;
> +       __u32 __spare1;
> +       __u64 coredump_cookie;
>  };
>
>  #define PIDFS_IOCTL_MAGIC 0xFF
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index a9d1c9ba2961..053d2e48e918 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -99,6 +99,7 @@
>  #include <linux/seq_file.h>
>  #include <linux/skbuff.h>
>  #include <linux/slab.h>
> +#include <linux/sock_diag.h>
>  #include <linux/socket.h>
>  #include <linux/splice.h>
>  #include <linux/string.h>
> @@ -742,6 +743,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
>
>  struct unix_peercred {
>         struct pid *peer_pid;
> +       u64 cookie;
>         const struct cred *peer_cred;
>  };
>
> @@ -777,6 +779,8 @@ static void drop_peercred(struct unix_peercred *peercred)
>  static inline void init_peercred(struct sock *sk,
>                                  const struct unix_peercred *peercred)
>  {
> +       if (peercred->cookie)
> +               pidfs_coredump_cookie(peercred->peer_pid, peercred->cookie);
>         sk->sk_peer_pid = peercred->peer_pid;
>         sk->sk_peer_cred = peercred->peer_cred;
>  }
> @@ -1713,6 +1717,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
>         unix_peer(newsk)        = sk;
>         newsk->sk_state         = TCP_ESTABLISHED;
>         newsk->sk_type          = sk->sk_type;
> +       /* Prepare a new socket cookie for the receiver. */
> +       if (flags & SOCK_COREDUMP)
> +               peercred.cookie = sock_gen_cookie(newsk);
>         init_peercred(newsk, &peercred);
>         newu = unix_sk(newsk);
>         newu->listener = other;
>
> --
> 2.47.2
>