[v3] coredump: add coredump socket

[PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Christian Brauner 9 months, 1 week ago

Make sure that only tasks that actually coredumped may connect to the
coredump socket. This restriction may be loosened later in case
userspace processes would like to use it to generate their own
coredumps. Though it'd be wiser if userspace just exposed a separate
socket for that.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/coredump.c            | 25 +++++++++++++++++++++++++
 fs/pidfs.c               | 13 +++++++++++++
 include/linux/coredump.h | 12 ++++++++++++
 include/linux/pidfs.h    |  1 +
 net/unix/af_unix.c       | 19 +++++++++++++------
 5 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index a5f17aaeee32..8a9620ce4fd0 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -630,6 +630,31 @@ struct sockaddr_un coredump_unix_socket = {
 #define COREDUMP_UNIX_SOCKET_ADDR_SIZE            \
 	(offsetof(struct sockaddr_un, sun_path) + \
 	 sizeof("\0linuxafsk/coredump.socket") - 1)
+
+static inline bool is_coredump_socket(const struct sockaddr_un *sunname,
+				      unsigned int len)
+{
+	return (COREDUMP_UNIX_SOCKET_ADDR_SIZE == len) &&
+	       !memcmp(&coredump_unix_socket, sunname, len);
+}
+
+/*
+ * For the coredump socket in the initial network namespace we only
+ * allow actual coredumping processes to connect to it, i.e., the kernel
+ * itself.
+ */
+bool unix_use_coredump_socket(const struct net *net, const struct pid *pid,
+			      const struct sockaddr_un *sunname,
+			      unsigned int len)
+{
+	if (net != &init_net)
+		return true;
+
+	if (!is_coredump_socket(sunname, len))
+		return true;
+
+	return pidfs_pid_coredumped(pid);
+}
 #endif
 
 void do_coredump(const kernel_siginfo_t *siginfo)
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 8c4d83fb115b..e0a4c34ddc42 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -258,6 +258,19 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
 	return 0;
 }
 
+bool pidfs_pid_coredumped(const struct pid *pid)
+{
+	struct inode *inode;
+
+	if (!pid)
+		return false;
+
+	/* We expect the caller to hold a reference to @pid->stashed. */
+	VFS_WARN_ON_ONCE(!pid->stashed);
+	inode = d_inode(pid->stashed);
+	return (READ_ONCE(pidfs_i(inode)->__pei.coredump_mask) & PIDFD_COREDUMPED);
+}
+
 static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 76e41805b92d..2b2f0c016c16 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -7,6 +7,8 @@
 #include <linux/fs.h>
 #include <asm/siginfo.h>
 
+struct sockaddr_un;
+
 #ifdef CONFIG_COREDUMP
 struct core_vma_metadata {
 	unsigned long start, end;
@@ -44,6 +46,9 @@ extern int dump_align(struct coredump_params *cprm, int align);
 int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		    unsigned long len);
 extern void do_coredump(const kernel_siginfo_t *siginfo);
+bool unix_use_coredump_socket(const struct net *net, const struct pid *pid,
+			      const struct sockaddr_un *sunname,
+			      unsigned int len);
 
 /*
  * Logging for the coredump code, ratelimited.
@@ -64,6 +69,13 @@ extern void do_coredump(const kernel_siginfo_t *siginfo);
 
 #else
 static inline void do_coredump(const kernel_siginfo_t *siginfo) {}
+static inline bool unix_use_coredump_socket(const struct net *net,
+					    const struct pid *pid,
+					    const struct sockaddr_un *sunname,
+					    unsigned int len)
+{
+	return true;
+}
 
 #define coredump_report(...)
 #define coredump_report_failure(...)
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index f7729b9371bc..f9c287c0e0ff 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -14,5 +14,6 @@ extern const struct dentry_operations pidfs_dentry_operations;
 int pidfs_register_pid(struct pid *pid);
 void pidfs_get_pid(struct pid *pid);
 void pidfs_put_pid(struct pid *pid);
+bool pidfs_pid_coredumped(const struct pid *pid);
 
 #endif /* _LINUX_PID_FS_H */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index edc2f143f401..613cf9225381 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -101,6 +101,7 @@
 #include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/pidfs.h>
+#include <linux/coredump.h>
 #include <net/af_unix.h>
 #include <net/net_namespace.h>
 #include <net/scm.h>
@@ -1217,6 +1218,7 @@ static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
 }
 
 static struct sock *unix_find_abstract(struct net *net,
+				       const struct pid *peer_pid,
 				       struct sockaddr_un *sunaddr,
 				       int addr_len, int type)
 {
@@ -1224,6 +1226,9 @@ static struct sock *unix_find_abstract(struct net *net,
 	struct dentry *dentry;
 	struct sock *sk;
 
+	if (!unix_use_coredump_socket(net, peer_pid, sunaddr, addr_len))
+		return ERR_PTR(-ECONNREFUSED);
+
 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
 	if (!sk)
 		return ERR_PTR(-ECONNREFUSED);
@@ -1236,15 +1241,16 @@ static struct sock *unix_find_abstract(struct net *net,
 }
 
 static struct sock *unix_find_other(struct net *net,
-				    struct sockaddr_un *sunaddr,
-				    int addr_len, int type)
+				    const struct pid *peer_pid,
+				    struct sockaddr_un *sunaddr, int addr_len,
+				    int type)
 {
 	struct sock *sk;
 
 	if (sunaddr->sun_path[0])
 		sk = unix_find_bsd(sunaddr, addr_len, type);
 	else
-		sk = unix_find_abstract(net, sunaddr, addr_len, type);
+		sk = unix_find_abstract(net, peer_pid, sunaddr, addr_len, type);
 
 	return sk;
 }
@@ -1500,7 +1506,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 		}
 
 restart:
-		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
+		other = unix_find_other(sock_net(sk), NULL, sunaddr, alen, sock->type);
 		if (IS_ERR(other)) {
 			err = PTR_ERR(other);
 			goto out;
@@ -1647,7 +1653,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
 restart:
 	/*  Find listening sock. */
-	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
+	other = unix_find_other(net, peercred.peer_pid, sunaddr, addr_len,
+				sk->sk_type);
 	if (IS_ERR(other)) {
 		err = PTR_ERR(other);
 		goto out_free_skb;
@@ -2115,7 +2122,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 
 	if (msg->msg_namelen) {
 lookup:
-		other = unix_find_other(sock_net(sk), msg->msg_name,
+		other = unix_find_other(sock_net(sk), NULL, msg->msg_name,
 					msg->msg_namelen, sk->sk_type);
 		if (IS_ERR(other)) {
 			err = PTR_ERR(other);

-- 
2.47.2

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Jann Horn 9 months, 1 week ago

On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> Make sure that only tasks that actually coredumped may connect to the
> coredump socket. This restriction may be loosened later in case
> userspace processes would like to use it to generate their own
> coredumps. Though it'd be wiser if userspace just exposed a separate
> socket for that.

This implementation kinda feels a bit fragile to me... I wonder if we
could instead have a flag inside the af_unix client socket that says
"this is a special client socket for coredumping".

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Christian Brauner 9 months, 1 week ago

On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > Make sure that only tasks that actually coredumped may connect to the
> > coredump socket. This restriction may be loosened later in case
> > userspace processes would like to use it to generate their own
> > coredumps. Though it'd be wiser if userspace just exposed a separate
> > socket for that.
> 
> This implementation kinda feels a bit fragile to me... I wonder if we
> could instead have a flag inside the af_unix client socket that says
> "this is a special client socket for coredumping".

Should be easily doable with a sock_flag().

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Kuniyuki Iwashima 9 months, 1 week ago

From: Christian Brauner <brauner@kernel.org>
Date: Mon, 5 May 2025 16:06:40 +0200
> On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > Make sure that only tasks that actually coredumped may connect to the
> > > coredump socket. This restriction may be loosened later in case
> > > userspace processes would like to use it to generate their own
> > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > socket for that.
> > 
> > This implementation kinda feels a bit fragile to me... I wonder if we
> > could instead have a flag inside the af_unix client socket that says
> > "this is a special client socket for coredumping".
> 
> Should be easily doable with a sock_flag().

This restriction should be applied by BPF LSM.

It's hard to loosen such a default restriction, as someone might
argue that the change is unexpected and a regression.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Jann Horn 9 months, 1 week ago

On Mon, May 5, 2025 at 8:41 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> From: Christian Brauner <brauner@kernel.org>
> Date: Mon, 5 May 2025 16:06:40 +0200
> > On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > Make sure that only tasks that actually coredumped may connect to the
> > > > coredump socket. This restriction may be loosened later in case
> > > > userspace processes would like to use it to generate their own
> > > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > > socket for that.
> > >
> > > This implementation kinda feels a bit fragile to me... I wonder if we
> > > could instead have a flag inside the af_unix client socket that says
> > > "this is a special client socket for coredumping".
> >
> > Should be easily doable with a sock_flag().
>
> This restriction should be applied by BPF LSM.

I think we shouldn't allow random userspace processes to connect to
the core dump handling service and provide bogus inputs; that
unnecessarily increases the risk that a crafted coredump can be used
to exploit a bug in the service. So I think it makes sense to enforce
this restriction in the kernel.

My understanding is that BPF LSM creates fairly tight coupling between
userspace and the kernel implementation, and it is kind of unwieldy
for userspace. (I imagine the "man 5 core" manpage would get a bit
longer and describe more kernel implementation detail if you tried to
show how to write a BPF LSM that is capable of detecting unix domain
socket connections to a specific address that are not initiated by
core dumping.) I would like to keep it possible to implement core
userspace functionality in a best-practice way without needing eBPF.

> It's hard to loosen such a default restriction as someone might
> argue that's unexpected and regression.

If userspace wants to allow other processes to connect to the core
dumping service, that's easy to implement - userspace can listen on a
separate address that is not subject to these restrictions.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Christian Brauner 9 months, 1 week ago

On Mon, May 05, 2025 at 09:10:28PM +0200, Jann Horn wrote:
> On Mon, May 5, 2025 at 8:41 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > From: Christian Brauner <brauner@kernel.org>
> > Date: Mon, 5 May 2025 16:06:40 +0200
> > > On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > > > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > > Make sure that only tasks that actually coredumped may connect to the
> > > > > coredump socket. This restriction may be loosened later in case
> > > > > userspace processes would like to use it to generate their own
> > > > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > > > socket for that.
> > > >
> > > > This implementation kinda feels a bit fragile to me... I wonder if we
> > > > could instead have a flag inside the af_unix client socket that says
> > > > "this is a special client socket for coredumping".
> > >
> > > Should be easily doable with a sock_flag().
> >
> > This restriction should be applied by BPF LSM.
> 
> I think we shouldn't allow random userspace processes to connect to
> the core dump handling service and provide bogus inputs; that
> unnecessarily increases the risk that a crafted coredump can be used
> to exploit a bug in the service. So I think it makes sense to enforce
> this restriction in the kernel.
> 
> My understanding is that BPF LSM creates fairly tight coupling between
> userspace and the kernel implementation, and it is kind of unwieldy
> for userspace. (I imagine the "man 5 core" manpage would get a bit
> longer and describe more kernel implementation detail if you tried to
> show how to write a BPF LSM that is capable of detecting unix domain
> socket connections to a specific address that are not initiated by
> core dumping.) I would like to keep it possible to implement core
> userspace functionality in a best-practice way without needing eBPF.
> 
> > It's hard to loosen such a default restriction as someone might
> > argue that's unexpected and regression.
> 
> If userspace wants to allow other processes to connect to the core
> dumping service, that's easy to implement - userspace can listen on a
> separate address that is not subject to these restrictions.

I think Kuniyuki's point is defensible. And I did discuss this with
Lennart when I wrote the patch and he didn't see a point in preventing
other processes from connecting to the core dump socket. He actually
would like this to be possible because there are some userspace programs
out there that generate their own coredumps (Python?) and he wanted them
to be able to send those coredumps to the general coredump socket.

I just found it more elegant to simply guarantee that the only connections
made to that socket come from coredumping tasks.

But I should note there are two ways to cleanly handle this in
userspace. I had already mentioned the bpf LSM in the context of
rate-limiting in an earlier posting:

(1) complex:

    Use a bpf LSM to intercept the connection request via
    security_unix_stream_connect() in unix_stream_connect().

    The bpf program can simply check:

    current->signal->core_state

    and reject any connection if it is NULL, i.e., if the connecting
    task is not in the middle of a coredump.

    The big downside is that bpf (and security) need to be enabled.
    Neither is guaranteed and there are quite a few users out there that
    don't enable bpf.

(2) simple (and supported in this series):

    Userspace accepts a connection. It has to get SO_PEERPIDFD anyway.
    It then needs to verify:

    struct pidfd_info info = {
            info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP,
    };

    ioctl(pidfd, PIDFD_GET_INFO, &info);
    if (!(info.mask & PIDFD_INFO_COREDUMP)) {
            // Can't be from a coredumping task so we can close the
	    // connection without reading.
	    close(coredump_client_fd);
	    return;
    }

    /* This has to be set and is only settable by do_coredump(). */
    if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
            // Can't be from a coredumping task so we can close the
	    // connection without reading.
	    close(coredump_client_fd);
	    return;
    }

    // Ok, this is a connection from a task that has coredumped, let's
    // handle it.

    The crux is that the series guarantees that, by the time the
    connection is made, the information about whether the task/thread-group
    coredumped is available via the pidfd.
 
I think if we document that most coredump servers have to do (2) then
this is fine. But I wouldn't mind a nod from Jann on this.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Kuniyuki Iwashima 9 months, 1 week ago

From: Christian Brauner <brauner@kernel.org>
Date: Tue, 6 May 2025 10:06:27 +0200
> On Mon, May 05, 2025 at 09:10:28PM +0200, Jann Horn wrote:
> > On Mon, May 5, 2025 at 8:41 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > > From: Christian Brauner <brauner@kernel.org>
> > > Date: Mon, 5 May 2025 16:06:40 +0200
> > > > On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > > > > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > > > Make sure that only tasks that actually coredumped may connect to the
> > > > > > coredump socket. This restriction may be loosened later in case
> > > > > > userspace processes would like to use it to generate their own
> > > > > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > > > > socket for that.
> > > > >
> > > > > This implementation kinda feels a bit fragile to me... I wonder if we
> > > > > could instead have a flag inside the af_unix client socket that says
> > > > > "this is a special client socket for coredumping".
> > > >
> > > > Should be easily doable with a sock_flag().
> > >
> > > This restriction should be applied by BPF LSM.
> > 
> > I think we shouldn't allow random userspace processes to connect to
> > the core dump handling service and provide bogus inputs; that
> > unnecessarily increases the risk that a crafted coredump can be used
> > to exploit a bug in the service. So I think it makes sense to enforce
> > this restriction in the kernel.
> > 
> > My understanding is that BPF LSM creates fairly tight coupling between
> > userspace and the kernel implementation, and it is kind of unwieldy
> > for userspace. (I imagine the "man 5 core" manpage would get a bit
> > longer and describe more kernel implementation detail if you tried to
> > show how to write a BPF LSM that is capable of detecting unix domain
> > socket connections to a specific address that are not initiated by
> > core dumping.) I would like to keep it possible to implement core
> > userspace functionality in a best-practice way without needing eBPF.
> > 
> > > It's hard to loosen such a default restriction as someone might
> > > argue that's unexpected and regression.
> > 
> > If userspace wants to allow other processes to connect to the core
> > dumping service, that's easy to implement - userspace can listen on a
> > separate address that is not subject to these restrictions.
> 
> I think Kuniyuki's point is defensible. And I did discuss this with
> Lennart when I wrote the patch and he didn't see a point in preventing
> other processes from connecting to the core dump socket. He actually
> would like this to be possible because there's some userspace programs
> out there that generate their own coredumps (Python?) and he wanted them
> to use the general coredump socket to send them to.
> 
> I just found it more elegant to simply guarantee that only connections
> are made to that socket come from coredumping tasks.
> 
> But I should note there are two ways to cleanly handle this in
> userspace. I had already mentioned the bpf LSM in the contect of
> rate-limiting in an earlier posting:
> 
> (1) complex:
> 
>     Use a bpf LSM to intercept the connection request via
>     security_unix_stream_connect() in unix_stream_connect().
> 
>     The bpf program can simply check:
> 
>     current->signal->core_state
> 
>     and reject any connection if it isn't set to NULL.
> 
>     The big downside is that bpf (and security) need to be enabled.
>     Neither is guaranteed and there's quite a few users out there that
>     don't enable bpf.
> 
> (2) simple (and supported in this series):
> 
>     Userspace accepts a connection. It has to get SO_PEERPIDFD anyway.
>     It then needs to verify:
> 
>     struct pidfd_info info = {
>             info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP,
>     };
> 
>     ioctl(pidfd, PIDFD_GET_INFO, &info);
>     if (!(info.mask & PIDFD_INFO_COREDUMP)) {
>             // Can't be from a coredumping task so we can close the
> 	    // connection without reading.
> 	    close(coredump_client_fd);
> 	    return;
>     }
> 
>     /* This has to be set and is only settable by do_coredump(). */
>     if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
>             // Can't be from a coredumping task so we can close the
> 	    // connection without reading.
> 	    close(coredump_client_fd);
> 	    return;
>     }
> 
>     // Ok, this is a connection from a task that has coredumped, let's
>     // handle it.
> 
>     The crux is that the series guarantees that by the time the
>     connection is made the info whether the task/thread-group did
>     coredump is guaranteed to be available via the pidfd.
>  
> I think if we document that most coredump servers have to do (2) then
> this is fine. But I wouldn't mind a nod from Jann on this.

I like this approach (2) allowing users to filter the right client.
This way we can extend the application flexibly for another coredump
service.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Mickaël Salaün 9 months, 1 week ago

On Tue, May 06, 2025 at 12:18:12PM -0700, Kuniyuki Iwashima wrote:
> From: Christian Brauner <brauner@kernel.org>
> Date: Tue, 6 May 2025 10:06:27 +0200
> > On Mon, May 05, 2025 at 09:10:28PM +0200, Jann Horn wrote:
> > > On Mon, May 5, 2025 at 8:41 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > > > From: Christian Brauner <brauner@kernel.org>
> > > > Date: Mon, 5 May 2025 16:06:40 +0200
> > > > > On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > > > > > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > > > > Make sure that only tasks that actually coredumped may connect to the
> > > > > > > coredump socket. This restriction may be loosened later in case
> > > > > > > userspace processes would like to use it to generate their own
> > > > > > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > > > > > socket for that.
> > > > > >
> > > > > > This implementation kinda feels a bit fragile to me... I wonder if we
> > > > > > could instead have a flag inside the af_unix client socket that says
> > > > > > "this is a special client socket for coredumping".
> > > > >
> > > > > Should be easily doable with a sock_flag().
> > > >
> > > > This restriction should be applied by BPF LSM.
> > > 
> > > I think we shouldn't allow random userspace processes to connect to
> > > the core dump handling service and provide bogus inputs; that
> > > unnecessarily increases the risk that a crafted coredump can be used
> > > to exploit a bug in the service. So I think it makes sense to enforce
> > > this restriction in the kernel.
> > > 
> > > My understanding is that BPF LSM creates fairly tight coupling between
> > > userspace and the kernel implementation, and it is kind of unwieldy
> > > for userspace. (I imagine the "man 5 core" manpage would get a bit
> > > longer and describe more kernel implementation detail if you tried to
> > > show how to write a BPF LSM that is capable of detecting unix domain
> > > socket connections to a specific address that are not initiated by
> > > core dumping.) I would like to keep it possible to implement core
> > > userspace functionality in a best-practice way without needing eBPF.
> > > 
> > > > It's hard to loosen such a default restriction as someone might
> > > > argue that's unexpected and regression.
> > > 
> > > If userspace wants to allow other processes to connect to the core
> > > dumping service, that's easy to implement - userspace can listen on a
> > > separate address that is not subject to these restrictions.
> > 
> > I think Kuniyuki's point is defensible. And I did discuss this with
> > Lennart when I wrote the patch and he didn't see a point in preventing
> > other processes from connecting to the core dump socket. He actually
> > would like this to be possible because there's some userspace programs
> > out there that generate their own coredumps (Python?) and he wanted them
> > to use the general coredump socket to send them to.
> > 
> > I just found it more elegant to simply guarantee that only connections
> > are made to that socket come from coredumping tasks.
> > 
> > But I should note there are two ways to cleanly handle this in
> > userspace. I had already mentioned the bpf LSM in the contect of
> > rate-limiting in an earlier posting:
> > 
> > (1) complex:
> > 
> >     Use a bpf LSM to intercept the connection request via
> >     security_unix_stream_connect() in unix_stream_connect().
> > 
> >     The bpf program can simply check:
> > 
> >     current->signal->core_state
> > 
> >     and reject any connection if it isn't set to NULL.
> > 
> >     The big downside is that bpf (and security) need to be enabled.
> >     Neither is guaranteed and there's quite a few users out there that
> >     don't enable bpf.

The kernel should indeed always have a minimal security policy in place.
LSMs can tailor that, but we should not assume that a specific LSM with
a specific policy is enabled/configured on the system.

> > 
> > (2) simple (and supported in this series):
> > 
> >     Userspace accepts a connection. It has to get SO_PEERPIDFD anyway.
> >     It then needs to verify:
> > 
> >     struct pidfd_info info = {
> >             info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP,
> >     };
> > 
> >     ioctl(pidfd, PIDFD_GET_INFO, &info);
> >     if (!(info.mask & PIDFD_INFO_COREDUMP)) {
> >             // Can't be from a coredumping task so we can close the
> > 	    // connection without reading.
> > 	    close(coredump_client_fd);
> > 	    return;
> >     }
> > 
> >     /* This has to be set and is only settable by do_coredump(). */
> >     if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
> >             // Can't be from a coredumping task so we can close the
> > 	    // connection without reading.
> > 	    close(coredump_client_fd);
> > 	    return;
> >     }
> > 
> >     // Ok, this is a connection from a task that has coredumped, let's
> >     // handle it.

What if the task sends a "fake" coredump and then, right after that, really
coredumps?  There could be a race condition on the server side when
checking the coredump property of this pidfd.

Could we add a trusted header to the coredump payload that is always
written by the kernel?  This would make it possible to read a trusted flag
indicating whether the following payload is a coredump generated by the
kernel or not.

> > 
> >     The crux is that the series guarantees that by the time the
> >     connection is made the info whether the task/thread-group did
> >     coredump is guaranteed to be available via the pidfd.
> >  
> > I think if we document that most coredump servers have to do (2) then
> > this is fine. But I wouldn't mind a nod from Jann on this.
> 
> I like this approach (2) allowing users to filter the right client.
> This way we can extend the application flexibly for another coredump
> service.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Paul Moore 9 months, 1 week ago

On Wed, May 7, 2025 at 7:54 AM Mickaël Salaün <mic@digikod.net> wrote:
> On Tue, May 06, 2025 at 12:18:12PM -0700, Kuniyuki Iwashima wrote:
> > From: Christian Brauner <brauner@kernel.org>
> > Date: Tue, 6 May 2025 10:06:27 +0200
> > > On Mon, May 05, 2025 at 09:10:28PM +0200, Jann Horn wrote:
> > > > On Mon, May 5, 2025 at 8:41 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > > > > From: Christian Brauner <brauner@kernel.org>
> > > > > Date: Mon, 5 May 2025 16:06:40 +0200
> > > > > > On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > > > > > > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > > > > > Make sure that only tasks that actually coredumped may connect to the
> > > > > > > > coredump socket. This restriction may be loosened later in case
> > > > > > > > userspace processes would like to use it to generate their own
> > > > > > > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > > > > > > socket for that.
> > > > > > >
> > > > > > > This implementation kinda feels a bit fragile to me... I wonder if we
> > > > > > > could instead have a flag inside the af_unix client socket that says
> > > > > > > "this is a special client socket for coredumping".
> > > > > >
> > > > > > Should be easily doable with a sock_flag().
> > > > >
> > > > > This restriction should be applied by BPF LSM.
> > > >
> > > > I think we shouldn't allow random userspace processes to connect to
> > > > the core dump handling service and provide bogus inputs; that
> > > > unnecessarily increases the risk that a crafted coredump can be used
> > > > to exploit a bug in the service. So I think it makes sense to enforce
> > > > this restriction in the kernel.
> > > >
> > > > My understanding is that BPF LSM creates fairly tight coupling between
> > > > userspace and the kernel implementation, and it is kind of unwieldy
> > > > for userspace. (I imagine the "man 5 core" manpage would get a bit
> > > > longer and describe more kernel implementation detail if you tried to
> > > > show how to write a BPF LSM that is capable of detecting unix domain
> > > > socket connections to a specific address that are not initiated by
> > > > core dumping.) I would like to keep it possible to implement core
> > > > userspace functionality in a best-practice way without needing eBPF.
> > > >
> > > > > It's hard to loosen such a default restriction as someone might
> > > > > argue that's unexpected and regression.
> > > >
> > > > If userspace wants to allow other processes to connect to the core
> > > > dumping service, that's easy to implement - userspace can listen on a
> > > > separate address that is not subject to these restrictions.
> > >
> > > I think Kuniyuki's point is defensible. And I did discuss this with
> > > Lennart when I wrote the patch and he didn't see a point in preventing
> > > other processes from connecting to the core dump socket. He actually
> > > would like this to be possible because there's some userspace programs
> > > out there that generate their own coredumps (Python?) and he wanted them
> > > to use the general coredump socket to send them to.

From a security perspective, it seems very reasonable to me that an
LSM would want to potentially control which processes are allowed to
bind or connect to the coredump socket.  Assuming that the socket
creation, bind(), and connect() operations go through all of the
normal LSM hooks like any other socket that shouldn't be a problem.
Some LSMs may have challenges with the lack of a filesystem path
associated with the socket, but abstract sockets are far from a new
concept and those LSMs should already have a mechanism for dealing
with such sockets.

I also want to stress that when we think about LSM controls, we need
to think in generic terms and not solely on a specific LSM, e.g. BPF.
It's fine and good to have documentation about how one might use a BPF
LSM to mitigate access to a coredump socket, but it should be made
clear in that same documentation that other LSMs may also be enforcing
access controls on that socket.  Further, and I believe everyone here
already knows this, but just to be clear, the kernel code should
definitely not assume either the presence of a specific LSM, or the
LSM in general.

> > > I just found it more elegant to simply guarantee that connections made
> > > to that socket only come from coredumping tasks.
> > >
> > > But I should note there are two ways to cleanly handle this in
> > > userspace. I had already mentioned the bpf LSM in the context of
> > > rate-limiting in an earlier posting:
> > >
> > > (1) complex:
> > >
> > >     Use a bpf LSM to intercept the connection request via
> > >     security_unix_stream_connect() in unix_stream_connect().
> > >
> > >     The bpf program can simply check:
> > >
> > >     current->signal->core_state
> > >
> > >     and reject any connection if it is NULL (i.e., not coredumping).
> > >
> > >     The big downside is that bpf (and security) need to be enabled.
> > >     Neither is guaranteed and there's quite a few users out there that
> > >     don't enable bpf.
>
> The kernel should indeed always have a minimal security policy in place,
> LSMs can tailor that, but we should not assume that a specific LSM with
> a specific policy is enabled/configured on the system.

None of the LSM mailing lists were CC'd so I haven't seen the full
thread yet, and haven't had the chance to dig it up on lore, but at
the very least I would think there should be some basic controls
around who can bind/receive coredumps.

-- 
paul-moore.com

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Lennart Poettering 9 months, 1 week ago

On Mi, 07.05.25 13:51, Mickaël Salaün (mic@digikod.net) wrote:

> What if the task sends a "fake" coredump and just after that really
> coredumps?  There could be a race condition on the server side when
> checking the coredump property of this pidfd.
>
> Could we add a trusted header to the coredump payload that is always
> written by the kernel?  This would make it possible to read a trusted flag
> indicating whether the following payload is a coredump generated by the
> kernel or not.

With my systemd hat on I must say I don't really care if the coredump
is "authentic" or not. The coredump is untrusted data anyway, it needs
to be quarantined from systemd-coredump's PoV, there's no security
value in distinguishing if some random user's process was sent to the
handler because the user called raise(SIGSEGV) or because the user
called connect() to our socket. The process is under user control in
almost all ways anyways, they can rearrange its internals in almost
any way already, play any games they want. It's of very little value
if the receiving side can determine if the serialization of potential
complete garbage was written out by the kernel or by the process' own
code.

Moreover, in systemd-coredump we these days collect not only classic
ELF coredumps passed to us by the kernel, but also other forms of
crashes. For example if a Python program dies because of an uncaught
exception this is also passed to systemd-coredump, and treated the
same way in regards to metadata collection, logging, storage,
recycling and so on. Conceptually a python crash like that and a
process level crash are the same thing for us, we treat them the
same. And of course, Python crashes like this are *inherently* a
userspace concept, hence we explicitly want to accept them. Hence even
if we'd be able to distinguish "true" from "fake" coredumps we'd still
not care or change our behaviour much, because we are *as* *much*
interested in user-level crashes as in kernel handled crashes.

Lennart

--
Lennart Poettering, Berlin

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Jann Horn 9 months, 1 week ago

On Tue, May 6, 2025 at 10:06 AM Christian Brauner <brauner@kernel.org> wrote:
> On Mon, May 05, 2025 at 09:10:28PM +0200, Jann Horn wrote:
> > On Mon, May 5, 2025 at 8:41 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > > From: Christian Brauner <brauner@kernel.org>
> > > Date: Mon, 5 May 2025 16:06:40 +0200
> > > > On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > > > > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > > > Make sure that only tasks that actually coredumped may connect to the
> > > > > > coredump socket. This restriction may be loosened later in case
> > > > > > userspace processes would like to use it to generate their own
> > > > > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > > > > socket for that.
> > > > >
> > > > > This implementation kinda feels a bit fragile to me... I wonder if we
> > > > > could instead have a flag inside the af_unix client socket that says
> > > > > "this is a special client socket for coredumping".
> > > >
> > > > Should be easily doable with a sock_flag().
> > >
> > > This restriction should be applied by BPF LSM.
> >
> > I think we shouldn't allow random userspace processes to connect to
> > the core dump handling service and provide bogus inputs; that
> > unnecessarily increases the risk that a crafted coredump can be used
> > to exploit a bug in the service. So I think it makes sense to enforce
> > this restriction in the kernel.
> >
> > My understanding is that BPF LSM creates fairly tight coupling between
> > userspace and the kernel implementation, and it is kind of unwieldy
> > for userspace. (I imagine the "man 5 core" manpage would get a bit
> > longer and describe more kernel implementation detail if you tried to
> > show how to write a BPF LSM that is capable of detecting unix domain
> > socket connections to a specific address that are not initiated by
> > core dumping.) I would like to keep it possible to implement core
> > userspace functionality in a best-practice way without needing eBPF.
> >
> > > It's hard to loosen such a default restriction as someone might
> > > argue that's unexpected and a regression.
> >
> > If userspace wants to allow other processes to connect to the core
> > dumping service, that's easy to implement - userspace can listen on a
> > separate address that is not subject to these restrictions.
>
> I think Kuniyuki's point is defensible. And I did discuss this with
> Lennart when I wrote the patch and he didn't see a point in preventing
> other processes from connecting to the core dump socket. He actually
> would like this to be possible because there's some userspace programs
> out there that generate their own coredumps (Python?) and he wanted them
> to use the general coredump socket to send them to.
>
> I just found it more elegant to simply guarantee that connections made
> to that socket only come from coredumping tasks.
>
> But I should note there are two ways to cleanly handle this in
> userspace. I had already mentioned the bpf LSM in the context of
> rate-limiting in an earlier posting:
>
> (1) complex:
>
>     Use a bpf LSM to intercept the connection request via
>     security_unix_stream_connect() in unix_stream_connect().
>
>     The bpf program can simply check:
>
>     current->signal->core_state
>
>     and reject any connection if it is NULL (i.e., not coredumping).

I think that would be racy, since zap_threads sets that pointer before
ensuring that the other threads under the signal_struct are killed.

>     The big downside is that bpf (and security) need to be enabled.
>     Neither is guaranteed and there's quite a few users out there that
>     don't enable bpf.
>
> (2) simple (and supported in this series):
>
>     Userspace accepts a connection. It has to get SO_PEERPIDFD anyway.
>     It then needs to verify:
>
>     struct pidfd_info info = {
>             info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP,
>     };
>
>     ioctl(pidfd, PIDFD_GET_INFO, &info);
>     if (!(info.mask & PIDFD_INFO_COREDUMP)) {
>             // Can't be from a coredumping task so we can close the
>             // connection without reading.
>             close(coredump_client_fd);
>             return;
>     }
>
>     /* This has to be set and is only settable by do_coredump(). */
>     if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
>             // Can't be from a coredumping task so we can close the
>             // connection without reading.
>             close(coredump_client_fd);
>             return;
>     }
>
>     // Ok, this is a connection from a task that has coredumped, let's
>     // handle it.
>
>     The crux is that the series guarantees that by the time the
>     connection is made the info whether the task/thread-group did
>     coredump is guaranteed to be available via the pidfd.
>
> I think if we document that most coredump servers have to do (2) then
> this is fine. But I wouldn't mind a nod from Jann on this.

I wouldn't recommend either of these as a way to verify that the data
coming over the socket is a core dump generated by the kernel, since
they both look racy in that regard.

But given that you're saying the initial userspace user wouldn't
actually want such a restriction, and that we could later provide a
separate way for userspace to check what initiated the connection, I
guess this is fine for now.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Kuniyuki Iwashima 9 months, 1 week ago

From: Jann Horn <jannh@google.com>
Date: Mon, 5 May 2025 21:10:28 +0200
> On Mon, May 5, 2025 at 8:41 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > From: Christian Brauner <brauner@kernel.org>
> > Date: Mon, 5 May 2025 16:06:40 +0200
> > > On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > > > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > > Make sure that only tasks that actually coredumped may connect to the
> > > > > coredump socket. This restriction may be loosened later in case
> > > > > userspace processes would like to use it to generate their own
> > > > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > > > socket for that.
> > > >
> > > > This implementation kinda feels a bit fragile to me... I wonder if we
> > > > could instead have a flag inside the af_unix client socket that says
> > > > "this is a special client socket for coredumping".
> > >
> > > Should be easily doable with a sock_flag().
> >
> > This restriction should be applied by BPF LSM.
> 
> I think we shouldn't allow random userspace processes to connect to
> the core dump handling service and provide bogus inputs; that
> unnecessarily increases the risk that a crafted coredump can be used
> to exploit a bug in the service. So I think it makes sense to enforce
> this restriction in the kernel.

It's already restricted by /proc/sys/kernel/core_pattern.
We don't need a duplicated logic.

Even when the process holding the listener dies, you can
still avoid such a leak.

e.g.

1. Set up a listener
2. Put the socket into a bpf map
3. Attach LSM at connect()

Then, the LSM checks if the destination socket is

  * listening on the name specified in /proc/sys/kernel/core_pattern
  * exists in the associated BPF map

So, if the socket dies and a malicious user tries to hijack
the core_pattern name, LSM still rejects such connect().

Later, the admin can restart the program with a different core_pattern.


> 
> My understanding is that BPF LSM creates fairly tight coupling between
> userspace and the kernel implementation, and it is kind of unwieldy
> for userspace. (I imagine the "man 5 core" manpage would get a bit
> longer and describe more kernel implementation detail if you tried to
> show how to write a BPF LSM that is capable of detecting unix domain
> socket connections to a specific address that are not initiated by
> core dumping.) I would like to keep it possible to implement core
> userspace functionality in a best-practice way without needing eBPF.

I think the untrusted user scenario is paranoia in most cases,
and the man page just says "if you really care, use BPF LSM".

If someone can listen on a name AND set it to core_pattern, most
likely something worse already happened.


> 
> > It's hard to loosen such a default restriction as someone might
> > argue that's unexpected and regression.
> 
> If userspace wants to allow other processes to connect to the core
> dumping service, that's easy to implement - userspace can listen on a
> separate address that is not subject to these restrictions.
>

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Jann Horn 9 months, 1 week ago

On Mon, May 5, 2025 at 9:38 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> From: Jann Horn <jannh@google.com>
> Date: Mon, 5 May 2025 21:10:28 +0200
> > On Mon, May 5, 2025 at 8:41 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > > From: Christian Brauner <brauner@kernel.org>
> > > Date: Mon, 5 May 2025 16:06:40 +0200
> > > > On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > > > > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > > > Make sure that only tasks that actually coredumped may connect to the
> > > > > > coredump socket. This restriction may be loosened later in case
> > > > > > userspace processes would like to use it to generate their own
> > > > > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > > > > socket for that.
> > > > >
> > > > > This implementation kinda feels a bit fragile to me... I wonder if we
> > > > > could instead have a flag inside the af_unix client socket that says
> > > > > "this is a special client socket for coredumping".
> > > >
> > > > Should be easily doable with a sock_flag().
> > >
> > > This restriction should be applied by BPF LSM.
> >
> > I think we shouldn't allow random userspace processes to connect to
> > the core dump handling service and provide bogus inputs; that
> > unnecessarily increases the risk that a crafted coredump can be used
> > to exploit a bug in the service. So I think it makes sense to enforce
> > this restriction in the kernel.
>
> It's already restricted by /proc/sys/kernel/core_pattern.
> We don't need a duplicated logic.

The core_pattern does not restrict which processes can call connect()
on the unix domain socket address.

> Even when the process holding the listener dies, you can
> still avoid such a leak.
>
> e.g.
>
> 1. Set up a listener
> 2. Put the socket into a bpf map
> 3. Attach LSM at connect()
>
> Then, the LSM checks if the destination socket is

Where does the LSM get the destination socket pointer from? The
socket_connect LSM hook happens before the point where the destination
socket is looked up. What you have in that hook is the unix socket
address structure.

>   * listening on the name specified in /proc/sys/kernel/core_pattern
>   * exists in the associated BPF map
>
> So, if the socket is dies and a malicious user tries to hijack
> the core_pattern name, LSM still rejects such connect().

This patch is not about a malicious user binding to the core dumping
service's unix domain socket address, that is blocked in "[PATCH RFC
v3 03/10] net: reserve prefix". This patch is about preventing
userspace from connect()ing to the legitimate listening socket.

> Later, the admin can restart the program with different core_pattern.
>
>
> >
> > My understanding is that BPF LSM creates fairly tight coupling between
> > userspace and the kernel implementation, and it is kind of unwieldy
> > for userspace. (I imagine the "man 5 core" manpage would get a bit
> > longer and describe more kernel implementation detail if you tried to
> > show how to write a BPF LSM that is capable of detecting unix domain
> > socket connections to a specific address that are not initiated by
> > core dumping.) I would like to keep it possible to implement core
> > userspace functionality in a best-practice way without needing eBPF.
>
> I think the untrusted user scenario is paranoia in most cases,
> and the man page just says "if you really care, use BPF LSM".

Are you saying that you expect crash dumping services to be written
with the expectation that the system will not have multiple users or
multiple security contexts?

> If someone can listen on a name AND set it to core_pattern, most
> likely something worse already happened.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Kuniyuki Iwashima 9 months, 1 week ago

From: Jann Horn <jannh@google.com>
Date: Mon, 5 May 2025 21:55:04 +0200
> On Mon, May 5, 2025 at 9:38 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > From: Jann Horn <jannh@google.com>
> > Date: Mon, 5 May 2025 21:10:28 +0200
> > > On Mon, May 5, 2025 at 8:41 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > > > From: Christian Brauner <brauner@kernel.org>
> > > > Date: Mon, 5 May 2025 16:06:40 +0200
> > > > > On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > > > > > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > > > > Make sure that only tasks that actually coredumped may connect to the
> > > > > > > coredump socket. This restriction may be loosened later in case
> > > > > > > userspace processes would like to use it to generate their own
> > > > > > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > > > > > socket for that.
> > > > > >
> > > > > > This implementation kinda feels a bit fragile to me... I wonder if we
> > > > > > could instead have a flag inside the af_unix client socket that says
> > > > > > "this is a special client socket for coredumping".
> > > > >
> > > > > Should be easily doable with a sock_flag().
> > > >
> > > > This restriction should be applied by BPF LSM.
> > >
> > > I think we shouldn't allow random userspace processes to connect to
> > > the core dump handling service and provide bogus inputs; that
> > > unnecessarily increases the risk that a crafted coredump can be used
> > > to exploit a bug in the service. So I think it makes sense to enforce
> > > this restriction in the kernel.
> >
> > It's already restricted by /proc/sys/kernel/core_pattern.
> > We don't need a duplicated logic.
> 
> The core_pattern does not restrict which processes can call connect()
> on the unix domain socket address.

I misread it, so I added that the LSM can filter the source as well.


> 
> > Even when the process holding the listener dies, you can
> > still avoid such a leak.
> >
> > e.g.
> >
> > 1. Set up a listener
> > 2. Put the socket into a bpf map
> > 3. Attach LSM at connect()
> >
> > Then, the LSM checks if the destination socket is
> 
> Where does the LSM get the destination socket pointer from? The
> socket_connect LSM hook happens before the point where the destination
> socket is looked up. What you have in that hook is the unix socket
> address structure.

The hook is invoked after the lookup.
"sk" is the source and "other" is the destination.

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }


> 
> >   * listening on the name specified in /proc/sys/kernel/core_pattern
> >   * exists in the associated BPF map
> >
> > So, if the socket is dies and a malicious user tries to hijack
> > the core_pattern name, LSM still rejects such connect().
> 
> This patch is not about a malicious user binding to the core dumping
> service's unix domain socket address, that is blocked in "[PATCH RFC
> v3 03/10] net: reserve prefix". This patch is about preventing
> userspace from connect()ing to the legitimate listening socket.

I mean both can be better handled by BPF.


> 
> > Later, the admin can restart the program with different core_pattern.
> >
> >
> > >
> > > My understanding is that BPF LSM creates fairly tight coupling between
> > > userspace and the kernel implementation, and it is kind of unwieldy
> > > for userspace. (I imagine the "man 5 core" manpage would get a bit
> > > longer and describe more kernel implementation detail if you tried to
> > > show how to write a BPF LSM that is capable of detecting unix domain
> > > socket connections to a specific address that are not initiated by
> > > core dumping.) I would like to keep it possible to implement core
> > > userspace functionality in a best-practice way without needing eBPF.
> >
> > I think the untrusted user scenario is paranoia in most cases,
> > and the man page just says "if you really care, use BPF LSM".
> 
> Are you saying that you expect crash dumping services to be written
> with the expectation that the system will not have multiple users or
> multiple security contexts?

Do you mean other programs could use a different LSM?

I think different LSMs can co-exist, see call_int_hook().

I remember BPF LSM was not stackable at some point, but not sure
if it's still true.

I expect the service to use necessary functionality if it's
needed for the specific use case.

Not everyone needs such a restriction, and it's optional.  Why not
allow those who really need it to implement it?


> 
> > If someone can listen on a name AND set it to core_pattern, most
> > likely something worse already happened.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Kuniyuki Iwashima 9 months, 1 week ago

From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 5 May 2025 12:35:50 -0700
> From: Jann Horn <jannh@google.com>
> Date: Mon, 5 May 2025 21:10:28 +0200
> > On Mon, May 5, 2025 at 8:41 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > > From: Christian Brauner <brauner@kernel.org>
> > > Date: Mon, 5 May 2025 16:06:40 +0200
> > > > On Mon, May 05, 2025 at 03:08:07PM +0200, Jann Horn wrote:
> > > > > On Mon, May 5, 2025 at 1:14 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > > > Make sure that only tasks that actually coredumped may connect to the
> > > > > > coredump socket. This restriction may be loosened later in case
> > > > > > userspace processes would like to use it to generate their own
> > > > > > coredumps. Though it'd be wiser if userspace just exposed a separate
> > > > > > socket for that.
> > > > >
> > > > > This implementation kinda feels a bit fragile to me... I wonder if we
> > > > > could instead have a flag inside the af_unix client socket that says
> > > > > "this is a special client socket for coredumping".
> > > >
> > > > Should be easily doable with a sock_flag().
> > >
> > > This restriction should be applied by BPF LSM.
> > 
> > I think we shouldn't allow random userspace processes to connect to
> > the core dump handling service and provide bogus inputs; that
> > unnecessarily increases the risk that a crafted coredump can be used
> > to exploit a bug in the service. So I think it makes sense to enforce
> > this restriction in the kernel.
> 
> It's already restricted by /proc/sys/kernel/core_pattern.
> We don't need a duplicated logic.
> 
> Even when the process holding the listener dies, you can
> still avoid such a leak.
> 
> e.g.
> 
> 1. Set up a listener
> 2. Put the socket into a bpf map
> 3. Attach LSM at connect()
> 
> Then, the LSM checks if the destination socket is
> 
>   * listening on the name specified in /proc/sys/kernel/core_pattern
>   * exists in the associated BPF map

and LSM can check if the source socket is a kernel socket too.


> 
> So, if the socket is dies and a malicious user tries to hijack
> the core_pattern name, LSM still rejects such connect().
> 
> Later, the admin can restart the program with different core_pattern.
> 
> 
> > 
> > My understanding is that BPF LSM creates fairly tight coupling between
> > userspace and the kernel implementation, and it is kind of unwieldy
> > for userspace. (I imagine the "man 5 core" manpage would get a bit
> > longer and describe more kernel implementation detail if you tried to
> > show how to write a BPF LSM that is capable of detecting unix domain
> > socket connections to a specific address that are not initiated by
> > core dumping.) I would like to keep it possible to implement core
> > userspace functionality in a best-practice way without needing eBPF.
> 
> I think the untrusted user scenario is paranoia in most cases,
> and the man page just says "if you really care, use BPF LSM".
> 
> If someone can listen on a name AND set it to core_pattern, most
> likely something worse already happened.
> 
> 
> > 
> > > It's hard to loosen such a default restriction as someone might
> > > argue that's unexpected and regression.
> > 
> > If userspace wants to allow other processes to connect to the core
> > dumping service, that's easy to implement - userspace can listen on a
> > separate address that is not subject to these restrictions.
> >

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Jann Horn 9 months, 1 week ago

On Mon, May 5, 2025 at 9:45 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> and LSM can check if the source socket is a kernel socket too.

("a kernel socket" is not necessarily the same as "a kernel socket
intended for core dumping")

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Christian Brauner 9 months, 1 week ago

> ("a kernel socket" is not necessarily the same as "a kernel socket
> intended for core dumping")

Indeed. The usermodehelper is a kernel protocol. Here it's the task with
its own credentials that's connecting to a userspace socket. Which makes
this very elegant because it's just userspace IPC. No one is running
around with kernel credentials anywhere.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Jann Horn 9 months, 1 week ago

On Tue, May 6, 2025 at 9:39 AM Christian Brauner <brauner@kernel.org> wrote:
> > ("a kernel socket" is not necessarily the same as "a kernel socket
> > intended for core dumping")
>
> Indeed. The usermodehelper is a kernel protocol. Here it's the task with
> its own credentials that's connecting to a userspace socket. Which makes
> this very elegant because it's just userspace IPC. No one is running
> around with kernel credentials anywhere.

To be clear: I think your current patch is using special kernel
privileges in one regard, because kernel_connect() bypasses the
security_socket_connect() security hook. I think it is a good thing
that it bypasses security hooks in this way; I think we wouldn't want
LSMs to get in the way of this special connect(), since the task in
whose context the connect() call happens is not in control of this
connection; the system administrator is the one who decided that this
connect() should happen on core dumps. It is kind of inconsistent
though that that separate security_unix_stream_connect() LSM hook will
still be invoked in this case, and we might have to watch out to make
sure that LSMs won't end up blocking such connections... which I think
is related to what Mickael was saying on the other thread. Landlock
currently doesn't filter abstract connections at that hook, so for now
this would only be relevant for SELinux and Smack. I guess those are
maybe less problematic in this regard because they work on full-system
policies rather than app-specific policies; but still, with the
current implementation, SELinux/Smack policies would need to be
designed to allow processes to connect to the core dumping socket to
make core dumping work.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Mickaël Salaün 9 months, 1 week ago

On Tue, May 06, 2025 at 04:51:25PM +0200, Jann Horn wrote:
> On Tue, May 6, 2025 at 9:39 AM Christian Brauner <brauner@kernel.org> wrote:
> > > ("a kernel socket" is not necessarily the same as "a kernel socket
> > > intended for core dumping")
> >
> > Indeed. The usermodehelper is a kernel protocol. Here it's the task with
> > its own credentials that's connecting to a userspace socket. Which makes
> > this very elegant because it's just userspace IPC. No one is running
> > around with kernel credentials anywhere.
> 
> To be clear: I think your current patch is using special kernel
> privileges in one regard, because kernel_connect() bypasses the
> security_socket_connect() security hook. I think it is a good thing
> that it bypasses security hooks in this way; I think we wouldn't want
> LSMs to get in the way of this special connect(), since the task in
> whose context the connect() call happens is not in control of this
> connection; the system administrator is the one who decided that this
> connect() should happen on core dumps. It is kind of inconsistent
> though that that separate security_unix_stream_connect() LSM hook will
> still be invoked in this case, and we might have to watch out to make
> sure that LSMs won't end up blocking such connections... which I think
> is related to what Mickael was saying on the other thread.

Right

> Landlock
> currently doesn't filter abstract connections at that hook, so for now

Landlock has implemented this hook since Linux 6.12 and can deny connections
from a sandboxed process to a peer outside the sandbox:
https://docs.kernel.org/userspace-api/landlock.html#ipc-scoping
I was worried that security_unix_stream_connect() would be called with
the task's credential, which would block coredumps from sandboxed tasks.
This would also apply to other LSMs.

> this would only be relevant for SELinux and Smack. I guess those are
> maybe less problematic in this regard because they work on full-system
> policies rather than app-specific policies; but still, with the
> current implementation, SELinux/Smack policies would need to be
> designed to allow processes to connect to the core dumping socket to
> make core dumping work.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Christian Brauner 9 months, 1 week ago

On Tue, May 06, 2025 at 04:51:25PM +0200, Jann Horn wrote:
> On Tue, May 6, 2025 at 9:39 AM Christian Brauner <brauner@kernel.org> wrote:
> > > ("a kernel socket" is not necessarily the same as "a kernel socket
> > > intended for core dumping")
> >
> > Indeed. The usermodehelper is a kernel protocol. Here it's the task with
> > its own credentials that's connecting to a userspace socket. Which makes
> > this very elegant because it's just userspace IPC. No one is running
> > around with kernel credentials anywhere.
> 
> To be clear: I think your current patch is using special kernel
> privileges in one regard, because kernel_connect() bypasses the
> security_socket_connect() security hook. I think it is a good thing
> that it bypasses security hooks in this way; I think we wouldn't want
> LSMs to get in the way of this special connect(), since the task in
> whose context the connect() call happens is not in control of this
> connection; the system administrator is the one who decided that this
> connect() should happen on core dumps. It is kind of inconsistent
> though that that separate security_unix_stream_connect() LSM hook will
> still be invoked in this case, and we might have to watch out to make
> sure that LSMs won't end up blocking such connections... which I think

Right, it is the same as for the usermode helper. It calls
kernel_execve() which invokes at least security_bprm_creds_for_exec()
and security_bprm_check(). Both of which can be used to make the
usermodehelper execve fail.

Fwiw, it's even the case for dumping directly to a file as in that case
it's subject to all kinds of lookup and open security hooks like
security_file_open() and then another round in do_truncate().

All of that happens fully in the task's context as well via
file_open()/file_open_root() or do_truncate().

So there's nothing special here.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Kuniyuki Iwashima 9 months, 1 week ago

From: Christian Brauner <brauner@kernel.org>
Date: Tue, 6 May 2025 17:16:13 +0200
> On Tue, May 06, 2025 at 04:51:25PM +0200, Jann Horn wrote:
> > On Tue, May 6, 2025 at 9:39 AM Christian Brauner <brauner@kernel.org> wrote:
> > > > ("a kernel socket" is not necessarily the same as "a kernel socket
> > > > intended for core dumping")
> > >
> > > Indeed. The usermodehelper is a kernel protocol. Here it's the task with
> > > its own credentials that's connecting to a userspace socket. Which makes
> > > this very elegant because it's just userspace IPC. No one is running
> > > around with kernel credentials anywhere.
> > 
> > To be clear: I think your current patch is using special kernel
> > privileges in one regard, because kernel_connect() bypasses the
> > security_socket_connect() security hook.

Precisely, whether an LSM ignores kernel sockets or not depends on the LSM.

When we create a socket, kern=0/1 is passed to security_socket_create().
Currently, SELinux always ignores the kernel socket, and AppArmor depends
on another condition.  Other LSMs don't care.  In particular, BPF LSM is
just a set of functions to attach BPF programs, so it can enforce whatever.


> I think it is a good thing
> > that it bypasses security hooks in this way; I think we wouldn't want
> > LSMs to get in the way of this special connect(), since the task in
> > whose context the connect() call happens is not in control of this
> > connection; the system administrator is the one who decided that this
> > connect() should happen on core dumps. It is kind of inconsistent
> > though that that separate security_unix_stream_connect() LSM hook will
> > still be invoked in this case, and we might have to watch out to make
> > sure that LSMs won't end up blocking such connections... which I think
> 
> Right, it is the same as for the usermode helper. It calls
> kernel_execve() which invokes at least security_bprm_creds_for_exec()
> and security_bprm_check(). Both of which can be used to make the
> usermodehelper execve fail.
> 
> Fwiw, it's even the case for dumping directly to a file as in that case
> it's subject to all kinds of lookup and open security hooks like
> security_file_open() and then another round in do_truncate().
> 
> All of that happens fully in the task's context as well via
> file_open()/file_open_root() or do_truncate().
> 
> So there's nothing special here.

Re: [PATCH RFC v3 08/10] net, pidfs, coredump: only allow coredumping tasks to connect to coredump socket

Posted by Kuniyuki Iwashima 9 months, 1 week ago

From: Jann Horn <jannh@google.com>
Date: Mon, 5 May 2025 21:55:06 +0200
> On Mon, May 5, 2025 at 9:45 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > and LSM can check if the source socket is a kernel socket too.
> 
> ("a kernel socket" is not necessarily the same as "a kernel socket
> intended for core dumping")

Yes, but why do we need to care about it :)

Either it doesn't happen, or it's an out-of-tree driver that is out of
our control but should be under control on the host where the service is
running.