From nobody Sun Feb  8 16:31:23 2026
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 83CCB25DCE4;
	Thu, 24 Apr 2025 12:25:22 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=10.30.226.201
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1745497522; cv=none;
 b=b4D0BaWj4ERkukoutW7imnMrfTUTLkUMqPGuUmRohhsd1HTYvWdzkhoU6yqt+JmDXM1VW5kGnKkr6N6jiIp7BknEoBZUK/PnYeuJn5mD3+BnExbTUG+YN4MD6wMGRJJ8eiEycjuI7CeJZez+RR0NUi6yJj6Uo1OyN7baBNgrOZw=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1745497522; c=relaxed/simple;
	bh=BAR2mPEdQmLCpUNaicjcrIVZW8pnh6HvsXslZl4NsA4=;
	h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References:
	 In-Reply-To:To:Cc;
 b=LGcBrs0zjxf8x5O1V8XeAOQ7PDxUnwERh2P5Wt7gdhDx/bfNg0K7Xsl11s4RGiJhCou8NjgdEeWYvPgicjFjaBENSz8Su2MQeQL63RauzHo+n3t/UyCLi825qOxI43JQHWPWeQeaUatRFwr2c1w3GemACIt2ql5PY60vi/olPc4=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b=XvB/hpcv; arc=none smtp.client-ip=10.30.226.201
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="XvB/hpcv"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 4186AC4CEE3;
	Thu, 24 Apr 2025 12:25:18 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1745497522;
	bh=BAR2mPEdQmLCpUNaicjcrIVZW8pnh6HvsXslZl4NsA4=;
	h=From:Date:Subject:References:In-Reply-To:To:Cc:From;
	b=XvB/hpcvs4PfU9v42lLjOJdW3kpZK1j1TMROnLbJq64GRoLVUpaRLm5VHGmvCzc70
	 lAbEx5+G3Abvc81v2gPDoPTOtcuZxeNQR/jVfU9fr9KIAFTsbd+WLl/Ba81FRmHl40
	 9yTYxeDkAabxaCwMLlwnWa14pKuRB7I2f/seiuEjUj2HZLzx+G5A3+N1x0WTa1EsTe
	 6ucCtdd0G8s82ZF8f3wlS0KPvnhh/OCyEOjYSOhrTUDGiXMNYeWzoDDWmsUdoKlo6c
	 7CwQmDxZfW71xucG6W5Zw9TpWbWFUHsiZDdR7jZORuL897CT11ANTKHRr6h7LoI7zV
	 LEP80n+bdHReA==
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 24 Apr 2025 14:24:35 +0200
Subject: [PATCH RFC 2/4] net, pidfs: prepare for handing out pidfds for
 reaped sk->sk_peer_pid
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable
Message-Id: <20250424-work-pidfs-net-v1-2-0dc97227d854@kernel.org>
References: <20250424-work-pidfs-net-v1-0-0dc97227d854@kernel.org>
In-Reply-To: <20250424-work-pidfs-net-v1-0-0dc97227d854@kernel.org>
To: Oleg Nesterov <oleg@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>,
 "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>,
 Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
 Simon Horman <horms@kernel.org>
Cc: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
 netdev@vger.kernel.org, David Rheinsberg <david@readahead.eu>,
 Jan Kara <jack@suse.cz>, Alexander Mikhalitsyn <alexander@mihalicyn.com>,
 Luca Boccassi <bluca@debian.org>,
 Lennart Poettering <lennart@poettering.net>,
 Daan De Meyer <daan.j.demeyer@gmail.com>, Mike Yuan <me@yhndnzj.com>,
 Christian Brauner <brauner@kernel.org>
X-Mailer: b4 0.15-dev-c25d1
X-Developer-Signature: v=1; a=openpgp-sha256; l=9059; i=brauner@kernel.org;
 h=from:subject:message-id; bh=BAR2mPEdQmLCpUNaicjcrIVZW8pnh6HvsXslZl4NsA4=;
 b=owGbwMvMwCU28Zj0gdSKO4sYT6slMWRw6S7lXeDqu/WB+cyNTxbvWt9y/Mzf/1v+p35wmGj3I
 r+54O/rOR2lLAxiXAyyYoosDu0m4XLLeSo2G2VqwMxhZQIZwsDFKQATcX3K8N/1ZsBOpddXQtfL
 uEpvlNxWHvf0ge2tT7EyehcTUtPeTuBhZDixZCljUEXQxOc8xV++qm7rWv9W6cps1eyr/AmSp3a
 85uYGAA==
X-Developer-Key: i=brauner@kernel.org; a=openpgp;
 fpr=4880B8C9BD0E5106FC070F4F7B3C391EFEA93624

SO_PEERPIDFD currently doesn't support handing out pidfds if the
sk->sk_peer_pid thread-group leader has already been reaped. In this
case it currently returns EINVAL. Userspace still wants to get a pidfd
for a reaped process to have a stable handle it can pass on.
This is especially useful now that it is possible to retrieve exit
information through a pidfd via the PIDFD_GET_INFO ioctl()'s
PIDFD_INFO_EXIT flag.

Another summary has been provided by David in [1]:

> A pidfd can outlive the task it refers to, and thus user-space must
> already be prepared that the task underlying a pidfd is gone at the time
> they get their hands on the pidfd. For instance, resolving the pidfd to
> a PID via the fdinfo must be prepared to read `-1`.
>
> Despite user-space knowing that a pidfd might be stale, several kernel
> APIs currently add another layer that checks for this. In particular,
> SO_PEERPIDFD returns `EINVAL` if the peer-task was already reaped,
> but returns a stale pidfd if the task is reaped immediately after the
> respective alive-check.
>
> This has the unfortunate effect that user-space now has two ways to
> check for the exact same scenario: A syscall might return
> EINVAL/ESRCH/... *or* the pidfd might be stale, even though there is no
> particular reason to distinguish both cases. This also propagates
> through user-space APIs, which pass on pidfds. They must be prepared to
> pass on `-1` *or* the pidfd, because there is no guaranteed way to get a
> stale pidfd from the kernel.
> Userspace must already deal with a pidfd referring to a reaped task as
> the task may exit and get reaped at any time while there are still many
> pidfds referring to it.

In order to allow handing out pidfds for reaped tasks, SO_PEERPIDFD
needs to ensure that PIDFD_INFO_EXIT information is available whenever a
pidfd for a reaped task is created by SO_PEERPIDFD. The uapi promises
that reaped pidfds are only handed out if it is guaranteed that the
caller sees the exit information:

TEST_F(pidfd_info, success_reaped)
{
        struct pidfd_info info =3D {
                .mask =3D PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT,
        };

        /*
         * Process has already been reaped and PIDFD_INFO_EXIT been set.
         * Verify that we can retrieve the exit status of the process.
         */
        ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0);
        ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS));
        ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT));
        ASSERT_TRUE(WIFEXITED(info.exit_code));
        ASSERT_EQ(WEXITSTATUS(info.exit_code), 0);
}

To hand out pidfds for reaped processes we thus allocate a pidfs entry
for the relevant sk->sk_peer_pid at the time the sk->sk_peer_pid is
stashed and drop it when the socket is destroyed. This guarantees that
exit information will always be recorded for the sk->sk_peer_pid task
and we can hand out pidfds for reaped processes.

Link: https://lore.kernel.org/lkml/20230807085203.819772-1-david@readahead.=
eu [1]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 net/unix/af_unix.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++---=
----
 1 file changed, 79 insertions(+), 11 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f78a2492826f..83b5aebf499e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -100,6 +100,7 @@
 #include <linux/splice.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
+#include <linux/pidfs.h>
 #include <net/af_unix.h>
 #include <net/net_namespace.h>
 #include <net/scm.h>
@@ -643,6 +644,14 @@ static void unix_sock_destructor(struct sock *sk)
 		return;
 	}
=20
+	if (sock_flag(sk, SOCK_RCU_FREE)) {
+		pr_info("Attempting to release RCU protected socket with sleeping locks:=
 %p\n", sk);
+		return;
+	}
+
+	if (sk->sk_peer_pid)
+		pidfs_put_pid(sk->sk_peer_pid);
+
 	if (u->addr)
 		unix_release_addr(u->addr);
=20
@@ -734,13 +743,48 @@ static void unix_release_sock(struct sock *sk, int em=
brion)
 		unix_gc();		/* Garbage collect fds */
 }
=20
-static void init_peercred(struct sock *sk)
+struct af_unix_peercred {
+	struct pid *peer_pid;
+	const struct cred *peer_cred;
+};
+
+static inline int prepare_peercred(struct af_unix_peercred *peercred)
+{
+	struct pid *pid;
+	int err;
+
+	pid =3D task_tgid(current);
+	err =3D pidfs_register_pid(pid);
+	if (likely(!err)) {
+		peercred->peer_pid =3D get_pid(pid);
+		peercred->peer_cred =3D get_current_cred();
+	}
+	return err;
+}
+
+static void drop_peercred(struct af_unix_peercred *peercred)
+{
+	struct pid *pid =3D NULL;
+	const struct cred *cred =3D NULL;
+
+	might_sleep();
+
+	swap(peercred->peer_pid, pid);
+	swap(peercred->peer_cred, cred);
+
+	pidfs_put_pid(pid);
+	put_pid(pid);
+	put_cred(cred);
+}
+
+static inline void init_peercred(struct sock *sk,
+				 const struct af_unix_peercred *peercred)
 {
-	sk->sk_peer_pid =3D get_pid(task_tgid(current));
-	sk->sk_peer_cred =3D get_current_cred();
+	sk->sk_peer_pid =3D peercred->peer_pid;
+	sk->sk_peer_cred =3D peercred->peer_cred;
 }
=20
-static void update_peercred(struct sock *sk)
+static void update_peercred(struct sock *sk, struct af_unix_peercred *peer=
cred)
 {
 	const struct cred *old_cred;
 	struct pid *old_pid;
@@ -748,11 +792,11 @@ static void update_peercred(struct sock *sk)
 	spin_lock(&sk->sk_peer_lock);
 	old_pid =3D sk->sk_peer_pid;
 	old_cred =3D sk->sk_peer_cred;
-	init_peercred(sk);
+	init_peercred(sk, peercred);
 	spin_unlock(&sk->sk_peer_lock);
=20
-	put_pid(old_pid);
-	put_cred(old_cred);
+	peercred->peer_pid =3D old_pid;
+	peercred->peer_cred =3D old_cred;
 }
=20
 static void copy_peercred(struct sock *sk, struct sock *peersk)
@@ -761,6 +805,7 @@ static void copy_peercred(struct sock *sk, struct sock =
*peersk)
=20
 	spin_lock(&sk->sk_peer_lock);
 	sk->sk_peer_pid =3D get_pid(peersk->sk_peer_pid);
+	pidfs_get_pid(sk->sk_peer_pid);
 	sk->sk_peer_cred =3D get_cred(peersk->sk_peer_cred);
 	spin_unlock(&sk->sk_peer_lock);
 }
@@ -770,6 +815,7 @@ static int unix_listen(struct socket *sock, int backlog)
 	int err;
 	struct sock *sk =3D sock->sk;
 	struct unix_sock *u =3D unix_sk(sk);
+	struct af_unix_peercred peercred =3D {};
=20
 	err =3D -EOPNOTSUPP;
 	if (sock->type !=3D SOCK_STREAM && sock->type !=3D SOCK_SEQPACKET)
@@ -777,6 +823,9 @@ static int unix_listen(struct socket *sock, int backlog)
 	err =3D -EINVAL;
 	if (!READ_ONCE(u->addr))
 		goto out;	/* No listens on an unbound socket */
+	err =3D prepare_peercred(&peercred);
+	if (err)
+		goto out;
 	unix_state_lock(sk);
 	if (sk->sk_state !=3D TCP_CLOSE && sk->sk_state !=3D TCP_LISTEN)
 		goto out_unlock;
@@ -786,11 +835,12 @@ static int unix_listen(struct socket *sock, int backl=
og)
 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
=20
 	/* set credentials so connect can copy them */
-	update_peercred(sk);
+	update_peercred(sk, &peercred);
 	err =3D 0;
=20
 out_unlock:
 	unix_state_unlock(sk);
+	drop_peercred(&peercred);
 out:
 	return err;
 }
@@ -1525,6 +1575,7 @@ static int unix_stream_connect(struct socket *sock, s=
truct sockaddr *uaddr,
 	struct sockaddr_un *sunaddr =3D (struct sockaddr_un *)uaddr;
 	struct sock *sk =3D sock->sk, *newsk =3D NULL, *other =3D NULL;
 	struct unix_sock *u =3D unix_sk(sk), *newu, *otheru;
+	struct af_unix_peercred peercred =3D {};
 	struct net *net =3D sock_net(sk);
 	struct sk_buff *skb =3D NULL;
 	unsigned char state;
@@ -1561,6 +1612,10 @@ static int unix_stream_connect(struct socket *sock, =
struct sockaddr *uaddr,
 		goto out;
 	}
=20
+	err =3D prepare_peercred(&peercred);
+	if (err)
+		goto out;
+
 	/* Allocate skb for sending to listening sock */
 	skb =3D sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
 	if (!skb) {
@@ -1636,7 +1691,7 @@ static int unix_stream_connect(struct socket *sock, s=
truct sockaddr *uaddr,
 	unix_peer(newsk)	=3D sk;
 	newsk->sk_state		=3D TCP_ESTABLISHED;
 	newsk->sk_type		=3D sk->sk_type;
-	init_peercred(newsk);
+	init_peercred(newsk, &peercred);
 	newu =3D unix_sk(newsk);
 	newu->listener =3D other;
 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
@@ -1695,20 +1750,33 @@ static int unix_stream_connect(struct socket *sock,=
 struct sockaddr *uaddr,
 out_free_sk:
 	unix_release_sock(newsk, 0);
 out:
+	drop_peercred(&peercred);
 	return err;
 }
=20
 static int unix_socketpair(struct socket *socka, struct socket *sockb)
 {
+	struct af_unix_peercred ska_peercred =3D {}, skb_peercred =3D {};
 	struct sock *ska =3D socka->sk, *skb =3D sockb->sk;
+	int err;
+
+	err =3D prepare_peercred(&ska_peercred);
+	if (err)
+		return err;
+
+	err =3D prepare_peercred(&skb_peercred);
+	if (err) {
+		drop_peercred(&ska_peercred);
+		return err;
+	}
=20
 	/* Join our sockets back to back */
 	sock_hold(ska);
 	sock_hold(skb);
 	unix_peer(ska) =3D skb;
 	unix_peer(skb) =3D ska;
-	init_peercred(ska);
-	init_peercred(skb);
+	init_peercred(ska, &ska_peercred);
+	init_peercred(skb, &skb_peercred);
=20
 	ska->sk_state =3D TCP_ESTABLISHED;
 	skb->sk_state =3D TCP_ESTABLISHED;

--=20
2.47.2