fs/pidfs.c | 78 +++++++++++++++++- include/uapi/linux/pidfd.h | 24 ++++++ .../testing/selftests/pidfd/pidfd_open_test.c | 81 ++++++++++++++++++- 3 files changed, 179 insertions(+), 4 deletions(-)
From: Luca Boccassi <luca.boccassi@gmail.com>
A common pattern when using pid fds is having to get information
about the process, which currently requires /proc being mounted,
resolving the fd to a pid, and then do manual string parsing of
/proc/N/status and friends. This needs to be reimplemented over
and over in all userspace projects (e.g.: I have reimplemented
resolving in systemd, dbus, dbus-daemon, polkit so far), and
requires additional care in checking that the fd is still valid
after having parsed the data, to avoid races.
Having a programmatic API that can be used directly removes all
these requirements, including having /proc mounted.
As discussed at LPC24, add an ioctl with an extensible struct
so that more parameters can be added later if needed. Start with
returning pid/tgid/ppid and creds unconditionally, and cgroupid
optionally.
Signed-off-by: Luca Boccassi <luca.boccassi@gmail.com>
---
v5: check again that the task hasn't exited immediately before copying
the result out to userspace, to ensure we are not returning stale data
add an ifdef around the cgroup structs usage to fix build errors when
the feature is disabled
v4: fix arg check in pidfd_ioctl() by moving it after the new call
v3: switch from pid_vnr() to task_pid_vnr()
v2: Apply comments from Christian, apart from the one about pid namespaces
as I need additional hints on how to implement it.
Drop the security_context string as it is not the appropriate
metadata to give userspace these days.
fs/pidfs.c | 78 +++++++++++++++++-
include/uapi/linux/pidfd.h | 24 ++++++
.../testing/selftests/pidfd/pidfd_open_test.c | 81 ++++++++++++++++++-
3 files changed, 179 insertions(+), 4 deletions(-)
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 7ffdc88dfb52..47868c3baf5f 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -2,6 +2,7 @@
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/cgroup.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pid.h>
@@ -114,6 +115,73 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
return poll_flags;
}
+static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+ struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+ size_t usize = _IOC_SIZE(cmd);
+ struct pidfd_info kinfo = {};
+ struct user_namespace *user_ns;
+ const struct cred *c;
+ __u64 request_mask;
+
+ if (!uinfo)
+ return -EINVAL;
+ if (usize < sizeof(struct pidfd_info))
+ return -EINVAL; /* First version, no smaller struct possible */
+
+ if (copy_from_user(&request_mask, &uinfo->request_mask, sizeof(request_mask)))
+ return -EFAULT;
+
+ c = get_task_cred(task);
+ if (!c)
+ return -ESRCH;
+
+ /* Unconditionally return identifiers and credentials, the rest only on request */
+
+ kinfo.pid = task_pid_vnr(task);
+ kinfo.tgid = task_tgid_vnr(task);
+ kinfo.ppid = task_ppid_nr_ns(task, task_active_pid_ns(task));
+
+ user_ns = current_user_ns();
+ kinfo.ruid = from_kuid_munged(user_ns, c->uid);
+ kinfo.rgid = from_kgid_munged(user_ns, c->gid);
+ kinfo.euid = from_kuid_munged(user_ns, c->euid);
+ kinfo.egid = from_kgid_munged(user_ns, c->egid);
+ kinfo.suid = from_kuid_munged(user_ns, c->suid);
+ kinfo.sgid = from_kgid_munged(user_ns, c->sgid);
+ kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid);
+ kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid);
+
+#ifdef CONFIG_CGROUPS
+ if (request_mask & PIDFD_INFO_CGROUPID) {
+ struct cgroup *cgrp = task_css_check(task, pids_cgrp_id, 1)->cgroup;
+ if (!cgrp)
+ return -ENODEV;
+
+ kinfo.cgroupid = cgroup_id(cgrp);
+ kinfo.result_mask |= PIDFD_INFO_CGROUPID;
+ }
+#endif
+
+ /*
+ * One last check before returning: the task might have exited while we
+ * were fetching all the data, so check again to be safe. */
+ if (task_pid_vnr(task) == 0)
+ return -ESRCH;
+
+ /*
+ * If userspace and the kernel have the same struct size it can just
+ * be copied. If userspace provides an older struct, only the bits that
+ * userspace knows about will be copied. If userspace provides a new
+ * struct, only the bits that the kernel knows about will be copied and
+ * the size value will be set to the size the kernel knows about.
+ */
+ if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
+ return -EFAULT;
+
+ return 0;
+}
+
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
@@ -121,13 +189,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct pid *pid = pidfd_pid(file);
struct ns_common *ns_common = NULL;
- if (arg)
- return -EINVAL;
-
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
+ /* Extensible IOCTL that does not open namespace FDs, take a shortcut */
+ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
+ return pidfd_info(task, cmd, arg);
+
+ if (arg)
+ return -EINVAL;
+
scoped_guard(task_lock, task) {
nsp = task->nsproxy;
if (nsp)
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 565fc0629fff..f278db1a3fe8 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -16,6 +16,29 @@
#define PIDFD_SIGNAL_THREAD_GROUP (1UL << 1)
#define PIDFD_SIGNAL_PROCESS_GROUP (1UL << 2)
+/* Flags for pidfd_info. */
+#define PIDFD_INFO_CGROUPID (1UL << 0)
+
+struct pidfd_info {
+ /* Let userspace request expensive stuff explictly. */
+ __u64 request_mask;
+ /* And let the kernel indicate whether it knows about it. */
+ __u64 result_mask;
+ __u64 cgroupid;
+ __u32 pid;
+ __u32 tgid;
+ __u32 ppid;
+ __u32 ruid;
+ __u32 rgid;
+ __u32 euid;
+ __u32 egid;
+ __u32 suid;
+ __u32 sgid;
+ __u32 fsuid;
+ __u32 fsgid;
+ __u32 spare0[1];
+};
+
#define PIDFS_IOCTL_MAGIC 0xFF
#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1)
@@ -28,5 +51,6 @@
#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8)
#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9)
#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10)
+#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info)
#endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c
index c62564c264b1..30c50a8ae10b 100644
--- a/tools/testing/selftests/pidfd/pidfd_open_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_open_test.c
@@ -13,6 +13,7 @@
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
+#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/wait.h>
@@ -21,6 +22,35 @@
#include "pidfd.h"
#include "../kselftest.h"
+#ifndef PIDFS_IOCTL_MAGIC
+#define PIDFS_IOCTL_MAGIC 0xFF
+#endif
+
+#ifndef PIDFD_GET_INFO
+#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info)
+#define PIDFD_INFO_CGROUPID (1UL << 0)
+
+struct pidfd_info {
+ /* Let userspace request expensive stuff explictly. */
+ __u64 request_mask;
+ /* And let the kernel indicate whether it knows about it. */
+ __u64 result_mask;
+ __u64 cgroupid;
+ __u32 pid;
+ __u32 tgid;
+ __u32 ppid;
+ __u32 ruid;
+ __u32 rgid;
+ __u32 euid;
+ __u32 egid;
+ __u32 suid;
+ __u32 sgid;
+ __u32 fsuid;
+ __u32 fsgid;
+ __u32 spare0[1];
+};
+#endif
+
static int safe_int(const char *numstr, int *converted)
{
char *err = NULL;
@@ -120,10 +150,13 @@ static pid_t get_pid_from_fdinfo_file(int pidfd, const char *key, size_t keylen)
int main(int argc, char **argv)
{
+ struct pidfd_info info = {
+ .request_mask = PIDFD_INFO_CGROUPID,
+ };
int pidfd = -1, ret = 1;
pid_t pid;
- ksft_set_plan(3);
+ ksft_set_plan(4);
pidfd = sys_pidfd_open(-1, 0);
if (pidfd >= 0) {
@@ -153,6 +186,52 @@ int main(int argc, char **argv)
pid = get_pid_from_fdinfo_file(pidfd, "Pid:", sizeof("Pid:") - 1);
ksft_print_msg("pidfd %d refers to process with pid %d\n", pidfd, pid);
+ if (ioctl(pidfd, PIDFD_GET_INFO, &info) < 0) {
+ ksft_print_msg("%s - failed to get info from pidfd\n", strerror(errno));
+ goto on_error;
+ }
+ if (info.pid != pid) {
+ ksft_print_msg("pid from fdinfo file %d does not match pid from ioctl %d\n",
+ pid, info.pid);
+ goto on_error;
+ }
+ if (info.ppid != getppid()) {
+ ksft_print_msg("ppid %d does not match ppid from ioctl %d\n",
+ pid, info.pid);
+ goto on_error;
+ }
+ if (info.ruid != getuid()) {
+ ksft_print_msg("uid %d does not match uid from ioctl %d\n",
+ getuid(), info.ruid);
+ goto on_error;
+ }
+ if (info.rgid != getgid()) {
+ ksft_print_msg("gid %d does not match gid from ioctl %d\n",
+ getgid(), info.rgid);
+ goto on_error;
+ }
+ if (info.euid != geteuid()) {
+ ksft_print_msg("euid %d does not match euid from ioctl %d\n",
+ geteuid(), info.euid);
+ goto on_error;
+ }
+ if (info.egid != getegid()) {
+ ksft_print_msg("egid %d does not match egid from ioctl %d\n",
+ getegid(), info.egid);
+ goto on_error;
+ }
+ if (info.suid != geteuid()) {
+ ksft_print_msg("suid %d does not match suid from ioctl %d\n",
+ geteuid(), info.suid);
+ goto on_error;
+ }
+ if (info.sgid != getegid()) {
+ ksft_print_msg("sgid %d does not match sgid from ioctl %d\n",
+ getegid(), info.sgid);
+ goto on_error;
+ }
+ ksft_test_result_pass("get info from pidfd test: passed\n");
+
ret = 0;
on_error:
base-commit: 9852d85ec9d492ebef56dc5f229416c925758edc
prerequisite-patch-id: 45816348e644743f5a8bb09432fdbf6f2178a1c9
prerequisite-patch-id: 73310122c17be3fb2d53df26a9a0cb0beb5bb944
prerequisite-patch-id: 780fb5316cd3fd1bc110acb73985e82bc650dc46
prerequisite-patch-id: a4c617e4f5cc53939d7eaa5e35910f36ac181f2e
prerequisite-patch-id: cb4ea39b845078ff57ab562c58f22cc7e86a8690
--
2.45.2
On 10/06, luca.boccassi@gmail.com wrote: > > +#ifdef CONFIG_CGROUPS > + if (request_mask & PIDFD_INFO_CGROUPID) { Should we nack other bits in request_mask? > + struct cgroup *cgrp = task_css_check(task, pids_cgrp_id, 1)->cgroup; Well, the last "1" argument just fools CONFIG_PROVE_RCU, right? It seems you need rcu_read_lock() + task_cgroup(pids_cgrp_id)... Oleg.
On 10/06, luca.boccassi@gmail.com wrote: > > +static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) > +{ > + struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; > + size_t usize = _IOC_SIZE(cmd); > + struct pidfd_info kinfo = {}; > + struct user_namespace *user_ns; > + const struct cred *c; > + __u64 request_mask; > + > + if (!uinfo) > + return -EINVAL; > + if (usize < sizeof(struct pidfd_info)) > + return -EINVAL; /* First version, no smaller struct possible */ > + > + if (copy_from_user(&request_mask, &uinfo->request_mask, sizeof(request_mask))) > + return -EFAULT; > + > + c = get_task_cred(task); > + if (!c) > + return -ESRCH; > + > + /* Unconditionally return identifiers and credentials, the rest only on request */ > + > + kinfo.pid = task_pid_vnr(task); > + kinfo.tgid = task_tgid_vnr(task); > + kinfo.ppid = task_ppid_nr_ns(task, task_active_pid_ns(task)); ^^^^^^^^^^^^^^^^^^^^^^^^ The same problem as with "info.pid = pid_nr_ns(pid, task_active_pid_ns(task));" you used before. You should use the caller's namespace, not the task's namespace. kinfo.ppid = task_ppid_nr_ns(task, NULL); should work, see __task_pid_nr_ns() which uses task_active_pid_ns(current) if ns == NULL. > + /* > + * One last check before returning: the task might have exited while we > + * were fetching all the data, so check again to be safe. */ > + if (task_pid_vnr(task) == 0) > + return -ESRCH; Well, this looks strange. It would be better to kill "kinfo.pid = task_pid_vnr()" above and do kinfo.pid = task_pid_vnr(task); if (!kinfo.pid) return -ESRCH; at the end, but this is minor. I don't think we can rely on this check. Suppose that pidfd_info() runs on CPU_0 and it races with __unhash_process(task) on CPU_1 which does detach_pid(p, PIDTYPE_PID); detach_pid(p, PIDTYPE_TGID); Without the barries/locking CPU_0 can see the changes above out of order, so it is possible that pidfd_info() will see task_pid_vnr(task) != 0, but report kinfo.tgid == 0. Oleg.
On Sun, 6 Oct 2024 at 18:22, Oleg Nesterov <oleg@redhat.com> wrote: > > On 10/06, luca.boccassi@gmail.com wrote: > > > > +static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) > > +{ > > + struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; > > + size_t usize = _IOC_SIZE(cmd); > > + struct pidfd_info kinfo = {}; > > + struct user_namespace *user_ns; > > + const struct cred *c; > > + __u64 request_mask; > > + > > + if (!uinfo) > > + return -EINVAL; > > + if (usize < sizeof(struct pidfd_info)) > > + return -EINVAL; /* First version, no smaller struct possible */ > > + > > + if (copy_from_user(&request_mask, &uinfo->request_mask, sizeof(request_mask))) > > + return -EFAULT; > > + > > + c = get_task_cred(task); > > + if (!c) > > + return -ESRCH; > > + > > + /* Unconditionally return identifiers and credentials, the rest only on request */ > > + > > + kinfo.pid = task_pid_vnr(task); > > + kinfo.tgid = task_tgid_vnr(task); > > + kinfo.ppid = task_ppid_nr_ns(task, task_active_pid_ns(task)); > ^^^^^^^^^^^^^^^^^^^^^^^^ > The same problem as with "info.pid = pid_nr_ns(pid, task_active_pid_ns(task));" > you used before. You should use the caller's namespace, not the task's namespace. > > kinfo.ppid = task_ppid_nr_ns(task, NULL); > > should work, see __task_pid_nr_ns() which uses task_active_pid_ns(current) if > ns == NULL. > > > + /* > > + * One last check before returning: the task might have exited while we > > + * were fetching all the data, so check again to be safe. */ > > + if (task_pid_vnr(task) == 0) > > + return -ESRCH; > > Well, this looks strange. It would be better to kill "kinfo.pid = task_pid_vnr()" > above and do > > kinfo.pid = task_pid_vnr(task); > if (!kinfo.pid) > return -ESRCH; > > at the end, but this is minor. > > I don't think we can rely on this check. > > Suppose that pidfd_info() runs on CPU_0 and it races with __unhash_process(task) > on CPU_1 which does > > detach_pid(p, PIDTYPE_PID); > detach_pid(p, PIDTYPE_TGID); > > Without the barries/locking CPU_0 can see the changes above out of order, > so it is possible that pidfd_info() will see task_pid_vnr(task) != 0, but > report kinfo.tgid == 0. > > Oleg. I see, so what should I do here then? Check both? Or none? The caller needs to verify that the data is still valid at the point they use it anyway, so this helps but provides no guarantees anyway
On 10/06, Luca Boccassi wrote: > > I see, so what should I do here then? Check both? Or none? I don't know, because I don't know how are you going to use this API. > The caller > needs to verify that the data is still valid at the point they use it > anyway, So "none" should work fine? Just it should be documented that, say, kinfo.pid can be 0 if we race with the exiting task. Just in case, you can also use lock_task_sighand() || return -ESRCH, this way kinfo.*pid can't be zero. But I don't think this will buy too much, the task can exit right after pidfd_info() returns. Oleg.
On Sun, 6 Oct 2024 at 19:56, Oleg Nesterov <oleg@redhat.com> wrote: > > On 10/06, Luca Boccassi wrote: > > > > I see, so what should I do here then? Check both? Or none? > > I don't know, because I don't know how are you going to use this API. > > > The caller > > needs to verify that the data is still valid at the point they use it > > anyway, > > So "none" should work fine? Just it should be documented that, say, > kinfo.pid can be 0 if we race with the exiting task. > > Just in case, you can also use lock_task_sighand() || return -ESRCH, > this way kinfo.*pid can't be zero. But I don't think this will buy too > much, the task can exit right after pidfd_info() returns. > > Oleg. I think what we should do is check if any of those fields was set to 0, and return ESRCH in that case. This way either we return a full set of metadata that was correct at the time it was taken, or we provide nothing and a clear error. We cannot solve the race as you mentioned, but I think it is important to avoid providing incomplete information, so that either the data is complete or nothing is given back. If the information is complete but becomes stale later, that can happen and it's ok.
On Sun, Oct 06, 2024 at 10:33:08PM GMT, Luca Boccassi wrote: > On Sun, 6 Oct 2024 at 19:56, Oleg Nesterov <oleg@redhat.com> wrote: > > > > On 10/06, Luca Boccassi wrote: > > > > > > I see, so what should I do here then? Check both? Or none? > > > > I don't know, because I don't know how are you going to use this API. > > > > > The caller > > > needs to verify that the data is still valid at the point they use it > > > anyway, > > > > So "none" should work fine? Just it should be documented that, say, > > kinfo.pid can be 0 if we race with the exiting task. > > > > Just in case, you can also use lock_task_sighand() || return -ESRCH, > > this way kinfo.*pid can't be zero. But I don't think this will buy too > > much, the task can exit right after pidfd_info() returns. > > > > Oleg. > > I think what we should do is check if any of those fields was set to > 0, and return ESRCH in that case. This way either we return a full set > of metadata that was correct at the time it was taken, or we provide > nothing and a clear error. We cannot solve the race as you mentioned, > but I think it is important to avoid providing incomplete information, > so that either the data is complete or nothing is given back. If the > information is complete but becomes stale later, that can happen and > it's ok. Agree.
© 2016 - 2024 Red Hat, Inc.