fs/pidfs.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++---- include/linux/cred.h | 2 + kernel/signal.c | 19 +++++---- 3 files changed, 113 insertions(+), 15 deletions(-)
Right now we only support trusted.* xattrs which require CAP_SYS_ADMIN
which doesn't really require any meaningful permission checking. But in
order to support user.* xattrs and custom pidfs.* xattrs in the future
we need permission checking for pidfs inodes. Add baseline permission
checking that can later be extended with additional write-time checks
for specific pidfs.* xattrs.
Make the effective {u,g}id of the task the owner of the pidfs inode
(like procfs does). The ownership is set when the dentry is first
stashed and reported dynamically via getattr since credentials may
change due to setuid() and similar operations. For kernel threads use
root, for exited tasks use the credentials saved at exit time.
Note that I explicitly avoid just writing to the inode during getattr()
or similar operations to update it. Some filesystems do this but this is
just terrible as this doesn't serialize against inode->i_op->seattr:: at
all. I have a patch series that uses seqcounts to make it possible to
serialize such users against inode->i_op->setattr:: but it's not really
that much of a pressing need. But fwiw [1].
Save the task's credentials and thread group pid inode number at exit
time so that ownership and permission checks remain functional after
the task has been reaped.
Add a permission callback that checks access in two steps:
(1) Verify the caller is either in the same thread group as the target
or has equivalent signal permissions. This reuses the same
uid-based logic as kill() by extracting may_signal_creds() from
kill_ok_by_cred() so it can operate on credential pointers
directly. For exited tasks the check uses the saved exit
credentials and compares thread group identity.
(2) Perform standard POSIX permission checking via generic_permission()
against the inode's ownership and mode bits.
This is intentionally less strict than ptrace_may_access() because pidfs
currently does not allow operating on data that is completely private to
the process such as its mm or file descriptors. Additional checks will
be needed once that changes.
Link: https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=work.inode.seqcount [1]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
Hey Jann,
hey Andy,
It would be nice if you could take a look at the permission model.
This is the first thing that came to my mind and I wrote it down so we
have something to look at.
Thanks!
Christian
---
fs/pidfs.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++----
include/linux/cred.h | 2 +
kernel/signal.c | 19 +++++----
3 files changed, 113 insertions(+), 15 deletions(-)
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 1e20e36e0ed5..da9d18fa5425 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -14,6 +14,7 @@
#include <linux/proc_ns.h>
#include <linux/pseudo_fs.h>
#include <linux/ptrace.h>
+#include <linux/security.h>
#include <linux/seq_file.h>
#include <uapi/linux/pidfd.h>
#include <linux/ipc_namespace.h>
@@ -44,15 +45,23 @@ enum pidfs_attr_mask_bits {
PIDFS_ATTR_BIT_COREDUMP = 1,
};
+struct pidfs_exit_attr {
+ __u64 cgroupid;
+ __s32 exit_code;
+ const struct cred *exit_cred;
+ u64 exit_tgid_ino;
+};
+
+struct pidfs_coredump_attr {
+ __u32 coredump_mask;
+ __u32 coredump_signal;
+};
+
struct pidfs_attr {
unsigned long attr_mask;
struct simple_xattrs *xattrs;
- struct /* exit info */ {
- __u64 cgroupid;
- __s32 exit_code;
- };
- __u32 coredump_mask;
- __u32 coredump_signal;
+ struct pidfs_exit_attr;
+ struct pidfs_coredump_attr;
};
static struct rb_root pidfs_ino_tree = RB_ROOT;
@@ -169,6 +178,7 @@ void pidfs_free_pid(struct pid *pid)
if (IS_ERR(attr))
return;
+ put_cred(attr->exit_cred);
xattrs = no_free_ptr(attr->xattrs);
if (xattrs)
simple_xattrs_free(xattrs, NULL);
@@ -678,6 +688,8 @@ void pidfs_exit(struct task_struct *tsk)
attr->cgroupid = cgroup_id(cgrp);
rcu_read_unlock();
#endif
+ attr->exit_cred = get_cred(__task_cred(tsk));
+ attr->exit_tgid_ino = task_tgid(tsk)->ino;
attr->exit_code = tsk->exit_code;
/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
@@ -710,6 +722,55 @@ void pidfs_coredump(const struct coredump_params *cprm)
static struct vfsmount *pidfs_mnt __ro_after_init;
+/*
+ * Fill in the effective {u,g}id of the task referred to by the pidfs
+ * inode. The task's credentials may change due to setuid(), etc.
+ */
+static void pidfs_fill_owner(struct inode *inode, kuid_t *uid, kgid_t *gid)
+{
+ struct pid *pid = inode->i_private;
+
+ VFS_WARN_ON_ONCE(!pid);
+
+ scoped_guard(rcu) {
+ struct task_struct *task;
+
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ struct pidfs_attr *attr = READ_ONCE(pid->attr);
+
+ VFS_WARN_ON_ONCE(!attr);
+ *uid = attr->exit_cred->euid;
+ *gid = attr->exit_cred->egid;
+ } else if (unlikely(task->flags & PF_KTHREAD)) {
+ *uid = GLOBAL_ROOT_UID;
+ *gid = GLOBAL_ROOT_GID;
+ } else {
+ const struct cred *cred = __task_cred(task);
+
+ *uid = cred->euid;
+ *gid = cred->egid;
+ }
+ }
+}
+
+/*
+ * Set pidfs inode ownership and security label. Called once when the
+ * dentry is first stashed.
+ */
+static void pidfs_update_inode(struct inode *inode)
+{
+ struct pid *pid = inode->i_private;
+ struct task_struct *task;
+
+ pidfs_fill_owner(inode, &inode->i_uid, &inode->i_gid);
+
+ guard(rcu)();
+ task = pid_task(pid, PIDTYPE_PID);
+ if (task)
+ security_task_to_inode(task, inode);
+}
+
/*
* The vfs falls back to simple_setattr() if i_op->setattr() isn't
* implemented. Let's reject it completely until we have a clean
@@ -725,7 +786,11 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
{
- return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
+ struct inode *inode = d_inode(path->dentry);
+
+ anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
+ pidfs_fill_owner(inode, &stat->uid, &stat->gid);
+ return 0;
}
static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size)
@@ -742,10 +807,37 @@ static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size)
return simple_xattr_list(inode, xattrs, buf, size);
}
+static int pidfs_permission(struct mnt_idmap *idmap, struct inode *inode,
+ int mask)
+{
+ struct pid *pid = inode->i_private;
+ struct pidfs_attr *attr = pid->attr;
+
+ VFS_WARN_ON_ONCE(idmap != &nop_mnt_idmap);
+
+ scoped_guard(rcu) {
+ struct task_struct *task;
+
+ task = pid_task(pid, PIDTYPE_PID);
+ if (task) {
+ if (!same_thread_group(current, task) &&
+ !may_signal_creds(current_cred(), __task_cred(task)))
+ return -EPERM;
+ } else {
+ if (task_tgid(current)->ino != attr->exit_tgid_ino &&
+ !may_signal_creds(current_cred(), attr->exit_cred))
+ return -EPERM;
+ }
+ }
+
+ return generic_permission(&nop_mnt_idmap, inode, mask);
+}
+
static const struct inode_operations pidfs_inode_operations = {
.getattr = pidfs_getattr,
.setattr = pidfs_setattr,
.listxattr = pidfs_listxattr,
+ .permission = pidfs_permission,
};
static void pidfs_evict_inode(struct inode *inode)
@@ -978,6 +1070,7 @@ static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
if (ret)
return ERR_PTR(ret);
+ pidfs_update_inode(d_inode(dentry));
return stash_dentry(stashed, dentry);
}
diff --git a/include/linux/cred.h b/include/linux/cred.h
index ed1609d78cd7..d14b29fe9fee 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -168,6 +168,8 @@ extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);
extern int set_cred_ucounts(struct cred *);
+bool may_signal_creds(const struct cred *signaler_cred,
+ const struct cred *signalee_cred);
static inline bool cap_ambient_invariant_ok(const struct cred *cred)
{
diff --git a/kernel/signal.c b/kernel/signal.c
index e42b8bd6922f..9182d1ad19b7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -777,19 +777,22 @@ static inline bool si_fromuser(const struct kernel_siginfo *info)
(!is_si_special(info) && SI_FROMUSER(info));
}
+bool may_signal_creds(const struct cred *signaler_cred,
+ const struct cred *signalee_cred)
+{
+ return uid_eq(signaler_cred->euid, signalee_cred->suid) ||
+ uid_eq(signaler_cred->euid, signalee_cred->uid) ||
+ uid_eq(signaler_cred->uid, signalee_cred->suid) ||
+ uid_eq(signaler_cred->uid, signalee_cred->uid) ||
+ ns_capable(signalee_cred->user_ns, CAP_KILL);
+}
+
/*
* called with RCU read lock from check_kill_permission()
*/
static bool kill_ok_by_cred(struct task_struct *t)
{
- const struct cred *cred = current_cred();
- const struct cred *tcred = __task_cred(t);
-
- return uid_eq(cred->euid, tcred->suid) ||
- uid_eq(cred->euid, tcred->uid) ||
- uid_eq(cred->uid, tcred->suid) ||
- uid_eq(cred->uid, tcred->uid) ||
- ns_capable(tcred->user_ns, CAP_KILL);
+ return may_signal_creds(current_cred(), __task_cred(t));
}
/*
---
base-commit: 72c395024dac5e215136cbff793455f065603b06
change-id: 20260211-work-pidfs-inode-owner-0ca20de9ef23
syzbot ci has tested the following series [v1] pidfs: add inode ownership and permission checks https://lore.kernel.org/all/20260216-work-pidfs-inode-owner-v1-1-f8faa6b73983@kernel.org * [PATCH RFC] pidfs: add inode ownership and permission checks and found the following issue: general protection fault in pidfs_fill_owner Full report is available here: https://ci.syzbot.org/series/d1c7b2f5-ccdc-4ca8-af27-dd6c97fd9e90 *** general protection fault in pidfs_fill_owner tree: bpf-next URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git base: 72c395024dac5e215136cbff793455f065603b06 arch: amd64 compiler: Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8 config: https://ci.syzbot.org/builds/a2c9422a-a065-455a-83e1-f304e9567e3e/config syz repro: https://ci.syzbot.org/findings/b2100115-a69c-42ec-8aa0-29353ad7bd45/syz_repro Oops: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] CPU: 0 UID: 0 PID: 5803 Comm: syz-execprog Not tainted syzkaller #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 RIP: 0010:pidfs_fill_owner+0x1f5/0x500 fs/pidfs.c:743 Code: ff 49 83 c4 20 4c 89 e0 48 c1 e8 03 42 80 3c 30 00 74 08 4c 89 e7 e8 3a f2 da ff 4d 8b 2c 24 49 83 c5 18 4c 89 e8 48 c1 e8 03 <42> 0f b6 04 30 84 c0 0f 85 7b 02 00 00 41 8b 6d 00 4c 89 f8 48 c1 RSP: 0018:ffffc90004997850 EFLAGS: 00010206 RAX: 0000000000000003 RBX: ffffffff8251bdfe RCX: ffff888175390000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000 RBP: 0000000000000001 R08: ffffffff8251bdfe R09: ffffffff8e558220 R10: dffffc0000000000 R11: fffffbfff1fdf4ef R12: ffff8881078adaa0 R13: 0000000000000018 R14: dffffc0000000000 R15: ffff8881169793f8 FS: 000000c00007a898(0000) GS:ffff88818e0d7000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c002961200 CR3: 000000016d248000 CR4: 00000000000006f0 Call Trace: <TASK> pidfs_update_inode fs/pidfs.c:766 [inline] pidfs_stash_dentry+0xf2/0x280 fs/pidfs.c:1073 path_from_stashed+0x463/0x5c0 fs/libfs.c:2258 pidfs_alloc_file+0xff/0x290 fs/pidfs.c:1182 pidfd_prepare+0x14e/0x1b0 kernel/fork.c:1898 copy_process+0x1f3a/0x3cf0 kernel/fork.c:2259 kernel_clone+0x248/0x870 kernel/fork.c:2654 __do_sys_clone kernel/fork.c:2795 [inline] __se_sys_clone kernel/fork.c:2779 [inline] __x64_sys_clone+0x1b6/0x230 kernel/fork.c:2779 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4a7d4d Code: 24 10 48 8b 74 24 18 48 8b 54 24 20 49 c7 c2 00 00 00 00 49 c7 c0 00 00 00 00 49 c7 c1 00 00 00 00 48 8b 44 24 08 41 5c 0f 05 <41> 54 48 3d 01 f0 ff ff 76 12 48 c7 44 24 28 ff ff ff ff 48 f7 d8 RSP: 002b:000000c00001d5b8 EFLAGS: 00000216 ORIG_RAX: 0000000000000038 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004a7d4d RDX: 000000c00001d644 RSI: 0000000000000000 RDI: 0000000000005100 RBP: 000000c00001d5f0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 000000000049fd97 R13: 0000000000000000 R14: 000000c000002380 R15: 000000c000010020 </TASK> Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:pidfs_fill_owner+0x1f5/0x500 fs/pidfs.c:743 Code: ff 49 83 c4 20 4c 89 e0 48 c1 e8 03 42 80 3c 30 00 74 08 4c 89 e7 e8 3a f2 da ff 4d 8b 2c 24 49 83 c5 18 4c 89 e8 48 c1 e8 03 <42> 0f b6 04 30 84 c0 0f 85 7b 02 00 00 41 8b 6d 00 4c 89 f8 48 c1 RSP: 0018:ffffc90004997850 EFLAGS: 00010206 RAX: 0000000000000003 RBX: ffffffff8251bdfe RCX: ffff888175390000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000 RBP: 0000000000000001 R08: ffffffff8251bdfe R09: ffffffff8e558220 R10: dffffc0000000000 R11: fffffbfff1fdf4ef R12: ffff8881078adaa0 R13: 0000000000000018 R14: dffffc0000000000 R15: ffff8881169793f8 FS: 000000c00007a898(0000) GS:ffff88818e0d7000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c002961200 CR3: 000000016d248000 CR4: 00000000000006f0 ---------------- Code disassembly (best guess), 1 bytes skipped: 0: 49 83 c4 20 add $0x20,%r12 4: 4c 89 e0 mov %r12,%rax 7: 48 c1 e8 03 shr $0x3,%rax b: 42 80 3c 30 00 cmpb $0x0,(%rax,%r14,1) 10: 74 08 je 0x1a 12: 4c 89 e7 mov %r12,%rdi 15: e8 3a f2 da ff call 0xffdaf254 1a: 4d 8b 2c 24 mov (%r12),%r13 1e: 49 83 c5 18 add $0x18,%r13 22: 4c 89 e8 mov %r13,%rax 25: 48 c1 e8 03 shr $0x3,%rax * 29: 42 0f b6 04 30 movzbl (%rax,%r14,1),%eax <-- trapping instruction 2e: 84 c0 test %al,%al 30: 0f 85 7b 02 00 00 jne 0x2b1 36: 41 8b 6d 00 mov 0x0(%r13),%ebp 3a: 4c 89 f8 mov %r15,%rax 3d: 48 rex.W 3e: c1 .byte 0xc1 *** If these findings have caused you to resend the series or submit a separate fix, please add the following tag to your commit message: Tested-by: syzbot@syzkaller.appspotmail.com --- This report is generated by a bot. It may contain errors. syzbot ci engineers can be reached at syzkaller@googlegroups.com.
© 2016 - 2026 Red Hat, Inc.