[v2] vfs: transitive upgrade restrictions for fds

[RFC PATCH v2 2/3] vfs: transitive upgrade restrictions for fds

Posted by Jori Koolstra 1 week ago

Add upgrade restrictions to openat2(). Extend struct open_how to allow
setting transitive restrictions on using file descriptors to open other
files. A use case for this feature is to block services or containers
from re-opening/upgrading an O_PATH file descriptor through e.g.
/proc/<pid>/fd/<nr> as O_WRONLY.

The idea for this features comes form the UAPI group kernel feature idea
list [1].

[1] https://github.com/uapi-group/kernel-features?tab=readme-ov-file#upgrade-masks-in-openat2

Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
 fs/file_table.c              |  2 ++
 fs/internal.h                |  1 +
 fs/namei.c                   | 41 +++++++++++++++++++++++++++++++++---
 fs/open.c                    |  9 ++++++++
 fs/proc/base.c               | 24 +++++++++++++++------
 fs/proc/fd.c                 |  6 +++++-
 fs/proc/internal.h           |  4 +++-
 include/linux/fcntl.h        |  6 +++++-
 include/linux/fs.h           |  1 +
 include/linux/namei.h        | 15 ++++++++++++-
 include/uapi/linux/openat2.h |  6 ++++++
 11 files changed, 101 insertions(+), 14 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index aaa5faaace1e..b98038009fd2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -196,6 +196,8 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 	f->f_wb_err	= 0;
 	f->f_sb_err	= 0;
 
+	f->f_allowed_upgrades = VALID_UPGRADE_FLAGS;
+
 	/*
 	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last. While
 	 * fget-rcu pattern users need to be able to handle spurious
diff --git a/fs/internal.h b/fs/internal.h
index cbc384a1aa09..0a37bb208184 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -189,6 +189,7 @@ struct open_flags {
 	int acc_mode;
 	int intent;
 	int lookup_flags;
+	unsigned int allowed_upgrades;
 };
 extern struct file *do_file_open(int dfd, struct filename *pathname,
 		const struct open_flags *op);
diff --git a/fs/namei.c b/fs/namei.c
index 58f715f7657e..c3d48709a73b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -743,6 +743,7 @@ struct nameidata {
 	int		dfd;
 	vfsuid_t	dir_vfsuid;
 	umode_t		dir_mode;
+	unsigned int	allowed_upgrades;
 } __randomize_layout;
 
 #define ND_ROOT_PRESET 1
@@ -760,6 +761,7 @@ static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 	p->path.mnt = NULL;
 	p->path.dentry = NULL;
 	p->total_link_count = old ? old->total_link_count : 0;
+	p->allowed_upgrades = VALID_UPGRADE_FLAGS;
 	p->saved = old;
 	current->nameidata = p;
 }
@@ -1156,11 +1158,15 @@ static int nd_jump_root(struct nameidata *nd)
 	return 0;
 }
 
+const struct jump_how jump_how_unrestricted = {
+	.allowed_upgrades = VALID_UPGRADE_FLAGS
+};
+
 /*
  * Helper to directly jump to a known parsed path from ->get_link,
  * caller must have taken a reference to path beforehand.
  */
-int nd_jump_link(const struct path *path)
+int nd_jump_link_how(const struct path *path, const struct jump_how *how)
 {
 	int error = -ELOOP;
 	struct nameidata *nd = current->nameidata;
@@ -1181,6 +1187,7 @@ int nd_jump_link(const struct path *path)
 	nd->path = *path;
 	nd->inode = nd->path.dentry->d_inode;
 	nd->state |= ND_JUMPED;
+	nd->allowed_upgrades &= how->allowed_upgrades;
 	return 0;
 
 err:
@@ -2738,6 +2745,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 		if (fd_empty(f))
 			return ERR_PTR(-EBADF);
 
+		nd->allowed_upgrades = fd_file(f)->f_allowed_upgrades;
+
 		if (flags & LOOKUP_LINKAT_EMPTY) {
 			if (fd_file(f)->f_cred != current_cred() &&
 			    !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH))
@@ -4266,6 +4275,28 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path,
 	return 0;
 }
 
+static bool may_upgrade(const int flag, const unsigned int allowed_upgrades)
+{
+	int mode = flag & O_ACCMODE;
+	unsigned int allowed = allowed_upgrades & ~DENY_UPGRADES;
+
+	if (mode != O_WRONLY && !(allowed & READ_UPGRADABLE))
+		return false;
+	if (mode != O_RDONLY && !(allowed & WRITE_UPGRADABLE))
+		return false;
+	return true;
+}
+
+static int may_open_upgrade(struct mnt_idmap *idmap, const struct path *path,
+			    int acc_mode, int flag,
+			    const unsigned int allowed_upgrades)
+{
+	if (!may_upgrade(flag, allowed_upgrades))
+		return -EACCES;
+
+	return may_open(idmap, path, acc_mode, flag);
+}
+
 static int handle_truncate(struct mnt_idmap *idmap, struct file *filp)
 {
 	const struct path *path = &filp->f_path;
@@ -4666,7 +4697,8 @@ static int do_open(struct nameidata *nd,
 			return error;
 		do_truncate = true;
 	}
-	error = may_open(idmap, &nd->path, acc_mode, open_flag);
+	error = may_open_upgrade(idmap, &nd->path, acc_mode, open_flag,
+				 nd->allowed_upgrades);
 	if (!error && !(file->f_mode & FMODE_OPENED))
 		error = vfs_open(&nd->path, file);
 	if (!error)
@@ -4831,8 +4863,11 @@ static struct file *path_openat(struct nameidata *nd,
 		terminate_walk(nd);
 	}
 	if (likely(!error)) {
-		if (likely(file->f_mode & FMODE_OPENED))
+		if (likely(file->f_mode & FMODE_OPENED)) {
+			file->f_allowed_upgrades =
+				op->allowed_upgrades & nd->allowed_upgrades;
 			return file;
+		}
 		WARN_ON(1);
 		error = -EINVAL;
 	}
diff --git a/fs/open.c b/fs/open.c
index e019ddecc73c..8b6ea5f90c6e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1167,6 +1167,7 @@ inline struct open_how build_open_how(int flags, umode_t mode)
 	struct open_how how = {
 		.flags = ((unsigned int) flags) & VALID_OPEN_FLAGS,
 		.mode = mode & S_IALLUGO,
+		.allowed_upgrades = VALID_UPGRADE_FLAGS
 	};
 
 	/* O_PATH beats everything else. */
@@ -1299,6 +1300,14 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 	}
 
 	op->lookup_flags = lookup_flags;
+
+	if (how->allowed_upgrades == 0)
+		op->allowed_upgrades = VALID_UPGRADE_FLAGS;
+	else if (how->allowed_upgrades & ~VALID_UPGRADE_FLAGS)
+		return -EINVAL;
+	else
+		op->allowed_upgrades = how->allowed_upgrades;
+
 	return 0;
 }
 
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4c863d17dfb4..3f3a471bbb75 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -218,7 +218,8 @@ static int get_task_root(struct task_struct *task, struct path *root)
 	return result;
 }
 
-static int proc_cwd_link(struct dentry *dentry, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path,
+			 struct jump_how *jump_how)
 {
 	struct task_struct *task = get_proc_task(d_inode(dentry));
 	int result = -ENOENT;
@@ -227,6 +228,7 @@ static int proc_cwd_link(struct dentry *dentry, struct path *path)
 		task_lock(task);
 		if (task->fs) {
 			get_fs_pwd(task->fs, path);
+			*jump_how = jump_how_unrestricted;
 			result = 0;
 		}
 		task_unlock(task);
@@ -235,7 +237,8 @@ static int proc_cwd_link(struct dentry *dentry, struct path *path)
 	return result;
 }
 
-static int proc_root_link(struct dentry *dentry, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path,
+			  struct jump_how *jump_how)
 {
 	struct task_struct *task = get_proc_task(d_inode(dentry));
 	int result = -ENOENT;
@@ -243,6 +246,7 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
 	if (task) {
 		result = get_task_root(task, path);
 		put_task_struct(task);
+		*jump_how = jump_how_unrestricted;
 	}
 	return result;
 }
@@ -1777,7 +1781,8 @@ static const struct file_operations proc_pid_set_comm_operations = {
 	.release	= single_release,
 };
 
-static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path,
+			 struct jump_how *jump_how)
 {
 	struct task_struct *task;
 	struct file *exe_file;
@@ -1789,6 +1794,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 	put_task_struct(task);
 	if (exe_file) {
 		*exe_path = exe_file->f_path;
+		*jump_how = jump_how_unrestricted;
 		path_get(&exe_file->f_path);
 		fput(exe_file);
 		return 0;
@@ -1801,6 +1807,7 @@ static const char *proc_pid_get_link(struct dentry *dentry,
 				     struct delayed_call *done)
 {
 	struct path path;
+	struct jump_how jump_how;
 	int error = -EACCES;
 
 	if (!dentry)
@@ -1810,11 +1817,11 @@ static const char *proc_pid_get_link(struct dentry *dentry,
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &path, &jump_how);
 	if (error)
 		goto out;
 
-	error = nd_jump_link(&path);
+	error = nd_jump_link_how(&path, &jump_how);
 out:
 	return ERR_PTR(error);
 }
@@ -1848,12 +1855,13 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
 	int error = -EACCES;
 	struct inode *inode = d_inode(dentry);
 	struct path path;
+	struct jump_how jump_how;
 
 	/* Are we allowed to snoop on the tasks file descriptors? */
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &path, &jump_how);
 	if (error)
 		goto out;
 
@@ -2250,7 +2258,8 @@ static const struct dentry_operations tid_map_files_dentry_operations = {
 	.d_delete	= pid_delete_dentry,
 };
 
-static int map_files_get_link(struct dentry *dentry, struct path *path)
+static int map_files_get_link(struct dentry *dentry, struct path *path,
+			      struct jump_how *jump_how)
 {
 	unsigned long vm_start, vm_end;
 	struct vm_area_struct *vma;
@@ -2279,6 +2288,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
 	rc = -ENOENT;
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
+		*jump_how = jump_how_unrestricted;
 		*path = *file_user_path(vma->vm_file);
 		path_get(path);
 		rc = 0;
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 9eeccff49b2a..344485e8cb6f 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -171,7 +171,8 @@ static const struct dentry_operations tid_fd_dentry_operations = {
 	.d_delete	= pid_delete_dentry,
 };
 
-static int proc_fd_link(struct dentry *dentry, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path,
+			struct jump_how *jump_how)
 {
 	struct task_struct *task;
 	int ret = -ENOENT;
@@ -183,6 +184,9 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
 
 		fd_file = fget_task(task, fd);
 		if (fd_file) {
+			*jump_how = (struct jump_how) {
+				.allowed_upgrades = fd_file->f_allowed_upgrades
+			};
 			*path = fd_file->f_path;
 			path_get(&fd_file->f_path);
 			ret = 0;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c1e8eb984da8..42f668059a30 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -14,6 +14,7 @@
 #include <linux/sched/coredump.h>
 #include <linux/sched/task.h>
 #include <linux/mm.h>
+#include <linux/namei.h>
 
 struct ctl_table_header;
 struct mempolicy;
@@ -107,7 +108,8 @@ extern struct kmem_cache *proc_dir_entry_cache;
 void pde_free(struct proc_dir_entry *pde);
 
 union proc_op {
-	int (*proc_get_link)(struct dentry *, struct path *);
+	int (*proc_get_link)(struct dentry *, struct path *,
+		struct jump_how *);
 	int (*proc_show)(struct seq_file *m,
 		struct pid_namespace *ns, struct pid *pid,
 		struct task_struct *task);
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index d1bb87ff70e3..6506c2c6eca5 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -15,6 +15,9 @@
 	 /* upper 32-bit flags (openat2(2) only) */ \
 	 OPENAT2_EMPTY_PATH)
 
+#define VALID_UPGRADE_FLAGS \
+	(DENY_UPGRADES | READ_UPGRADABLE | WRITE_UPGRADABLE)
+
 /* List of all valid flags for the how->resolve argument: */
 #define VALID_RESOLVE_FLAGS \
 	(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
@@ -22,7 +25,8 @@
 
 /* List of all open_how "versions". */
 #define OPEN_HOW_SIZE_VER0	24 /* sizeof first published struct */
-#define OPEN_HOW_SIZE_LATEST	OPEN_HOW_SIZE_VER0
+#define OPEN_HOW_SIZE_VER1	32 /* added allowed_upgrades */
+#define OPEN_HOW_SIZE_LATEST	OPEN_HOW_SIZE_VER1
 
 #ifndef force_o_largefile
 #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8b3dd145b25e..697d2fc6322b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1296,6 +1296,7 @@ struct file {
 	};
 	file_ref_t			f_ref;
 	/* --- cacheline 3 boundary (192 bytes) --- */
+	unsigned int			f_allowed_upgrades;
 } __randomize_layout
   __attribute__((aligned(4)));	/* lest something weird decides that 2 is OK */
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 58600cf234bc..0c58ded7cd27 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -203,7 +203,20 @@ static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umo
 	return mode;
 }
 
-extern int __must_check nd_jump_link(const struct path *path);
+struct jump_how {
+	unsigned int allowed_upgrades;
+};
+
+extern const struct jump_how jump_how_unrestricted;
+#define JUMP_HOW_UNRESTRICTED &jump_how_unrestricted
+
+extern int __must_check nd_jump_link_how(const struct path *path,
+					 const struct jump_how *how);
+
+static inline int nd_jump_link(const struct path *path)
+{
+	return nd_jump_link_how(path, JUMP_HOW_UNRESTRICTED);
+}
 
 static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
 {
diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
index c34f32e6fa96..fc1147e6ce41 100644
--- a/include/uapi/linux/openat2.h
+++ b/include/uapi/linux/openat2.h
@@ -20,8 +20,14 @@ struct open_how {
 	__u64 flags;
 	__u64 mode;
 	__u64 resolve;
+	__u64 allowed_upgrades;
 };
 
+/* how->allowed_upgrades flags for openat2(2). */
+#define DENY_UPGRADES		0x01
+#define READ_UPGRADABLE		(0x02 | DENY_UPGRADES)
+#define WRITE_UPGRADABLE	(0x04 | DENY_UPGRADES)
+
 /* how->resolve flags for openat2(2). */
 #define RESOLVE_NO_XDEV		0x01 /* Block mount-point crossings
 					(includes bind-mounts). */
-- 
2.53.0

Re: [RFC PATCH v2 2/3] vfs: transitive upgrade restrictions for fds

Posted by Aleksa Sarai 6 days, 12 hours ago

On 2026-03-26, Jori Koolstra <jkoolstra@xs4all.nl> wrote:
> Add upgrade restrictions to openat2(). Extend struct open_how to allow
> setting transitive restrictions on using file descriptors to open other
> files. A use case for this feature is to block services or containers
> from re-opening/upgrading an O_PATH file descriptor through e.g.
> /proc/<pid>/fd/<nr> as O_WRONLY.
> 
> The idea for this features comes form the UAPI group kernel feature idea
> list [1].
> 
> [1] https://github.com/uapi-group/kernel-features?tab=readme-ov-file#upgrade-masks-in-openat2

I had a version of this in the original openat2(2) pull request many
years ago[1].

Unfortunately there is a pretty big issue with doing it this way (which
I mentioned in one of the changelogs back then[2]): There are lots of
VFS operations that imply operations on a file (through a magic-link)
that are not blocked. truncate(2) and mount(MS_BIND)/open_tree(2) are
the most problematic examples, but this applies to basically any syscall
that takes a path argument. If you don't block those then re-opening
restrictions are functionally useless.

It also would be really nice if you could block more than just trailing
component operations -- having a directory file descriptor that blocks
lookups could be quite handy for a bunch of reasons.

I think the only workable solution to block all of these issue entirely
and in a comprehensive way is to have something akin to capsicum
capabilities[3] tied to file descriptors and have all of the VFS
operations check them (though I think that the way this was attempted in
the past[4] was far from ideal).

I have tried my hand at a few lighter-weight prototypes over the years
(mainly trying to add the necessary checks to every generic_permission()
call, and adding some more generic_permission() calls as well...). My
last prototype was adding the restriction information to "struct path"
but that would bloat too many structures to be merge-able. I was
planning on looking at this again later this year, but if you can come
up with a nice way of getting a minimal version of capsicum working,
that'd be amazing. :D

That being said, while my view at the time of openat2(2) was that we
need to do this at the same time as O_EMPTYPATH (and my tests showed
that this was a backwards-compatible change -- on modern desktops at
least), at this point I think it'd be better to just merge O_EMPTYPATH
by itself and we can work on this hardening separately and make it an
opt-in sysctl (with individual file descriptors being opt-in-able as
well).

[1]: https://lore.kernel.org/lkml/20190930183316.10190-2-cyphar@cyphar.com/
[2]: https://lore.kernel.org/lkml/20191026185700.10708-1-cyphar@cyphar.com/
[3]: https://lwn.net/Articles/482858/
[4]: https://lore.kernel.org/lkml/1404124096-21445-1-git-send-email-drysdale@google.com/

> +const struct jump_how jump_how_unrestricted = {
> +	.allowed_upgrades = VALID_UPGRADE_FLAGS
> +};
> +
>  /*
>   * Helper to directly jump to a known parsed path from ->get_link,
>   * caller must have taken a reference to path beforehand.
>   */
> -int nd_jump_link(const struct path *path)
> +int nd_jump_link_how(const struct path *path, const struct jump_how *how)
>  {
>  	int error = -ELOOP;
>  	struct nameidata *nd = current->nameidata;
> @@ -1181,6 +1187,7 @@ int nd_jump_link(const struct path *path)
>  	nd->path = *path;
>  	nd->inode = nd->path.dentry->d_inode;
>  	nd->state |= ND_JUMPED;
> +	nd->allowed_upgrades &= how->allowed_upgrades;
>  	return 0;

Way back then, Andy Lutomirski suggested that this be done via the
magic-link modes. While it is kind of ugly and in my patchset this
required adjusting some magic-link modes, it does provide a useful
indication to userspace of two things:

 - What upgrade modes are available for a file (this is useful for
   debugging but is also really necessary for the checkpoint-restore
   folks' needs). I did this with fmode (and exposed fmode in fdinfo)
   but I would not recommend that approach at all.

 - It indicates whether the kernel supports this feature, which will
   allow certain programs to loosen their hardening logic since the
   kernel implements the hardening for them.

   For instance, most container runtimes now either make a copy of
   /proc/self/exe as a sealed memfd or create a read-only overlayfs
   mount to re-exec /proc/self/exe so that containers cannot overwrite
   the host binary by doing a /proc/$pid/exe re-open. See CVE-2019-5736
   for more details.

   Indicating that the kernel blocks this attack would let container
   runtimes disable this hardening on such kernels.

Maybe we should at least change the modes even if they aren't used?

> -static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
> +static int proc_exe_link(struct dentry *dentry, struct path *exe_path,
> +			 struct jump_how *jump_how)
>  {
>  	struct task_struct *task;
>  	struct file *exe_file;
> @@ -1789,6 +1794,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
>  	put_task_struct(task);
>  	if (exe_file) {
>  		*exe_path = exe_file->f_path;
> +		*jump_how = jump_how_unrestricted;
>  		path_get(&exe_file->f_path);
>  		fput(exe_file);
>  		return 0;

This should restrict writes, for the reasons outlined above.

> -static int map_files_get_link(struct dentry *dentry, struct path *path)
> +static int map_files_get_link(struct dentry *dentry, struct path *path,
> +			      struct jump_how *jump_how)
>  {
>  	unsigned long vm_start, vm_end;
>  	struct vm_area_struct *vma;
> @@ -2279,6 +2288,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
>  	rc = -ENOENT;
>  	vma = find_exact_vma(mm, vm_start, vm_end);
>  	if (vma && vma->vm_file) {
> +		*jump_how = jump_how_unrestricted;
>  		*path = *file_user_path(vma->vm_file);
>  		path_get(path);
>  		rc = 0;

This should also restrict writes (this is a similar issue to
/proc/self/exe and is harder to harden against, the primary defense is
just ASLR...).

> diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
> index c34f32e6fa96..fc1147e6ce41 100644
> --- a/include/uapi/linux/openat2.h
> +++ b/include/uapi/linux/openat2.h
> @@ -20,8 +20,14 @@ struct open_how {
>  	__u64 flags;
>  	__u64 mode;
>  	__u64 resolve;
> +	__u64 allowed_upgrades;
>  };
>  
> +/* how->allowed_upgrades flags for openat2(2). */
> +#define DENY_UPGRADES		0x01
> +#define READ_UPGRADABLE		(0x02 | DENY_UPGRADES)
> +#define WRITE_UPGRADABLE	(0x04 | DENY_UPGRADES)

I'm not a huge fan of how this bitmask is set up, to be honest. I get
that you did it this way to make it disable-by-default but given that we
probably will want to add restrictions in the future that would break
backward compatibility (imagine an execute restriction that blocks
execve(2) -- adding the feature would break all existing programs if you
follow this scheme).

It probably makes more sense to do something more like statx(2) -- you
pick the restrictions you want and get back information about which
restrictions were supported. This is similar to what I did in the old
patchset too, though it didn't give you information like statx(2) does.

-- 
Aleksa Sarai
https://www.cyphar.com/

Re: [RFC PATCH v2 2/3] vfs: transitive upgrade restrictions for fds

Posted by Jori Koolstra 3 days, 20 hours ago

Hi Aleksa,

Thanks a lot for the detailed reply, it gives a lot of the background I was
missing. I really appreciate it, I have learned a lot as I wasn't aware of the
earlier work on this. Sorry for not getting back to you sooner, but I needed
a few days to absorb all the linked information.

> Op 27-03-2026 07:20 CET schreef Aleksa Sarai <cyphar@cyphar.com>:
> 
>  
> On 2026-03-26, Jori Koolstra <jkoolstra@xs4all.nl> wrote:
> > Add upgrade restrictions to openat2(). Extend struct open_how to allow
> > setting transitive restrictions on using file descriptors to open other
> > files. A use case for this feature is to block services or containers
> > from re-opening/upgrading an O_PATH file descriptor through e.g.
> > /proc/<pid>/fd/<nr> as O_WRONLY.
> > 
> > The idea for this features comes form the UAPI group kernel feature idea
> > list [1].
> > 
> > [1] https://github.com/uapi-group/kernel-features?tab=readme-ov-file#upgrade-masks-in-openat2
> 
> I had a version of this in the original openat2(2) pull request many
> years ago[1].
> 
> Unfortunately there is a pretty big issue with doing it this way (which
> I mentioned in one of the changelogs back then[2]): There are lots of
> VFS operations that imply operations on a file (through a magic-link)
> that are not blocked. truncate(2) and mount(MS_BIND)/open_tree(2) are
> the most problematic examples, but this applies to basically any syscall
> that takes a path argument. If you don't block those then re-opening
> restrictions are functionally useless.

Ah yes. If I am correct, it would block all the fXXX syscalls from doing
harm (at least w.r.t. read/write operations) because they use the fmode for
rights checking on the fd, and this cannot be changed without going through
an open() variant.

Hence the issue is the case when we pass a magic path to e.g. truncate()
as it does no upgrade restriction check right now on the struct file. So
we hade to do this for every relevant syscall. And the question is... where.

> 
> It also would be really nice if you could block more than just trailing
> component operations -- having a directory file descriptor that blocks
> lookups could be quite handy for a bunch of reasons.
> 

Yes, so (at the very least) we also want RESTRICT_LOOKUP for directory fds.

> I think the only workable solution to block all of these issue entirely
> and in a comprehensive way is to have something akin to capsicum
> capabilities[3] tied to file descriptors and have all of the VFS
> operations check them (though I think that the way this was attempted in
> the past[4] was far from ideal).
> 

I went through Drysdale's implementation a bit. He links the capability check
to the translation of an fd to a struct file. I agree this is a bit invasive
(as he writes himself), and perhaps we can do better. Is this what you mean
by "far from ideal"?

> I have tried my hand at a few lighter-weight prototypes over the years
> (mainly trying to add the necessary checks to every generic_permission()
> call, and adding some more generic_permission() calls as well...). My
> last prototype was adding the restriction information to "struct path"
> but that would bloat too many structures to be merge-able. I was
> planning on looking at this again later this year, but if you can come
> up with a nice way of getting a minimal version of capsicum working,
> that'd be amazing. :D

I would really like to try; it is a very nice problem for me to tackle;
you need to gain experience somehow :)

I wonder how checking all this in generic_permission() would work. The
access to the fd that the procfs magic link provides is essentially an
issue of path traversal, and in generic_permission() you just have the
inode in question. Ah but of course, you can use the mode bits of the
magic link to encode the information, as you suggest. What downside did
you encounter using this idea?

One thing I can think of is that if we want more than rwx upgrade control
(more capsicum style control), this is not going to be sufficient. If you
want to restrict fchown on an fd, there is no way to encode this in the
magic link mode. Maybe we should determine first the minimum capability
support that we want to make the feature useful (and extendable)?

> 
> That being said, while my view at the time of openat2(2) was that we
> need to do this at the same time as O_EMPTYPATH (and my tests showed
> that this was a backwards-compatible change -- on modern desktops at
> least), at this point I think it'd be better to just merge O_EMPTYPATH
> by itself and we can work on this hardening separately and make it an
> opt-in sysctl (with individual file descriptors being opt-in-able as
> well).

Not a version of the cap syscalls?

> 
> > diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
> > index c34f32e6fa96..fc1147e6ce41 100644
> > --- a/include/uapi/linux/openat2.h
> > +++ b/include/uapi/linux/openat2.h
> > @@ -20,8 +20,14 @@ struct open_how {
> >  	__u64 flags;
> >  	__u64 mode;
> >  	__u64 resolve;
> > +	__u64 allowed_upgrades;
> >  };
> >  
> > +/* how->allowed_upgrades flags for openat2(2). */
> > +#define DENY_UPGRADES		0x01
> > +#define READ_UPGRADABLE		(0x02 | DENY_UPGRADES)
> > +#define WRITE_UPGRADABLE	(0x04 | DENY_UPGRADES)
> 
> I'm not a huge fan of how this bitmask is set up, to be honest. I get
> that you did it this way to make it disable-by-default but given that we
> probably will want to add restrictions in the future that would break
> backward compatibility (imagine an execute restriction that blocks
> execve(2) -- adding the feature would break all existing programs if you
> follow this scheme).

Ah, I wanted to have the upgradable options as white list, because with a
blacklist approach you have the issue that if there is ever a restriction
added that has overlap with an existing restriction, it would disable part
of a restriction you thought you had set. But maybe we just need to prevent
such a scenario, and I agree the whitelist option is even worse.

> 
> -- 
> Aleksa Sarai
> https://www.cyphar.com/

Btw, I just saw you gave a cool talk at FOSDEM this year, and I missed it,
even though I was there! Thanks for linking the cve-2019-5736, was really
interesting to read.

Thanks,
Jori.

Re: [RFC PATCH v2 2/3] vfs: transitive upgrade restrictions for fds

Posted by Aleksa Sarai 1 day, 6 hours ago

On 2026-03-29, Jori Koolstra <jkoolstra@xs4all.nl> wrote:
> > Unfortunately there is a pretty big issue with doing it this way (which
> > I mentioned in one of the changelogs back then[2]): There are lots of
> > VFS operations that imply operations on a file (through a magic-link)
> > that are not blocked. truncate(2) and mount(MS_BIND)/open_tree(2) are
> > the most problematic examples, but this applies to basically any syscall
> > that takes a path argument. If you don't block those then re-opening
> > restrictions are functionally useless.
> 
> Ah yes. If I am correct, it would block all the fXXX syscalls from doing
> harm (at least w.r.t. read/write operations) because they use the fmode for
> rights checking on the fd, and this cannot be changed without going through
> an open() variant.
> 
> Hence the issue is the case when we pass a magic path to e.g. truncate()
> as it does no upgrade restriction check right now on the struct file. So
> we hade to do this for every relevant syscall. And the question is... where.

They would also be bypassed by AT_EMPTY_PATH, so you don't even need
magic-links to cause the issue. This is why I considered doing it with
"struct path" and doing the blocking in *_permission() -- all of the
standard vfs operations already have .

> > It also would be really nice if you could block more than just trailing
> > component operations -- having a directory file descriptor that blocks
> > lookups could be quite handy for a bunch of reasons.
> 
> Yes, so (at the very least) we also want RESTRICT_LOOKUP for directory fds.

Well, definitely not initially. You also don't really need to call it
RESTRICT_LOOKUP, blocking exec would be the "unixy" thing to do (and it
would be nice to have that to block fexecveat(2) too, but that is a
_VERY_ deep rabbithole).

> > I think the only workable solution to block all of these issue entirely
> > and in a comprehensive way is to have something akin to capsicum
> > capabilities[3] tied to file descriptors and have all of the VFS
> > operations check them (though I think that the way this was attempted in
> > the past[4] was far from ideal).
> > 
> 
> I went through Drysdale's implementation a bit. He links the capability check
> to the translation of an fd to a struct file. I agree this is a bit invasive
> (as he writes himself), and perhaps we can do better. Is this what you mean
> by "far from ideal"?

There were quite a few other issues with it, but that is one of them.

> > I have tried my hand at a few lighter-weight prototypes over the years
> > (mainly trying to add the necessary checks to every generic_permission()
> > call, and adding some more generic_permission() calls as well...). My
> > last prototype was adding the restriction information to "struct path"
> > but that would bloat too many structures to be merge-able. I was
> > planning on looking at this again later this year, but if you can come
> > up with a nice way of getting a minimal version of capsicum working,
> > that'd be amazing. :D
> 
> I would really like to try; it is a very nice problem for me to tackle;
> you need to gain experience somehow :)

Sorry, I probably should've couched my reply with a bit more caution --
I really appreciate the enthusiasm, but this is quite a hairy topic for
quite a number of reasons. It will involve touching everything
fd-related in the kernel, it touches on an already-rejected patchset,
and is a topic that can get very heated easily. I would not rate the
chance of getting something merged very highly, and I think there are
more fruitful things for you to tackle and get your hands dirty.

Honestly, I think most people will be more than happy with just getting
the O_EMPTYPATH part merged -- looking back, I really should've just
done that back in 2019. :/

> I wonder how checking all this in generic_permission() would work. The
> access to the fd that the procfs magic link provides is essentially an
> issue of path traversal, and in generic_permission() you just have the
> inode in question. Ah but of course, you can use the mode bits of the
> magic link to encode the information, as you suggest. What downside did
> you encounter using this idea?

It wasn't based on the magic-link mode, I added a capability set to
"struct path" and then went and adjusted all generic_permission()s to
take the new capability mask. For stuff that did file_permission() or
path_permission(), no code change was needed.

Unfortunately this ran into quite a few hairy issues that made it
basically unmergeable (bloating "struct path" is very bad as it is
embedded everywhere, the whole model is kind of questionable because it
makes "struct path" have state about the lookup that produced it, some
VFS APIs deal with inodes or dentries directly and thus would need more
tree-wide fixes, and most importantly it just felt really ugly).

The open_tree(2) / mount(MS_BIND) issue is particularly problematic --
ideally you would want to block a RW bind-mount for a read-only file,
but there isn't really an obvious mechanism for passing down those kinds
of restrictions. Actually, fsconfig() arguments like "upperdir" to
overlayfs (or even "source") are likely even worse -- now you need
per-filesystem-option handling.

I don't think these issues are insurmountable, it's just that the
problem is much harder than it looks at first glance and I would humbly
suggest that working on reviving a very dead patchset is not the best
use of your time. Container runtimes really *really* want this, which is
the main reason I keep coming back to it (and we ended up with it in the
UAPI group proposal page) but I think it's a very niche thing that has a
big risk of being a lot of wasted effort.

-- 
Aleksa Sarai
https://www.cyphar.com/

[RFC PATCH v2 1/3] vfs: add support for empty path to openat2(2)
[RFC PATCH v2 2/3] vfs: transitive upgrade restrictions for fds
[RFC PATCH v2 3/3] selftest: add tests for OPENAT2_EMPTY_PATH and allowed_upgrades