[v8] proc: subset=pid: Relax check of mount visibility

[PATCH v8 4/5] proc: Relax check of mount visibility

Posted by Alexey Gladkov 1 month, 2 weeks ago

When /proc is mounted with the subset=pid option, all system files from
the root of the file system are not accessible in userspace. Only
dynamic information about processes is available, which cannot be
hidden with overmount.

For this reason, checking for full visibility is not relevant if
mounting is performed with the subset=pid option.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
---
 fs/namespace.c                 | 29 ++++++++++++++++-------------
 fs/proc/root.c                 | 17 ++++++++++-------
 include/linux/fs/super_types.h |  2 ++
 3 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index c58674a20cad..7daa86315c05 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -6116,7 +6116,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
 		/* This mount is not fully visible if it's root directory
 		 * is not the root directory of the filesystem.
 		 */
-		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
+		if (!(sb->s_iflags & SB_I_USERNS_ALLOW_REVEALING) &&
+		    mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
 			continue;
 
 		/* A local view of the mount flags */
@@ -6136,18 +6137,20 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
 		    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
 			continue;
 
-		/* This mount is not fully visible if there are any
-		 * locked child mounts that cover anything except for
-		 * empty directories.
-		 */
-		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
-			struct inode *inode = child->mnt_mountpoint->d_inode;
-			/* Only worry about locked mounts */
-			if (!(child->mnt.mnt_flags & MNT_LOCKED))
-				continue;
-			/* Is the directory permanently empty? */
-			if (!is_empty_dir_inode(inode))
-				goto next;
+		if (!(sb->s_iflags & SB_I_USERNS_ALLOW_REVEALING)) {
+			/* This mount is not fully visible if there are any
+			 * locked child mounts that cover anything except for
+			 * empty directories.
+			 */
+			list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+				struct inode *inode = child->mnt_mountpoint->d_inode;
+				/* Only worry about locked mounts */
+				if (!IS_MNT_LOCKED(child))
+					continue;
+				/* Is the directory permanently empty? */
+				if (!is_empty_dir_inode(inode))
+					goto next;
+			}
 		}
 		/* Preserve the locked attributes */
 		*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 535a168046e3..e029d3587494 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -223,18 +223,21 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	return 0;
 }
 
-static int proc_apply_options(struct proc_fs_info *fs_info,
+static int proc_apply_options(struct super_block *s,
 			       struct fs_context *fc,
 			       struct user_namespace *user_ns)
 {
 	struct proc_fs_context *ctx = fc->fs_private;
+	struct proc_fs_info *fs_info = proc_sb_info(s);
 
 	if (ctx->mask & (1 << Opt_gid))
 		fs_info->pid_gid = make_kgid(user_ns, ctx->gid);
 	if (ctx->mask & (1 << Opt_hidepid))
 		fs_info->hide_pid = ctx->hidepid;
 	if (ctx->mask & (1 << Opt_subset)) {
-		if (ctx->pidonly != PROC_PIDONLY_ON && fs_info->pidonly == PROC_PIDONLY_ON)
+		if (ctx->pidonly == PROC_PIDONLY_ON)
+			s->s_iflags |= SB_I_USERNS_ALLOW_REVEALING;
+		else if (fs_info->pidonly == PROC_PIDONLY_ON)
 			return invalf(fc, "proc: subset=pid cannot be unset\n");
 		fs_info->pidonly = ctx->pidonly;
 	}
@@ -259,9 +262,6 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
 
 	fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
 	fs_info->mounter_cred = get_cred(fc->cred);
-	ret = proc_apply_options(fs_info, fc, current_user_ns());
-	if (ret)
-		return ret;
 
 	/* User space would break if executables or devices appear on proc */
 	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
@@ -273,6 +273,10 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_time_gran = 1;
 	s->s_fs_info = fs_info;
 
+	ret = proc_apply_options(s, fc, current_user_ns());
+	if (ret)
+		return ret;
+
 	/*
 	 * procfs isn't actually a stacking filesystem; however, there is
 	 * too much magic going on inside it to permit stacking things on
@@ -306,11 +310,10 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
 static int proc_reconfigure(struct fs_context *fc)
 {
 	struct super_block *sb = fc->root->d_sb;
-	struct proc_fs_info *fs_info = proc_sb_info(sb);
 
 	sync_filesystem(sb);
 
-	return proc_apply_options(fs_info, fc, current_user_ns());
+	return proc_apply_options(sb, fc, current_user_ns());
 }
 
 static int proc_get_tree(struct fs_context *fc)
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 6bd3009e09b3..5e640b9140df 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -333,4 +333,6 @@ struct super_block {
 #define SB_I_NOIDMAP	0x00002000	/* No idmapped mounts on this superblock */
 #define SB_I_ALLOW_HSM	0x00004000	/* Allow HSM events on this superblock */
 
+#define SB_I_USERNS_ALLOW_REVEALING	0x00008000 /* Skip full visibility check */
+
 #endif /* _LINUX_FS_SUPER_TYPES_H */
-- 
2.53.0

Re: [PATCH v8 4/5] proc: Relax check of mount visibility

Posted by Christian Brauner 1 month, 2 weeks ago

On Fri, Feb 13, 2026 at 11:44:29AM +0100, Alexey Gladkov wrote:
> When /proc is mounted with the subset=pid option, all system files from
> the root of the file system are not accessible in userspace. Only
> dynamic information about processes is available, which cannot be
> hidden with overmount.
> 
> For this reason, checking for full visibility is not relevant if
> mounting is performed with the subset=pid option.
> 
> Signed-off-by: Alexey Gladkov <legion@kernel.org>
> ---
>  fs/namespace.c                 | 29 ++++++++++++++++-------------
>  fs/proc/root.c                 | 17 ++++++++++-------
>  include/linux/fs/super_types.h |  2 ++
>  3 files changed, 28 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index c58674a20cad..7daa86315c05 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -6116,7 +6116,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
>  		/* This mount is not fully visible if it's root directory
>  		 * is not the root directory of the filesystem.
>  		 */
> -		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
> +		if (!(sb->s_iflags & SB_I_USERNS_ALLOW_REVEALING) &&
> +		    mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
>  			continue;
>  
>  		/* A local view of the mount flags */
> @@ -6136,18 +6137,20 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
>  		    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
>  			continue;

There are a few things that I find problematic here.

Even before your change the mount flags of the first fully visible
procfs mount would be picked up. If the caller was unlucky they could
stumble upon the most restricted procfs mount in the mount namespace
rbtree. Leading to weird scenarios where a user cannot write to the
procfs instance they just mounted but could to another one that is also
in their namespace.

The other thing is that with this change specifically:

    if (!(sb->s_iflags & SB_I_USERNS_ALLOW_REVEALING) &&
        mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)

we start caring about mount options of even partially exposed procfs
mounts. IOW, if someone had a bind-mount of e.g., /proc/pressure
somewhere that got inherited via CLONE_NEWNS then we suddenly take the
mount options of that into account for a new /proc/<pid>/* only instance.
I think we should continue caring only about procfs mounts that are
visible from their root.

The the other problem is that it is really annoying that we walk all
mounts in a mount namespace just to find procfs and sysfs mounts in
there. Currently a lot of workloads still do the CLONE_NEWNS dance
meaning they inherit all the crap from the host and then proceed to
setup their new rootfs. Busy container workloads that can be a lot.

So let's just be honest about it and treat procfs and sysfs as the
snowflakes that they have become and record their instances in a
separate per mount namespace hlist as in the (untested) patch below [1].

Also SB_I_USERNS_ALLOW_REVEALING seems unnecessary. The only time we
care about that flag is when we setup a new superblock. So this could
easily be a struct fs_context bitfield that just exists for the duration
of the creation of the new superblock and mount. So maybe pass that down
to mount_too_revealing() and further down into the actual helper.

[1]:
From 4bbd41e7a3ef91667dd334f31b1b6bf8caec0599 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 17 Feb 2026 12:02:34 +0100
Subject: [PATCH] namespace: record fully visible mounts in list

Instead of wading through all the mounts in the mount namespace rbtree
to find fully visible procfs and sysfs mounts, be honest about them
being special cruft and record them in a separate per-mount namespace
list.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mount.h     |  4 ++++
 fs/namespace.c | 19 +++++++++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index e0816c11a198..5df134d56d47 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -25,6 +25,7 @@ struct mnt_namespace {
 	__u32			n_fsnotify_mask;
 	struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
 #endif
+	struct hlist_head	mnt_visible_mounts; /* SB_I_USERNS_VISIBLE mounts */
 	unsigned int		nr_mounts; /* # of mounts in the namespace */
 	unsigned int		pending_mounts;
 	refcount_t		passive; /* number references not pinning @mounts */
@@ -90,6 +91,7 @@ struct mount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	struct hlist_head mnt_pins;
 	struct hlist_head mnt_stuck_children;
+	struct hlist_node mnt_ns_visible; /* link in ns->mnt_visible_mounts */
 	struct mount *overmount;	/* mounted on ->mnt_root */
 } __randomize_layout;
 
@@ -207,6 +209,8 @@ static inline void move_from_ns(struct mount *mnt)
 		ns->mnt_first_node = rb_next(&mnt->mnt_node);
 	rb_erase(&mnt->mnt_node, &ns->mounts);
 	RB_CLEAR_NODE(&mnt->mnt_node);
+	if (!hlist_unhashed(&mnt->mnt_ns_visible))
+		hlist_del_init(&mnt->mnt_ns_visible);
 }
 
 bool has_locked_children(struct mount *mnt, struct dentry *dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index a67cbe42746d..764081c690d5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -321,6 +321,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_HLIST_NODE(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
+		INIT_HLIST_NODE(&mnt->mnt_ns_visible);
 		RB_CLEAR_NODE(&mnt->mnt_node);
 		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
 	}
@@ -1098,6 +1099,10 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
 	rb_link_node(&mnt->mnt_node, parent, link);
 	rb_insert_color(&mnt->mnt_node, &ns->mounts);
 
+	if ((mnt->mnt.mnt_sb->s_iflags & SB_I_USERNS_VISIBLE) &&
+	    mnt->mnt.mnt_root == mnt->mnt.mnt_sb->s_root)
+		hlist_add_head(&mnt->mnt_ns_visible, &ns->mnt_visible_mounts);
+
 	mnt_notify_add(mnt);
 }
 
@@ -6295,22 +6300,20 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
 				int *new_mnt_flags)
 {
 	int new_flags = *new_mnt_flags;
-	struct mount *mnt, *n;
+	struct mount *mnt;
+
+	/* Don't acquire namespace semaphore without a good reason. */
+	if (hlist_empty(&ns->mnt_visible_mounts))
+		return false;
 
 	guard(namespace_shared)();
-	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
+	hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) {
 		struct mount *child;
 		int mnt_flags;
 
 		if (mnt->mnt.mnt_sb->s_type != sb->s_type)
 			continue;
 
-		/* This mount is not fully visible if it's root directory
-		 * is not the root directory of the filesystem.
-		 */
-		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
-			continue;
-
 		/* A local view of the mount flags */
 		mnt_flags = mnt->mnt.mnt_flags;
 
-- 
2.47.3

[PATCH v8 1/5] docs: proc: add documentation about mount restrictions
[PATCH v8 2/5] proc: subset=pid: Show /proc/self/net only for CAP_NET_ADMIN
[PATCH v8 3/5] proc: Disable cancellation of subset=pid option
[PATCH v8 4/5] proc: Relax check of mount visibility
[PATCH v8 5/5] docs: proc: add documentation about relaxing visibility restrictions