[PATCH v4 2/4] procfs: add "pidns" mount option

Aleksa Sarai posted 4 patches 2 months ago
[PATCH v4 2/4] procfs: add "pidns" mount option
Posted by Aleksa Sarai 2 months ago
Since the introduction of pid namespaces, their interaction with procfs
has been entirely implicit in ways that require a lot of dancing around
by programs that need to construct sandboxes with different PID
namespaces.

Being able to explicitly specify the pid namespace to use when
constructing a procfs super block will allow programs to no longer need
to fork off a process which then does unshare(2) / setns(2) and
forks again in order to construct a procfs in a pidns.

So, provide a "pidns" mount option which allows such users to just
explicitly state which pid namespace they want that procfs instance to
use. This interface can be used with fsconfig(2) either with a file
descriptor or a path:

  fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd);
  fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0);

or with classic mount(2) / mount(8):

  // mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc
  mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid");

As this new API is effectively shorthand for setns(2) followed by
mount(2), the permission model for this mirrors pidns_install() to avoid
opening up new attack surfaces by loosening the existing permission
model.

In order to avoid having to RCU-protect all users of proc_pid_ns() (to
avoid UAFs), attempting to reconfigure an existing procfs instance's pid
namespace will error out with -EBUSY. Creating new procfs instances is
quite cheap, so this should not be an impediment to most users, and lets
us avoid a lot of churn in fs/proc/* for a feature that it seems
unlikely userspace would use.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 Documentation/filesystems/proc.rst |  8 ++++
 fs/proc/root.c                     | 98 +++++++++++++++++++++++++++++++++++---
 2 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 5236cb52e357..5a157dadea0b 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -2360,6 +2360,7 @@ The following mount options are supported:
 	hidepid=	Set /proc/<pid>/ access mode.
 	gid=		Set the group authorized to learn processes information.
 	subset=		Show only the specified subset of procfs.
+	pidns=		Specify the pid namespace used by this procfs.
 	=========	========================================================
 
 hidepid=off or hidepid=0 means classic mode - everybody may access all
@@ -2392,6 +2393,13 @@ information about processes information, just add identd to this group.
 subset=pid hides all top level files and directories in the procfs that
 are not related to tasks.
 
+pidns= specifies a pid namespace (either as a string path to something like
+`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that
+will be used by the procfs instance when translating pids. By default, procfs
+will use the calling process's active pid namespace. Note that the pid
+namespace of an existing procfs instance cannot be modified (attempting to do
+so will give an `-EBUSY` error).
+
 Chapter 5: Filesystem behavior
 ==============================
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ed86ac710384..fd1f1c8a939a 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -38,12 +38,14 @@ enum proc_param {
 	Opt_gid,
 	Opt_hidepid,
 	Opt_subset,
+	Opt_pidns,
 };
 
 static const struct fs_parameter_spec proc_fs_parameters[] = {
-	fsparam_u32("gid",	Opt_gid),
+	fsparam_u32("gid",		Opt_gid),
 	fsparam_string("hidepid",	Opt_hidepid),
 	fsparam_string("subset",	Opt_subset),
+	fsparam_file_or_string("pidns",	Opt_pidns),
 	{}
 };
 
@@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
 	return 0;
 }
 
+#ifdef CONFIG_PID_NS
+static int proc_parse_pidns_param(struct fs_context *fc,
+				  struct fs_parameter *param,
+				  struct fs_parse_result *result)
+{
+	struct proc_fs_context *ctx = fc->fs_private;
+	struct pid_namespace *target, *active = task_active_pid_ns(current);
+	struct ns_common *ns;
+	struct file *ns_filp __free(fput) = NULL;
+
+	switch (param->type) {
+	case fs_value_is_file:
+		/* came through fsconfig, steal the file reference */
+		ns_filp = no_free_ptr(param->file);
+		break;
+	case fs_value_is_string:
+		ns_filp = filp_open(param->string, O_RDONLY, 0);
+		break;
+	default:
+		WARN_ON_ONCE(true);
+		break;
+	}
+	if (!ns_filp)
+		ns_filp = ERR_PTR(-EBADF);
+	if (IS_ERR(ns_filp)) {
+		errorfc(fc, "could not get file from pidns argument");
+		return PTR_ERR(ns_filp);
+	}
+
+	if (!proc_ns_file(ns_filp))
+		return invalfc(fc, "pidns argument is not an nsfs file");
+	ns = get_proc_ns(file_inode(ns_filp));
+	if (ns->ops->type != CLONE_NEWPID)
+		return invalfc(fc, "pidns argument is not a pidns file");
+	target = container_of(ns, struct pid_namespace, ns);
+
+	/*
+	 * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
+	 * permission model should be the same as pidns_install().
+	 */
+	if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
+		errorfc(fc, "insufficient permissions to set pidns");
+		return -EPERM;
+	}
+	if (!pidns_is_ancestor(target, active))
+		return invalfc(fc, "cannot set pidns to non-descendant pidns");
+
+	put_pid_ns(ctx->pid_ns);
+	ctx->pid_ns = get_pid_ns(target);
+	put_user_ns(fc->user_ns);
+	fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+	return 0;
+}
+#endif /* CONFIG_PID_NS */
+
 static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
 	struct proc_fs_context *ctx = fc->fs_private;
 	struct fs_parse_result result;
-	int opt;
+	int opt, err;
 
 	opt = fs_parse(fc, proc_fs_parameters, param, &result);
 	if (opt < 0)
@@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
 		break;
 
 	case Opt_hidepid:
-		if (proc_parse_hidepid_param(fc, param))
-			return -EINVAL;
+		err = proc_parse_hidepid_param(fc, param);
+		if (err)
+			return err;
 		break;
 
 	case Opt_subset:
-		if (proc_parse_subset_param(fc, param->string) < 0)
-			return -EINVAL;
+		err = proc_parse_subset_param(fc, param->string);
+		if (err)
+			return err;
+		break;
+
+	case Opt_pidns:
+#ifdef CONFIG_PID_NS
+		/*
+		 * We would have to RCU-protect every proc_pid_ns() or
+		 * proc_sb_info() access if we allowed this to be reconfigured
+		 * for an existing procfs instance. Luckily, procfs instances
+		 * are cheap to create, and mount-beneath would let you
+		 * atomically replace an instance even with overmounts.
+		 */
+		if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+			errorfc(fc, "cannot reconfigure pidns for existing procfs");
+			return -EBUSY;
+		}
+		err = proc_parse_pidns_param(fc, param, &result);
+		if (err)
+			return err;
 		break;
+#else
+		errorfc(fc, "pidns mount flag not supported on this system");
+		return -EOPNOTSUPP;
+#endif
 
 	default:
 		return -EINVAL;
@@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
 		fs_info->hide_pid = ctx->hidepid;
 	if (ctx->mask & (1 << Opt_subset))
 		fs_info->pidonly = ctx->pidonly;
+	if (ctx->mask & (1 << Opt_pidns) &&
+	    !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
+		put_pid_ns(fs_info->pid_ns);
+		fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+	}
 }
 
 static int proc_fill_super(struct super_block *s, struct fs_context *fc)

-- 
2.50.1
Re: [PATCH v4 2/4] procfs: add "pidns" mount option
Posted by Aleksa Sarai 2 months ago
On 2025-08-05, Aleksa Sarai <cyphar@cyphar.com> wrote:
> Since the introduction of pid namespaces, their interaction with procfs
> has been entirely implicit in ways that require a lot of dancing around
> by programs that need to construct sandboxes with different PID
> namespaces.
> 
> Being able to explicitly specify the pid namespace to use when
> constructing a procfs super block will allow programs to no longer need
> to fork off a process which does then does unshare(2) / setns(2) and
> forks again in order to construct a procfs in a pidns.
> 
> So, provide a "pidns" mount option which allows such users to just
> explicitly state which pid namespace they want that procfs instance to
> use. This interface can be used with fsconfig(2) either with a file
> descriptor or a path:
> 
>   fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd);
>   fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0);
> 
> or with classic mount(2) / mount(8):
> 
>   // mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc
>   mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid");
> 
> As this new API is effectively shorthand for setns(2) followed by
> mount(2), the permission model for this mirrors pidns_install() to avoid
> opening up new attack surfaces by loosening the existing permission
> model.
> 
> In order to avoid having to RCU-protect all users of proc_pid_ns() (to
> avoid UAFs), attempting to reconfigure an existing procfs instance's pid
> namespace will error out with -EBUSY. Creating new procfs instances is
> quite cheap, so this should not be an impediment to most users, and lets
> us avoid a lot of churn in fs/proc/* for a feature that it seems
> unlikely userspace would use.
> 
> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
> ---
>  Documentation/filesystems/proc.rst |  8 ++++
>  fs/proc/root.c                     | 98 +++++++++++++++++++++++++++++++++++---
>  2 files changed, 100 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
> index 5236cb52e357..5a157dadea0b 100644
> --- a/Documentation/filesystems/proc.rst
> +++ b/Documentation/filesystems/proc.rst
> @@ -2360,6 +2360,7 @@ The following mount options are supported:
>  	hidepid=	Set /proc/<pid>/ access mode.
>  	gid=		Set the group authorized to learn processes information.
>  	subset=		Show only the specified subset of procfs.
> +	pidns=		Specify a the namespace used by this procfs.
>  	=========	========================================================
>  
>  hidepid=off or hidepid=0 means classic mode - everybody may access all
> @@ -2392,6 +2393,13 @@ information about processes information, just add identd to this group.
>  subset=pid hides all top level files and directories in the procfs that
>  are not related to tasks.
>  
> +pidns= specifies a pid namespace (either as a string path to something like
> +`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that
> +will be used by the procfs instance when translating pids. By default, procfs
> +will use the calling process's active pid namespace. Note that the pid
> +namespace of an existing procfs instance cannot be modified (attempting to do
> +so will give an `-EBUSY` error).
> +
>  Chapter 5: Filesystem behavior
>  ==============================
>  
> diff --git a/fs/proc/root.c b/fs/proc/root.c
> index ed86ac710384..fd1f1c8a939a 100644
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -38,12 +38,14 @@ enum proc_param {
>  	Opt_gid,
>  	Opt_hidepid,
>  	Opt_subset,
> +	Opt_pidns,
>  };
>  
>  static const struct fs_parameter_spec proc_fs_parameters[] = {
> -	fsparam_u32("gid",	Opt_gid),
> +	fsparam_u32("gid",		Opt_gid),
>  	fsparam_string("hidepid",	Opt_hidepid),
>  	fsparam_string("subset",	Opt_subset),
> +	fsparam_file_or_string("pidns",	Opt_pidns),
>  	{}
>  };
>  
> @@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_PID_NS
> +static int proc_parse_pidns_param(struct fs_context *fc,
> +				  struct fs_parameter *param,
> +				  struct fs_parse_result *result)
> +{
> +	struct proc_fs_context *ctx = fc->fs_private;
> +	struct pid_namespace *target, *active = task_active_pid_ns(current);
> +	struct ns_common *ns;
> +	struct file *ns_filp __free(fput) = NULL;
> +
> +	switch (param->type) {
> +	case fs_value_is_file:
> +		/* came through fsconfig, steal the file reference */
> +		ns_filp = no_free_ptr(param->file);
> +		break;
> +	case fs_value_is_string:
> +		ns_filp = filp_open(param->string, O_RDONLY, 0);
> +		break;

I just realised that we probably also want to support FSCONFIG_SET_PATH
here, but fsparam_file_or_string() doesn't handle that at the moment. I
think we probably want to have fsparam_file_or_path() which would act
like:

 1. A path with FSCONFIG_SET_STRING and FSCONFIG_SET_PATH.
 2. A file with FSCONFIG_SET_FD.

These are the semantics I would already expect from these kinds of
flags, but at the moment FSCONFIG_SET_PATH is entirely disallowed.

@Amir:

I wonder if overlayfs (the only other user of fsparam_file_or_string())
would also prefer having these semantics? We could just migrate
fsparam_file_or_string() to fsparam_file_or_path() everywhere, since I'm
pretty sure these are the semantics userspace expects anyway.

> +	default:
> +		WARN_ON_ONCE(true);
> +		break;
> +	}
> +	if (!ns_filp)
> +		ns_filp = ERR_PTR(-EBADF);
> +	if (IS_ERR(ns_filp)) {
> +		errorfc(fc, "could not get file from pidns argument");
> +		return PTR_ERR(ns_filp);
> +	}
> +
> +	if (!proc_ns_file(ns_filp))
> +		return invalfc(fc, "pidns argument is not an nsfs file");
> +	ns = get_proc_ns(file_inode(ns_filp));
> +	if (ns->ops->type != CLONE_NEWPID)
> +		return invalfc(fc, "pidns argument is not a pidns file");
> +	target = container_of(ns, struct pid_namespace, ns);
> +
> +	/*
> +	 * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
> +	 * permission model should be the same as pidns_install().
> +	 */
> +	if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
> +		errorfc(fc, "insufficient permissions to set pidns");
> +		return -EPERM;
> +	}
> +	if (!pidns_is_ancestor(target, active))
> +		return invalfc(fc, "cannot set pidns to non-descendant pidns");
> +
> +	put_pid_ns(ctx->pid_ns);
> +	ctx->pid_ns = get_pid_ns(target);
> +	put_user_ns(fc->user_ns);
> +	fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
> +	return 0;
> +}
> +#endif /* CONFIG_PID_NS */
> +
>  static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
>  {
>  	struct proc_fs_context *ctx = fc->fs_private;
>  	struct fs_parse_result result;
> -	int opt;
> +	int opt, err;
>  
>  	opt = fs_parse(fc, proc_fs_parameters, param, &result);
>  	if (opt < 0)
> @@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
>  		break;
>  
>  	case Opt_hidepid:
> -		if (proc_parse_hidepid_param(fc, param))
> -			return -EINVAL;
> +		err = proc_parse_hidepid_param(fc, param);
> +		if (err)
> +			return err;
>  		break;
>  
>  	case Opt_subset:
> -		if (proc_parse_subset_param(fc, param->string) < 0)
> -			return -EINVAL;
> +		err = proc_parse_subset_param(fc, param->string);
> +		if (err)
> +			return err;
> +		break;
> +
> +	case Opt_pidns:
> +#ifdef CONFIG_PID_NS
> +		/*
> +		 * We would have to RCU-protect every proc_pid_ns() or
> +		 * proc_sb_info() access if we allowed this to be reconfigured
> +		 * for an existing procfs instance. Luckily, procfs instances
> +		 * are cheap to create, and mount-beneath would let you
> +		 * atomically replace an instance even with overmounts.
> +		 */
> +		if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
> +			errorfc(fc, "cannot reconfigure pidns for existing procfs");
> +			return -EBUSY;
> +		}
> +		err = proc_parse_pidns_param(fc, param, &result);
> +		if (err)
> +			return err;
>  		break;
> +#else
> +		errorfc(fc, "pidns mount flag not supported on this system");
> +		return -EOPNOTSUPP;
> +#endif
>  
>  	default:
>  		return -EINVAL;
> @@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
>  		fs_info->hide_pid = ctx->hidepid;
>  	if (ctx->mask & (1 << Opt_subset))
>  		fs_info->pidonly = ctx->pidonly;
> +	if (ctx->mask & (1 << Opt_pidns) &&
> +	    !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
> +		put_pid_ns(fs_info->pid_ns);
> +		fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
> +	}
>  }
>  
>  static int proc_fill_super(struct super_block *s, struct fs_context *fc)
> 
> -- 
> 2.50.1
> 

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/
Re: [PATCH v4 2/4] procfs: add "pidns" mount option
Posted by Askar Safin 2 months ago
> I just realised that we probably also want to support FSCONFIG_SET_PATH

I just checked the kernel code. Indeed, nobody uses FSCONFIG_SET_PATH. Moreover, the fsparam_path macro has been present since 5.1, and in all this time nobody has used it. So, let's just remove FSCONFIG_SET_PATH. Nobody used it, so this will not break anything.

If you are okay with that, I can submit a patch removing it.

--
Askar Safin
Re: [PATCH v4 2/4] procfs: add "pidns" mount option
Posted by Aleksa Sarai 1 month, 4 weeks ago
On 2025-08-06, Askar Safin <safinaskar@zohomail.com> wrote:
> > I just realised that we probably also want to support FSCONFIG_SET_PATH
> 
> I just checked kernel code. Indeed nobody uses FSCONFIG_SET_PATH.
> Moreover, fsparam_path macro is present since 5.1. And for all this
> time nobody used it. So, let's just remove FSCONFIG_SET_PATH. Nobody
> used it, so this will not break anything.
> 
> If you okay with that, I can submit patch, removing it.

I would prefer you didn't -- "*at()" semantics are very useful to a lot
of programs (*especially* AT_EMPTY_PATH). I would like the pidns= stuff
to support it, and probably also overlayfs...

I suspect the primary issue is that when migrating to the new mount API,
filesystem devs just went with the easiest thing to use
(FSCONFIG_SET_STRING) even though FSCONFIG_SET_PATH would be better. I
suspect the lack of documentation around fsconfig(2) played a part too.

My impression is that interest in the minutiae of fsconfig(2) is quite
low on the list of priorities for most filesystem devs, and so the neat
aspects of fsconfig(2) haven't been fully utilised. (In LPC last year,
we struggled to come to an agreement on how filesystems should use the
read(2)-based error interface.)

We can very easily move fsparam_string() or fsparam_file_or_string()
parameters to fsparam_path() and a future fsparam_file_or_path(). I
would much prefer that as a user.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/
Re: [PATCH v4 2/4] procfs: add "pidns" mount option
Posted by Aleksa Sarai 1 month, 4 weeks ago
On 2025-08-07, Aleksa Sarai <cyphar@cyphar.com> wrote:
> On 2025-08-06, Askar Safin <safinaskar@zohomail.com> wrote:
> > > I just realised that we probably also want to support FSCONFIG_SET_PATH
> > 
> > I just checked kernel code. Indeed nobody uses FSCONFIG_SET_PATH.
> > Moreover, fsparam_path macro is present since 5.1. And for all this
> > time nobody used it. So, let's just remove FSCONFIG_SET_PATH. Nobody
> > used it, so this will not break anything.
> > 
> > If you okay with that, I can submit patch, removing it.
> 
> I would prefer you didn't -- "*at()" semantics are very useful to a lot
> of programs (*especially* AT_EMPTY_PATH). I would like the pidns= stuff
> to support it, and probably also overlayfs...
> 
> I suspect the primary issue is that when migrating to the new mount API,
> filesystem devs just went with the easiest thing to use
> (FSCONFIG_SET_STRING) even though FSCONFIG_SET_PATH would be better. I
> suspect the lack of documentation around fsconfig(2) played a part too.
> 
> My impression is that interest in the minutia about fsconfig(2) is quite
> low on the list of priorities for most filesystem devs, and so the neat
> aspects of fsconfig(2) haven't been fully utilised. (In LPC last year,
> we struggled to come to an agreement on how filesystems should use the
> read(2)-based error interface.)
> 
> We can very easily move fsparam_string() or fsparam_file_or_string()
> parameters to fsparam_path() and a future fsparam_file_or_path(). I
> would much prefer that as a user.

Actually, fsparam_bdev() accepts FSCONFIG_SET_PATH in a very roundabout
way (and the checker doesn't verify anything...?). So there is at least
one user (ext4's "journal_path"), it's just not well-documented (which
I'm trying to fix ;]).

My plan is to update fs_lookup_param() to be more useful for the (fairly
common) use-case of wanting to support paths and file descriptors, and
going through to clean up some of these unused fsparam_* helpers (or
fsparam_* helpers being abused to implement stuff that the fs_parser
core already supports).

At the very least, overlayfs, ext4, and this procfs patchset can make
use of it.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/
Re: [PATCH v4 2/4] procfs: add "pidns" mount option
Posted by Christian Brauner 1 month, 3 weeks ago
On Thu, Aug 07, 2025 at 05:17:56PM +1000, Aleksa Sarai wrote:
> On 2025-08-07, Aleksa Sarai <cyphar@cyphar.com> wrote:
> > On 2025-08-06, Askar Safin <safinaskar@zohomail.com> wrote:
> > > > I just realised that we probably also want to support FSCONFIG_SET_PATH
> > > 
> > > I just checked kernel code. Indeed nobody uses FSCONFIG_SET_PATH.
> > > Moreover, fsparam_path macro is present since 5.1. And for all this
> > > time nobody used it. So, let's just remove FSCONFIG_SET_PATH. Nobody
> > > used it, so this will not break anything.
> > > 
> > > If you okay with that, I can submit patch, removing it.
> > 
> > I would prefer you didn't -- "*at()" semantics are very useful to a lot
> > of programs (*especially* AT_EMPTY_PATH). I would like the pidns= stuff
> > to support it, and probably also overlayfs...
> > 
> > I suspect the primary issue is that when migrating to the new mount API,
> > filesystem devs just went with the easiest thing to use
> > (FSCONFIG_SET_STRING) even though FSCONFIG_SET_PATH would be better. I
> > suspect the lack of documentation around fsconfig(2) played a part too.
> > 
> > My impression is that interest in the minutia about fsconfig(2) is quite
> > low on the list of priorities for most filesystem devs, and so the neat
> > aspects of fsconfig(2) haven't been fully utilised. (In LPC last year,
> > we struggled to come to an agreement on how filesystems should use the
> > read(2)-based error interface.)
> > 
> > We can very easily move fsparam_string() or fsparam_file_or_string()
> > parameters to fsparam_path() and a future fsparam_file_or_path(). I
> > would much prefer that as a user.
> 
> Actually, fsparam_bdev() accepts FSCONFIG_SET_PATH in a very roundabout
> way (and the checker doesn't verify anything...?). So there is at least
> one user (ext4's "journal_path"), it's just not well-documented (which
> I'm trying to fix ;]).
> 
> My plan is to update fs_lookup_param() to be more useful for the (fairly
> common) use-case of wanting to support paths and file descriptors, and
> going through to clean up some of these unused fsparam_* helpers (or
> fsparam_* helpers being abused to implement stuff that the fs_parser
> core already supports).
> 
> At the very least, overlayfs, ext4, and this procfs patchset can make
> use of it.

I've never bothered with actually implementing FSCONFIG_SET_PATH
semantics because I think it's really weird to allow *at semantics when
setting filesystem parameters. I always thought it's better to force
userspace to provide a file descriptor for the final destination instead
of doing some arcane lookup variant for mount configuration. But I'm
happy to be convinced of its usefulness...
Re: [PATCH v4 2/4] procfs: add "pidns" mount option
Posted by Aleksa Sarai 1 month, 3 weeks ago
On 2025-08-08, Christian Brauner <brauner@kernel.org> wrote:
> On Thu, Aug 07, 2025 at 05:17:56PM +1000, Aleksa Sarai wrote:
> > On 2025-08-07, Aleksa Sarai <cyphar@cyphar.com> wrote:
> > > On 2025-08-06, Askar Safin <safinaskar@zohomail.com> wrote:
> > > > > I just realised that we probably also want to support FSCONFIG_SET_PATH
> > > > 
> > > > I just checked kernel code. Indeed nobody uses FSCONFIG_SET_PATH.
> > > > Moreover, fsparam_path macro is present since 5.1. And for all this
> > > > time nobody used it. So, let's just remove FSCONFIG_SET_PATH. Nobody
> > > > used it, so this will not break anything.
> > > > 
> > > > If you okay with that, I can submit patch, removing it.
> > > 
> > > I would prefer you didn't -- "*at()" semantics are very useful to a lot
> > > of programs (*especially* AT_EMPTY_PATH). I would like the pidns= stuff
> > > to support it, and probably also overlayfs...
> > > 
> > > I suspect the primary issue is that when migrating to the new mount API,
> > > filesystem devs just went with the easiest thing to use
> > > (FSCONFIG_SET_STRING) even though FSCONFIG_SET_PATH would be better. I
> > > suspect the lack of documentation around fsconfig(2) played a part too.
> > > 
> > > My impression is that interest in the minutia about fsconfig(2) is quite
> > > low on the list of priorities for most filesystem devs, and so the neat
> > > aspects of fsconfig(2) haven't been fully utilised. (In LPC last year,
> > > we struggled to come to an agreement on how filesystems should use the
> > > read(2)-based error interface.)
> > > 
> > > We can very easily move fsparam_string() or fsparam_file_or_string()
> > > parameters to fsparam_path() and a future fsparam_file_or_path(). I
> > > would much prefer that as a user.
> > 
> > Actually, fsparam_bdev() accepts FSCONFIG_SET_PATH in a very roundabout
> > way (and the checker doesn't verify anything...?). So there is at least
> > one user (ext4's "journal_path"), it's just not well-documented (which
> > I'm trying to fix ;]).
> > 
> > My plan is to update fs_lookup_param() to be more useful for the (fairly
> > common) use-case of wanting to support paths and file descriptors, and
> > going through to clean up some of these unused fsparam_* helpers (or
> > fsparam_* helpers being abused to implement stuff that the fs_parser
> > core already supports).
> > 
> > At the very least, overlayfs, ext4, and this procfs patchset can make
> > use of it.
> 
> I've never bothered with actually iplementing FSCONFIG_SET_PATH
> semantics because I think it's really weird to allow *at semantics when
> setting filesystem parameters. I always thought it's better to force
> userspace to provide a file descriptor for the final destination instead
> of doing some arcane lookup variant for mount configuration. But I'm
> happy to be convinced of its usefulness...

I do think it's useful, and here's my thought process...

Most filesystems have to take string path parameters in order to support
mount(2) and work with mount(8). Yes, fsparam_fd() will accept
FSCONFIG_SET_STRING by parsing it as a decimal string, but there are
only two users of fsparam_fd() and honestly I'm not convinced this is a
particularly sane API for anything other than strict backcompat reasons
(the API only makes sense as a file descriptor and you want mount(8) to
be able to use it).

So you end up with most parameters supporting paths set using
FSCONFIG_SET_STRING anyway, meaning in-kernel lookups can't be taken off
the table. And if we accept paths for lookup, then (for the same reason
we have *at(2) syscalls) it is preferable to allow specifying dirfds. So
FSCONFIG_SET_PATH should also be supported.

And as there is no infrastructure to block FSCONFIG_SET_PATH_EMPTY
arguments (yes, you can do it manually, but the *only* user of
fs_lookup_param() doesn't), then anything that accepts FSCONFIG_SET_PATH
currently also accepts FSCONFIG_SET_PATH_EMPTY which is "morally
equivalent" to FSCONFIG_SET_FD. So unless you block
FSCONFIG_SET_PATH_EMPTY then FSCONFIG_SET_FD should probably also be
supported (there is the re-opening distinction, of course, but that is
not relevant if you use filename_lookup() -- which is what filesystems
will do in practice).

So my impression is that most users (if they had an fsconfig(2) man page
to read...) would expect parameters that accept paths to either:

* Work with FSCONFIG_SET_STRING and FSCONFIG_SET_PATH only; or
* Work with FSCONFIG_SET_STRING, FSCONFIG_SET_PATH,
  FSCONFIG_SET_PATH_EMPTY, and FSCONFIG_SET_FD.

Currently, none of our parameters work that way.

 * ext4's journal_path takes FSCONFIG_SET_STRING, FSCONFIG_SET_PATH, and
   FSCONFIG_SET_PATH_EMPTY.
 * overlayfs takes FSCONFIG_SET_FD and FSCONFIG_SET_STRING.

I only fully realised how inconsistent this is while working on the
fsconfig(2) man pages -- at the moment I have a very long paragraph
explaining that there is this distinction in-kernel, but this really
doesn't seem intentional to me. I would be very confused as a user that
FSCONFIG_SET_PATH is useless for most filesystem *path* parameters, even
though the filesystem accepts them as FSCONFIG_SET_STRING.

As for practical uses, it would be nice to not have to open 500 files in
order to create a 500-layer overlayfs.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/
Re: [PATCH v4 2/4] procfs: add "pidns" mount option
Posted by Randy Dunlap 2 months ago
Hi,

On 8/4/25 10:45 PM, Aleksa Sarai wrote:
> Since the introduction of pid namespaces, their interaction with procfs
> has been entirely implicit in ways that require a lot of dancing around
> by programs that need to construct sandboxes with different PID
> namespaces.
> 
> Being able to explicitly specify the pid namespace to use when
> constructing a procfs super block will allow programs to no longer need
> to fork off a process which does then does unshare(2) / setns(2) and
> forks again in order to construct a procfs in a pidns.
> 
> So, provide a "pidns" mount option which allows such users to just
> explicitly state which pid namespace they want that procfs instance to
> use. This interface can be used with fsconfig(2) either with a file
> descriptor or a path:
> 
>   fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd);
>   fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0);
> 
> or with classic mount(2) / mount(8):
> 
>   // mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc
>   mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid");
> 
> As this new API is effectively shorthand for setns(2) followed by
> mount(2), the permission model for this mirrors pidns_install() to avoid
> opening up new attack surfaces by loosening the existing permission
> model.
> 
> In order to avoid having to RCU-protect all users of proc_pid_ns() (to
> avoid UAFs), attempting to reconfigure an existing procfs instance's pid
> namespace will error out with -EBUSY. Creating new procfs instances is
> quite cheap, so this should not be an impediment to most users, and lets
> us avoid a lot of churn in fs/proc/* for a feature that it seems
> unlikely userspace would use.
> 
> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
> ---
>  Documentation/filesystems/proc.rst |  8 ++++
>  fs/proc/root.c                     | 98 +++++++++++++++++++++++++++++++++++---
>  2 files changed, 100 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
> index 5236cb52e357..5a157dadea0b 100644
> --- a/Documentation/filesystems/proc.rst
> +++ b/Documentation/filesystems/proc.rst
> @@ -2360,6 +2360,7 @@ The following mount options are supported:
>  	hidepid=	Set /proc/<pid>/ access mode.
>  	gid=		Set the group authorized to learn processes information.
>  	subset=		Show only the specified subset of procfs.
> +	pidns=		Specify a the namespace used by this procfs.

			drop ^^ a

>  	=========	========================================================
>  
>  hidepid=off or hidepid=0 means classic mode - everybody may access all
> @@ -2392,6 +2393,13 @@ information about processes information, just add identd to this group.
>  subset=pid hides all top level files and directories in the procfs that
>  are not related to tasks.
>  
> +pidns= specifies a pid namespace (either as a string path to something like
> +`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that
> +will be used by the procfs instance when translating pids. By default, procfs
> +will use the calling process's active pid namespace. Note that the pid
> +namespace of an existing procfs instance cannot be modified (attempting to do
> +so will give an `-EBUSY` error).
> +
>  Chapter 5: Filesystem behavior
>  ==============================
>  
-- 
~Randy