Since the introduction of pid namespaces, their interaction with procfs
has been entirely implicit in ways that require a lot of dancing around
by programs that need to construct sandboxes with different PID
namespaces.
Being able to explicitly specify the pid namespace to use when
constructing a procfs super block will allow programs to no longer need
to fork off a process which then does unshare(2) / setns(2) and
forks again in order to construct a procfs in a pidns.
So, provide a "pidns" mount option which allows such users to just
explicitly state which pid namespace they want that procfs instance to
use. This interface can be used with fsconfig(2) either with a file
descriptor or a path:
fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd);
fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0);
or with classic mount(2) / mount(8):
// mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc
mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid");
As this new API is effectively shorthand for setns(2) followed by
mount(2), the permission model for this mirrors pidns_install() to avoid
opening up new attack surfaces by loosening the existing permission
model.
In order to avoid having to RCU-protect all users of proc_pid_ns() (to
avoid UAFs), attempting to reconfigure an existing procfs instance's pid
namespace will error out with -EBUSY. Creating new procfs instances is
quite cheap, so this should not be an impediment to most users, and lets
us avoid a lot of churn in fs/proc/* for a feature that it seems
unlikely userspace would use.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
Documentation/filesystems/proc.rst | 8 ++++
fs/proc/root.c | 98 +++++++++++++++++++++++++++++++++++---
2 files changed, 100 insertions(+), 6 deletions(-)
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 5236cb52e357..5a157dadea0b 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -2360,6 +2360,7 @@ The following mount options are supported:
hidepid= Set /proc/<pid>/ access mode.
gid= Set the group authorized to learn processes information.
subset= Show only the specified subset of procfs.
+ pidns= Specify the pid namespace used by this procfs.
========= ========================================================
hidepid=off or hidepid=0 means classic mode - everybody may access all
@@ -2392,6 +2393,13 @@ information about processes information, just add identd to this group.
subset=pid hides all top level files and directories in the procfs that
are not related to tasks.
+pidns= specifies a pid namespace (either as a string path to something like
+`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that
+will be used by the procfs instance when translating pids. By default, procfs
+will use the calling process's active pid namespace. Note that the pid
+namespace of an existing procfs instance cannot be modified (attempting to do
+so will give an `-EBUSY` error).
+
Chapter 5: Filesystem behavior
==============================
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ed86ac710384..fd1f1c8a939a 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -38,12 +38,14 @@ enum proc_param {
Opt_gid,
Opt_hidepid,
Opt_subset,
+ Opt_pidns,
};
static const struct fs_parameter_spec proc_fs_parameters[] = {
- fsparam_u32("gid", Opt_gid),
+ fsparam_u32("gid", Opt_gid),
fsparam_string("hidepid", Opt_hidepid),
fsparam_string("subset", Opt_subset),
+ fsparam_file_or_string("pidns", Opt_pidns),
{}
};
@@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
return 0;
}
+#ifdef CONFIG_PID_NS
+static int proc_parse_pidns_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct pid_namespace *target, *active = task_active_pid_ns(current);
+ struct ns_common *ns;
+ struct file *ns_filp __free(fput) = NULL;
+
+ switch (param->type) {
+ case fs_value_is_file:
+ /* came through fsconfig, steal the file reference */
+ ns_filp = no_free_ptr(param->file);
+ break;
+ case fs_value_is_string:
+ ns_filp = filp_open(param->string, O_RDONLY, 0);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ break;
+ }
+ if (!ns_filp)
+ ns_filp = ERR_PTR(-EBADF);
+ if (IS_ERR(ns_filp)) {
+ errorfc(fc, "could not get file from pidns argument");
+ return PTR_ERR(ns_filp);
+ }
+
+ if (!proc_ns_file(ns_filp))
+ return invalfc(fc, "pidns argument is not an nsfs file");
+ ns = get_proc_ns(file_inode(ns_filp));
+ if (ns->ops->type != CLONE_NEWPID)
+ return invalfc(fc, "pidns argument is not a pidns file");
+ target = container_of(ns, struct pid_namespace, ns);
+
+ /*
+ * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
+ * permission model should be the same as pidns_install().
+ */
+ if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
+ errorfc(fc, "insufficient permissions to set pidns");
+ return -EPERM;
+ }
+ if (!pidns_is_ancestor(target, active))
+ return invalfc(fc, "cannot set pidns to non-descendant pidns");
+
+ put_pid_ns(ctx->pid_ns);
+ ctx->pid_ns = get_pid_ns(target);
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ return 0;
+}
+#endif /* CONFIG_PID_NS */
+
static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct proc_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ int opt, err;
opt = fs_parse(fc, proc_fs_parameters, param, &result);
if (opt < 0)
@@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_hidepid:
- if (proc_parse_hidepid_param(fc, param))
- return -EINVAL;
+ err = proc_parse_hidepid_param(fc, param);
+ if (err)
+ return err;
break;
case Opt_subset:
- if (proc_parse_subset_param(fc, param->string) < 0)
- return -EINVAL;
+ err = proc_parse_subset_param(fc, param->string);
+ if (err)
+ return err;
+ break;
+
+ case Opt_pidns:
+#ifdef CONFIG_PID_NS
+ /*
+ * We would have to RCU-protect every proc_pid_ns() or
+ * proc_sb_info() access if we allowed this to be reconfigured
+ * for an existing procfs instance. Luckily, procfs instances
+ * are cheap to create, and mount-beneath would let you
+ * atomically replace an instance even with overmounts.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ errorfc(fc, "cannot reconfigure pidns for existing procfs");
+ return -EBUSY;
+ }
+ err = proc_parse_pidns_param(fc, param, &result);
+ if (err)
+ return err;
break;
+#else
+ errorfc(fc, "pidns mount flag not supported on this system");
+ return -EOPNOTSUPP;
+#endif
default:
return -EINVAL;
@@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
fs_info->hide_pid = ctx->hidepid;
if (ctx->mask & (1 << Opt_subset))
fs_info->pidonly = ctx->pidonly;
+ if (ctx->mask & (1 << Opt_pidns) &&
+ !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
+ put_pid_ns(fs_info->pid_ns);
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ }
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
--
2.50.1
On 2025-08-05, Aleksa Sarai <cyphar@cyphar.com> wrote: > Since the introduction of pid namespaces, their interaction with procfs > has been entirely implicit in ways that require a lot of dancing around > by programs that need to construct sandboxes with different PID > namespaces. > > Being able to explicitly specify the pid namespace to use when > constructing a procfs super block will allow programs to no longer need > to fork off a process which does then does unshare(2) / setns(2) and > forks again in order to construct a procfs in a pidns. > > So, provide a "pidns" mount option which allows such users to just > explicitly state which pid namespace they want that procfs instance to > use. This interface can be used with fsconfig(2) either with a file > descriptor or a path: > > fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd); > fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0); > > or with classic mount(2) / mount(8): > > // mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc > mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid"); > > As this new API is effectively shorthand for setns(2) followed by > mount(2), the permission model for this mirrors pidns_install() to avoid > opening up new attack surfaces by loosening the existing permission > model. > > In order to avoid having to RCU-protect all users of proc_pid_ns() (to > avoid UAFs), attempting to reconfigure an existing procfs instance's pid > namespace will error out with -EBUSY. Creating new procfs instances is > quite cheap, so this should not be an impediment to most users, and lets > us avoid a lot of churn in fs/proc/* for a feature that it seems > unlikely userspace would use. 
> > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > --- > Documentation/filesystems/proc.rst | 8 ++++ > fs/proc/root.c | 98 +++++++++++++++++++++++++++++++++++--- > 2 files changed, 100 insertions(+), 6 deletions(-) > > diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst > index 5236cb52e357..5a157dadea0b 100644 > --- a/Documentation/filesystems/proc.rst > +++ b/Documentation/filesystems/proc.rst > @@ -2360,6 +2360,7 @@ The following mount options are supported: > hidepid= Set /proc/<pid>/ access mode. > gid= Set the group authorized to learn processes information. > subset= Show only the specified subset of procfs. > + pidns= Specify a the namespace used by this procfs. > ========= ======================================================== > > hidepid=off or hidepid=0 means classic mode - everybody may access all > @@ -2392,6 +2393,13 @@ information about processes information, just add identd to this group. > subset=pid hides all top level files and directories in the procfs that > are not related to tasks. > > +pidns= specifies a pid namespace (either as a string path to something like > +`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that > +will be used by the procfs instance when translating pids. By default, procfs > +will use the calling process's active pid namespace. Note that the pid > +namespace of an existing procfs instance cannot be modified (attempting to do > +so will give an `-EBUSY` error). 
> + > Chapter 5: Filesystem behavior > ============================== > > diff --git a/fs/proc/root.c b/fs/proc/root.c > index ed86ac710384..fd1f1c8a939a 100644 > --- a/fs/proc/root.c > +++ b/fs/proc/root.c > @@ -38,12 +38,14 @@ enum proc_param { > Opt_gid, > Opt_hidepid, > Opt_subset, > + Opt_pidns, > }; > > static const struct fs_parameter_spec proc_fs_parameters[] = { > - fsparam_u32("gid", Opt_gid), > + fsparam_u32("gid", Opt_gid), > fsparam_string("hidepid", Opt_hidepid), > fsparam_string("subset", Opt_subset), > + fsparam_file_or_string("pidns", Opt_pidns), > {} > }; > > @@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value) > return 0; > } > > +#ifdef CONFIG_PID_NS > +static int proc_parse_pidns_param(struct fs_context *fc, > + struct fs_parameter *param, > + struct fs_parse_result *result) > +{ > + struct proc_fs_context *ctx = fc->fs_private; > + struct pid_namespace *target, *active = task_active_pid_ns(current); > + struct ns_common *ns; > + struct file *ns_filp __free(fput) = NULL; > + > + switch (param->type) { > + case fs_value_is_file: > + /* came through fsconfig, steal the file reference */ > + ns_filp = no_free_ptr(param->file); > + break; > + case fs_value_is_string: > + ns_filp = filp_open(param->string, O_RDONLY, 0); > + break; I just realised that we probably also want to support FSCONFIG_SET_PATH here, but fsparam_file_or_string() doesn't handle that at the moment. I think we probably want to have fsparam_file_or_path() which would act like: 1. A path with FSCONFIG_SET_STRING and FSCONFIG_SET_PATH. 2. A file with FSCONFIG_SET_FD. These are the semantics I would already expect from these kinds of flags, but at the moment FSCONFIG_SET_PATH is entirely disallowed. @Amir: I wonder if overlayfs (the only other user of fsparam_file_or_string()) would also prefer having these semantics? 
We could just migrate fsparam_file_or_string() to fsparam_file_or_path() everwhere, since I'm pretty sure these are the semantics userspace expects anyway. > + default: > + WARN_ON_ONCE(true); > + break; > + } > + if (!ns_filp) > + ns_filp = ERR_PTR(-EBADF); > + if (IS_ERR(ns_filp)) { > + errorfc(fc, "could not get file from pidns argument"); > + return PTR_ERR(ns_filp); > + } > + > + if (!proc_ns_file(ns_filp)) > + return invalfc(fc, "pidns argument is not an nsfs file"); > + ns = get_proc_ns(file_inode(ns_filp)); > + if (ns->ops->type != CLONE_NEWPID) > + return invalfc(fc, "pidns argument is not a pidns file"); > + target = container_of(ns, struct pid_namespace, ns); > + > + /* > + * pidns= is shorthand for joining the pidns to get a fsopen fd, so the > + * permission model should be the same as pidns_install(). > + */ > + if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { > + errorfc(fc, "insufficient permissions to set pidns"); > + return -EPERM; > + } > + if (!pidns_is_ancestor(target, active)) > + return invalfc(fc, "cannot set pidns to non-descendant pidns"); > + > + put_pid_ns(ctx->pid_ns); > + ctx->pid_ns = get_pid_ns(target); > + put_user_ns(fc->user_ns); > + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); > + return 0; > +} > +#endif /* CONFIG_PID_NS */ > + > static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) > { > struct proc_fs_context *ctx = fc->fs_private; > struct fs_parse_result result; > - int opt; > + int opt, err; > > opt = fs_parse(fc, proc_fs_parameters, param, &result); > if (opt < 0) > @@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) > break; > > case Opt_hidepid: > - if (proc_parse_hidepid_param(fc, param)) > - return -EINVAL; > + err = proc_parse_hidepid_param(fc, param); > + if (err) > + return err; > break; > > case Opt_subset: > - if (proc_parse_subset_param(fc, param->string) < 0) > - return -EINVAL; > + err = proc_parse_subset_param(fc, param->string); > 
+ if (err) > + return err; > + break; > + > + case Opt_pidns: > +#ifdef CONFIG_PID_NS > + /* > + * We would have to RCU-protect every proc_pid_ns() or > + * proc_sb_info() access if we allowed this to be reconfigured > + * for an existing procfs instance. Luckily, procfs instances > + * are cheap to create, and mount-beneath would let you > + * atomically replace an instance even with overmounts. > + */ > + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { > + errorfc(fc, "cannot reconfigure pidns for existing procfs"); > + return -EBUSY; > + } > + err = proc_parse_pidns_param(fc, param, &result); > + if (err) > + return err; > break; > +#else > + errorfc(fc, "pidns mount flag not supported on this system"); > + return -EOPNOTSUPP; > +#endif > > default: > return -EINVAL; > @@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info, > fs_info->hide_pid = ctx->hidepid; > if (ctx->mask & (1 << Opt_subset)) > fs_info->pidonly = ctx->pidonly; > + if (ctx->mask & (1 << Opt_pidns) && > + !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { > + put_pid_ns(fs_info->pid_ns); > + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); > + } > } > > static int proc_fill_super(struct super_block *s, struct fs_context *fc) > > -- > 2.50.1 > -- Aleksa Sarai Senior Software Engineer (Containers) SUSE Linux GmbH https://www.cyphar.com/
> I just realised that we probably also want to support FSCONFIG_SET_PATH I just checked the kernel code. Indeed, nobody uses FSCONFIG_SET_PATH. Moreover, the fsparam_path macro has been present since 5.1, and in all this time nobody has used it. So, let's just remove FSCONFIG_SET_PATH. Nobody used it, so this will not break anything. If you are okay with that, I can submit a patch removing it. -- Askar Safin
On 2025-08-06, Askar Safin <safinaskar@zohomail.com> wrote: > > I just realised that we probably also want to support FSCONFIG_SET_PATH > > I just checked kernel code. Indeed nobody uses FSCONFIG_SET_PATH. > Moreover, fsparam_path macro is present since 5.1. And for all this > time nobody used it. So, let's just remove FSCONFIG_SET_PATH. Nobody > used it, so this will not break anything. > > If you okay with that, I can submit patch, removing it. I would prefer you didn't -- "*at()" semantics are very useful to a lot of programs (*especially* AT_EMPTY_PATH). I would like the pidns= stuff to support it, and probably also overlayfs... I suspect the primary issue is that when migrating to the new mount API, filesystem devs just went with the easiest thing to use (FSCONFIG_SET_STRING) even though FSCONFIG_SET_PATH would be better. I suspect the lack of documentation around fsconfig(2) played a part too. My impression is that interest in the minutia about fsconfig(2) is quite low on the list of priorities for most filesystem devs, and so the neat aspects of fsconfig(2) haven't been fully utilised. (In LPC last year, we struggled to come to an agreement on how filesystems should use the read(2)-based error interface.) We can very easily move fsparam_string() or fsparam_file_or_string() parameters to fsparam_path() and a future fsparam_file_or_path(). I would much prefer that as a user. -- Aleksa Sarai Senior Software Engineer (Containers) SUSE Linux GmbH https://www.cyphar.com/
On 2025-08-07, Aleksa Sarai <cyphar@cyphar.com> wrote: > On 2025-08-06, Askar Safin <safinaskar@zohomail.com> wrote: > > > I just realised that we probably also want to support FSCONFIG_SET_PATH > > > > I just checked kernel code. Indeed nobody uses FSCONFIG_SET_PATH. > > Moreover, fsparam_path macro is present since 5.1. And for all this > > time nobody used it. So, let's just remove FSCONFIG_SET_PATH. Nobody > > used it, so this will not break anything. > > > > If you okay with that, I can submit patch, removing it. > > I would prefer you didn't -- "*at()" semantics are very useful to a lot > of programs (*especially* AT_EMPTY_PATH). I would like the pidns= stuff > to support it, and probably also overlayfs... > > I suspect the primary issue is that when migrating to the new mount API, > filesystem devs just went with the easiest thing to use > (FSCONFIG_SET_STRING) even though FSCONFIG_SET_PATH would be better. I > suspect the lack of documentation around fsconfig(2) played a part too. > > My impression is that interest in the minutia about fsconfig(2) is quite > low on the list of priorities for most filesystem devs, and so the neat > aspects of fsconfig(2) haven't been fully utilised. (In LPC last year, > we struggled to come to an agreement on how filesystems should use the > read(2)-based error interface.) > > We can very easily move fsparam_string() or fsparam_file_or_string() > parameters to fsparam_path() and a future fsparam_file_or_path(). I > would much prefer that as a user. Actually, fsparam_bdev() accepts FSCONFIG_SET_PATH in a very roundabout way (and the checker doesn't verify anything...?). So there is at least one user (ext4's "journal_path"), it's just not well-documented (which I'm trying to fix ;]). 
My plan is to update fs_lookup_param() to be more useful for the (fairly common) use-case of wanting to support paths and file descriptors, and going through to clean up some of these unused fsparam_* helpers (or fsparam_* helpers being abused to implement stuff that the fs_parser core already supports). At the very least, overlayfs, ext4, and this procfs patchset can make use of it. -- Aleksa Sarai Senior Software Engineer (Containers) SUSE Linux GmbH https://www.cyphar.com/
On Thu, Aug 07, 2025 at 05:17:56PM +1000, Aleksa Sarai wrote: > On 2025-08-07, Aleksa Sarai <cyphar@cyphar.com> wrote: > > On 2025-08-06, Askar Safin <safinaskar@zohomail.com> wrote: > > > > I just realised that we probably also want to support FSCONFIG_SET_PATH > > > > > > I just checked kernel code. Indeed nobody uses FSCONFIG_SET_PATH. > > > Moreover, fsparam_path macro is present since 5.1. And for all this > > > time nobody used it. So, let's just remove FSCONFIG_SET_PATH. Nobody > > > used it, so this will not break anything. > > > > > > If you okay with that, I can submit patch, removing it. > > > > I would prefer you didn't -- "*at()" semantics are very useful to a lot > > of programs (*especially* AT_EMPTY_PATH). I would like the pidns= stuff > > to support it, and probably also overlayfs... > > > > I suspect the primary issue is that when migrating to the new mount API, > > filesystem devs just went with the easiest thing to use > > (FSCONFIG_SET_STRING) even though FSCONFIG_SET_PATH would be better. I > > suspect the lack of documentation around fsconfig(2) played a part too. > > > > My impression is that interest in the minutia about fsconfig(2) is quite > > low on the list of priorities for most filesystem devs, and so the neat > > aspects of fsconfig(2) haven't been fully utilised. (In LPC last year, > > we struggled to come to an agreement on how filesystems should use the > > read(2)-based error interface.) > > > > We can very easily move fsparam_string() or fsparam_file_or_string() > > parameters to fsparam_path() and a future fsparam_file_or_path(). I > > would much prefer that as a user. > > Actually, fsparam_bdev() accepts FSCONFIG_SET_PATH in a very roundabout > way (and the checker doesn't verify anything...?). So there is at least > one user (ext4's "journal_path"), it's just not well-documented (which > I'm trying to fix ;]). 
> > My plan is to update fs_lookup_param() to be more useful for the (fairly > common) use-case of wanting to support paths and file descriptors, and > going through to clean up some of these unused fsparam_* helpers (or > fsparam_* helpers being abused to implement stuff that the fs_parser > core already supports). > > At the very least, overlayfs, ext4, and this procfs patchset can make > use of it. I've never bothered with actually implementing FSCONFIG_SET_PATH semantics because I think it's really weird to allow *at semantics when setting filesystem parameters. I always thought it's better to force userspace to provide a file descriptor for the final destination instead of doing some arcane lookup variant for mount configuration. But I'm happy to be convinced of its usefulness...
On 2025-08-08, Christian Brauner <brauner@kernel.org> wrote: > On Thu, Aug 07, 2025 at 05:17:56PM +1000, Aleksa Sarai wrote: > > On 2025-08-07, Aleksa Sarai <cyphar@cyphar.com> wrote: > > > On 2025-08-06, Askar Safin <safinaskar@zohomail.com> wrote: > > > > > I just realised that we probably also want to support FSCONFIG_SET_PATH > > > > > > > > I just checked kernel code. Indeed nobody uses FSCONFIG_SET_PATH. > > > > Moreover, fsparam_path macro is present since 5.1. And for all this > > > > time nobody used it. So, let's just remove FSCONFIG_SET_PATH. Nobody > > > > used it, so this will not break anything. > > > > > > > > If you okay with that, I can submit patch, removing it. > > > > > > I would prefer you didn't -- "*at()" semantics are very useful to a lot > > > of programs (*especially* AT_EMPTY_PATH). I would like the pidns= stuff > > > to support it, and probably also overlayfs... > > > > > > I suspect the primary issue is that when migrating to the new mount API, > > > filesystem devs just went with the easiest thing to use > > > (FSCONFIG_SET_STRING) even though FSCONFIG_SET_PATH would be better. I > > > suspect the lack of documentation around fsconfig(2) played a part too. > > > > > > My impression is that interest in the minutia about fsconfig(2) is quite > > > low on the list of priorities for most filesystem devs, and so the neat > > > aspects of fsconfig(2) haven't been fully utilised. (In LPC last year, > > > we struggled to come to an agreement on how filesystems should use the > > > read(2)-based error interface.) > > > > > > We can very easily move fsparam_string() or fsparam_file_or_string() > > > parameters to fsparam_path() and a future fsparam_file_or_path(). I > > > would much prefer that as a user. > > > > Actually, fsparam_bdev() accepts FSCONFIG_SET_PATH in a very roundabout > > way (and the checker doesn't verify anything...?). 
So there is at least > > one user (ext4's "journal_path"), it's just not well-documented (which > > I'm trying to fix ;]). > > > > My plan is to update fs_lookup_param() to be more useful for the (fairly > > common) use-case of wanting to support paths and file descriptors, and > > going through to clean up some of these unused fsparam_* helpers (or > > fsparam_* helpers being abused to implement stuff that the fs_parser > > core already supports). > > > > At the very least, overlayfs, ext4, and this procfs patchset can make > > use of it. > > I've never bothered with actually iplementing FSCONFIG_SET_PATH > semantics because I think it's really weird to allow *at semantics when > setting filesystem parameters. I always thought it's better to force > userspace to provide a file descriptor for the final destination instead > of doing some arcane lookup variant for mount configuration. But I'm > happy to be convinced of its usefulness... I do think it's useful, and here's my thought process... Most filesystems have to take string path parameters in order to support mount(2) and work with mount(8). Yes, fsparam_fd() will accept FSCONFIG_SET_STRING by parsing it as a decimal string, but there are only two users of fsparam_fd() and honestly I'm not convinced this is a particularly sane API for anything other than strict backcompat reasons (the API only makes sense as a file descriptor and you want mount(8) to be able to use it). So you end up with most parameters supporting paths set using FSCONFIG_SET_STRING anyway, meaning in-kernel lookups can't be taken off the table. And if we accept paths for lookup, then (for the same reason we have *at(2) syscalls) it is preferable to allow specifying dirfds. So FSCONFIG_SET_PATH should also be supported. 
And as there is no infrastructure to block FSCONFIG_SET_PATH_EMPTY arguments (yes, you can do it manually, but the *only* user of fs_lookup_param() doesn't), then anything that accepts FSCONFIG_SET_PATH currently also accepts FSCONFIG_SET_PATH_EMPTY which is "morally equivalent" to FSCONFIG_SET_FD. So unless you block FSCONFIG_SET_PATH_EMPTY then FSCONFIG_SET_FD should probably also be supported (there is the re-opening distinction, of course, but that is not relevant if you use filename_lookup() -- which is what filesystems will do in practice). So my impression is that most users (if they had an fsconfig(2) man page to read...) would expect parameters that accept paths to either: * Work with FSCONFIG_SET_STRING and FSCONFIG_SET_PATH only; or * Work with FSCONFIG_SET_STRING, FSCONFIG_SET_PATH, FSCONFIG_SET_PATH_EMPTY, and FSCONFIG_SET_FD. Currently, none of our parameters work that way. * ext4's journal_path takes FSCONFIG_SET_STRING, FSCONFIG_SET_PATH, and FSCONFIG_SET_PATH_EMPTY. * overlayfs takes FSCONFIG_SET_FD and FSCONFIG_SET_STRING. I only fully realised how inconsistent this is while working on the fsconfig(2) man pages -- at the moment I have a very long paragraph explaining that there is this distinction in-kernel, but this really doesn't seem intentional to me. I would be very confused as a user that FSCONFIG_SET_PATH is useless for most filesystem *path* parameters, even though the filesystem accepts them as FSCONFIG_SET_STRING. As for practical uses, it would be nice to not have to open 500 files in order to create a 500-layer overlayfs. -- Aleksa Sarai Senior Software Engineer (Containers) SUSE Linux GmbH https://www.cyphar.com/
Hi, On 8/4/25 10:45 PM, Aleksa Sarai wrote: > Since the introduction of pid namespaces, their interaction with procfs > has been entirely implicit in ways that require a lot of dancing around > by programs that need to construct sandboxes with different PID > namespaces. > > Being able to explicitly specify the pid namespace to use when > constructing a procfs super block will allow programs to no longer need > to fork off a process which does then does unshare(2) / setns(2) and > forks again in order to construct a procfs in a pidns. > > So, provide a "pidns" mount option which allows such users to just > explicitly state which pid namespace they want that procfs instance to > use. This interface can be used with fsconfig(2) either with a file > descriptor or a path: > > fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd); > fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0); > > or with classic mount(2) / mount(8): > > // mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc > mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid"); > > As this new API is effectively shorthand for setns(2) followed by > mount(2), the permission model for this mirrors pidns_install() to avoid > opening up new attack surfaces by loosening the existing permission > model. > > In order to avoid having to RCU-protect all users of proc_pid_ns() (to > avoid UAFs), attempting to reconfigure an existing procfs instance's pid > namespace will error out with -EBUSY. Creating new procfs instances is > quite cheap, so this should not be an impediment to most users, and lets > us avoid a lot of churn in fs/proc/* for a feature that it seems > unlikely userspace would use. 
> > Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> > --- > Documentation/filesystems/proc.rst | 8 ++++ > fs/proc/root.c | 98 +++++++++++++++++++++++++++++++++++--- > 2 files changed, 100 insertions(+), 6 deletions(-) > > diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst > index 5236cb52e357..5a157dadea0b 100644 > --- a/Documentation/filesystems/proc.rst > +++ b/Documentation/filesystems/proc.rst > @@ -2360,6 +2360,7 @@ The following mount options are supported: > hidepid= Set /proc/<pid>/ access mode. > gid= Set the group authorized to learn processes information. > subset= Show only the specified subset of procfs. > + pidns= Specify a the namespace used by this procfs. drop ^^ a > ========= ======================================================== > > hidepid=off or hidepid=0 means classic mode - everybody may access all > @@ -2392,6 +2393,13 @@ information about processes information, just add identd to this group. > subset=pid hides all top level files and directories in the procfs that > are not related to tasks. > > +pidns= specifies a pid namespace (either as a string path to something like > +`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that > +will be used by the procfs instance when translating pids. By default, procfs > +will use the calling process's active pid namespace. Note that the pid > +namespace of an existing procfs instance cannot be modified (attempting to do > +so will give an `-EBUSY` error). > + > Chapter 5: Filesystem behavior > ============================== > -- ~Randy
© 2016 - 2025 Red Hat, Inc.