[v1] ns: support file handles

[PATCH 27/32] nsfs: support file handles

Posted by Christian Brauner 5 months ago

A while ago we added support for file handles to pidfs so pidfds can be
encoded and decoded as file handles. Userspace has adopted this quickly
and it's proven very useful. Pidfd file handles are exhaustive meaning
they don't require a handle on another pidfd to pass to
open_by_handle_at() so it can derive the filesystem to decode in.

Implement the exhaustive file handles for namespaces as well.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/nsfs.c                | 176 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/exportfs.h |   6 ++
 2 files changed, 182 insertions(+)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 6f8008177133..a1585a2f4f03 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -13,6 +13,12 @@
 #include <linux/nsfs.h>
 #include <linux/uaccess.h>
 #include <linux/mnt_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
+#include <linux/utsname.h>
+#include <linux/exportfs.h>
+#include <linux/nstree.h>
+#include <net/net_namespace.h>
 
 #include "mount.h"
 #include "internal.h"
@@ -417,12 +423,182 @@ static const struct stashed_operations nsfs_stashed_ops = {
 	.put_data = nsfs_put_data,
 };
 
+struct nsfs_fid {
+	u64 ns_id;
+	u32 ns_type;
+	u32 ns_inum;
+} __attribute__ ((packed));
+
+#define NSFS_FID_SIZE (sizeof(struct nsfs_fid) / sizeof(u32))
+
+static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+			  struct inode *parent)
+{
+	struct nsfs_fid *fid = (struct nsfs_fid *)fh;
+	struct ns_common *ns = inode->i_private;
+	int len = *max_len;
+
+	/*
+	 * TODO:
+	 * For hierarchical namespaces we should start to encode the
+	 * parent namespace. Then userspace can walk a namespace
+	 * hierarchy purely based on file handles.
+	 */
+	if (parent)
+		return FILEID_INVALID;
+
+	if (len < NSFS_FID_SIZE) {
+		*max_len = NSFS_FID_SIZE;
+		return FILEID_INVALID;
+	}
+
+	len  = NSFS_FID_SIZE;
+
+	fid->ns_id = ns->ns_id;
+	fid->ns_type = ns->ops->type;
+	fid->ns_inum = inode->i_ino;
+	*max_len = len;
+	return FILEID_NSFS;
+}
+
+static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+					int fh_len, int fh_type)
+{
+	struct path path __free(path_put) = {};
+	struct nsfs_fid *fid = (struct nsfs_fid *)fh;
+	struct user_namespace *owning_ns = NULL;
+	struct ns_common *ns;
+	int ret;
+
+	if (fh_len < NSFS_FID_SIZE)
+		return NULL;
+
+	switch (fh_type) {
+	case FILEID_NSFS:
+		break;
+	default:
+		return NULL;
+	}
+
+	scoped_guard(rcu) {
+		ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
+		if (!ns)
+			return NULL;
+
+		VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
+		VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type);
+		VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum);
+
+		if (!refcount_inc_not_zero(&ns->count))
+			return NULL;
+	}
+
+	switch (ns->ops->type) {
+#ifdef CONFIG_CGROUPS
+	case CLONE_NEWCGROUP:
+		if (!current_in_namespace(to_cg_ns(ns)))
+			owning_ns = to_cg_ns(ns)->user_ns;
+		break;
+#endif
+#ifdef CONFIG_IPC_NS
+	case CLONE_NEWIPC:
+		if (!current_in_namespace(to_ipc_ns(ns)))
+			owning_ns = to_ipc_ns(ns)->user_ns;
+		break;
+#endif
+	case CLONE_NEWNS:
+		if (!current_in_namespace(to_mnt_ns(ns)))
+			owning_ns = to_mnt_ns(ns)->user_ns;
+		break;
+#ifdef CONFIG_NET_NS
+	case CLONE_NEWNET:
+		if (!current_in_namespace(to_net_ns(ns)))
+			owning_ns = to_net_ns(ns)->user_ns;
+		break;
+#endif
+#ifdef CONFIG_PID_NS
+	case CLONE_NEWPID:
+		if (!current_in_namespace(to_pid_ns(ns))) {
+			owning_ns = to_pid_ns(ns)->user_ns;
+		} else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) {
+			ns->ops->put(ns);
+			return ERR_PTR(-EPERM);
+		}
+		break;
+#endif
+#ifdef CONFIG_TIME_NS
+	case CLONE_NEWTIME:
+		if (!current_in_namespace(to_time_ns(ns)))
+			owning_ns = to_time_ns(ns)->user_ns;
+		break;
+#endif
+#ifdef CONFIG_USER_NS
+	case CLONE_NEWUSER:
+		if (!current_in_namespace(to_user_ns(ns)))
+			owning_ns = to_user_ns(ns);
+		break;
+#endif
+#ifdef CONFIG_UTS_NS
+	case CLONE_NEWUTS:
+		if (!current_in_namespace(to_uts_ns(ns)))
+			owning_ns = to_uts_ns(ns)->user_ns;
+		break;
+#endif
+	default:
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) {
+		ns->ops->put(ns);
+		return ERR_PTR(-EPERM);
+	}
+
+	/* path_from_stashed() unconditionally consumes the reference. */
+	ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return no_free_ptr(path.dentry);
+}
+
+/*
+ * Make sure that we reject any nonsensical flags that users pass via
+ * open_by_handle_at().
+ */
+#define VALID_FILE_HANDLE_OPEN_FLAGS \
+	(O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
+
+static int nsfs_export_permission(struct handle_to_path_ctx *ctx,
+				   unsigned int oflags)
+{
+	if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
+		return -EINVAL;
+
+	/* nsfs_fh_to_dentry() is performs further permission checks. */
+	return 0;
+}
+
+static struct file *nsfs_export_open(struct path *path, unsigned int oflags)
+{
+	/* Clear O_LARGEFILE as open_by_handle_at() forces it. */
+	oflags &= ~O_LARGEFILE;
+	return file_open_root(path, "", oflags, 0);
+}
+
+static const struct export_operations nsfs_export_operations = {
+	.encode_fh	= nsfs_encode_fh,
+	.fh_to_dentry	= nsfs_fh_to_dentry,
+	.open		= nsfs_export_open,
+	.permission	= nsfs_export_permission,
+};
+
 static int nsfs_init_fs_context(struct fs_context *fc)
 {
 	struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
 	if (!ctx)
 		return -ENOMEM;
 	ctx->ops = &nsfs_ops;
+	ctx->eops = &nsfs_export_operations;
 	ctx->dops = &ns_dentry_operations;
 	fc->s_fs_info = (void *)&nsfs_stashed_ops;
 	return 0;
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index cfb0dd1ea49c..3aac58a520c7 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -122,6 +122,12 @@ enum fid_type {
 	FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1,
 	FILEID_BCACHEFS_WITH_PARENT = 0xb2,
 
+	/*
+	 *
+	 * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number.
+	 */
+	FILEID_NSFS = 0xf1,
+
 	/*
 	 * 64 bit unique kernfs id
 	 */

-- 
2.47.3

Re: [PATCH 27/32] nsfs: support file handles

Posted by Amir Goldstein 5 months ago

On Wed, Sep 10, 2025 at 4:39 PM Christian Brauner <brauner@kernel.org> wrote:
>
> A while ago we added support for file handles to pidfs so pidfds can be
> encoded and decoded as file handles. Userspace has adopted this quickly
> and it's proven very useful.

> Pidfd file handles are exhaustive meaning
> they don't require a handle on another pidfd to pass to
> open_by_handle_at() so it can derive the filesystem to decode in.
>
> Implement the exhaustive file handles for namespaces as well.

I think you decide to split the "exhaustive" part to another patch,
so better drop this paragraph?

I am missing an explanation about the permissions for
opening these file handles.

My understanding of the code is that the opener needs to meet one of
the conditions:
1. user has CAP_SYS_ADMIN in the userns owning the opened namespace
2. current task is in the opened namespace

But I do not fully understand the rationale behind the 2nd condition,
that is, when is it useful?
And as far as I can tell, your selftest does not cover this condition
(only both true or both false)?

I suggest to start with allowing only the useful and important
cases, so if cond #1 is useful enough, drop cond #2 and we can add
it later if needed and then your selftests already cover cond #1 true and false.

>
> Signed-off-by: Christian Brauner <brauner@kernel.org>

After documenting the permissions, with ot without dropping cond #2
feel free to add:

Reviewed-by: Amir Goldstein <amir73il@gmail.com>

> ---
>  fs/nsfs.c                | 176 +++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/exportfs.h |   6 ++
>  2 files changed, 182 insertions(+)
>
> diff --git a/fs/nsfs.c b/fs/nsfs.c
> index 6f8008177133..a1585a2f4f03 100644
> --- a/fs/nsfs.c
> +++ b/fs/nsfs.c
> @@ -13,6 +13,12 @@
>  #include <linux/nsfs.h>
>  #include <linux/uaccess.h>
>  #include <linux/mnt_namespace.h>
> +#include <linux/ipc_namespace.h>
> +#include <linux/time_namespace.h>
> +#include <linux/utsname.h>
> +#include <linux/exportfs.h>
> +#include <linux/nstree.h>
> +#include <net/net_namespace.h>
>
>  #include "mount.h"
>  #include "internal.h"
> @@ -417,12 +423,182 @@ static const struct stashed_operations nsfs_stashed_ops = {
>         .put_data = nsfs_put_data,
>  };
>
> +struct nsfs_fid {
> +       u64 ns_id;
> +       u32 ns_type;
> +       u32 ns_inum;
> +} __attribute__ ((packed));
> +
> +#define NSFS_FID_SIZE (sizeof(struct nsfs_fid) / sizeof(u32))
> +
> +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
> +                         struct inode *parent)
> +{
> +       struct nsfs_fid *fid = (struct nsfs_fid *)fh;
> +       struct ns_common *ns = inode->i_private;
> +       int len = *max_len;
> +
> +       /*
> +        * TODO:
> +        * For hierarchical namespaces we should start to encode the
> +        * parent namespace. Then userspace can walk a namespace
> +        * hierarchy purely based on file handles.
> +        */
> +       if (parent)
> +               return FILEID_INVALID;
> +
> +       if (len < NSFS_FID_SIZE) {
> +               *max_len = NSFS_FID_SIZE;
> +               return FILEID_INVALID;
> +       }
> +
> +       len  = NSFS_FID_SIZE;
> +
> +       fid->ns_id = ns->ns_id;
> +       fid->ns_type = ns->ops->type;
> +       fid->ns_inum = inode->i_ino;
> +       *max_len = len;
> +       return FILEID_NSFS;
> +}
> +
> +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
> +                                       int fh_len, int fh_type)
> +{
> +       struct path path __free(path_put) = {};
> +       struct nsfs_fid *fid = (struct nsfs_fid *)fh;
> +       struct user_namespace *owning_ns = NULL;
> +       struct ns_common *ns;
> +       int ret;
> +
> +       if (fh_len < NSFS_FID_SIZE)
> +               return NULL;
> +
> +       switch (fh_type) {
> +       case FILEID_NSFS:
> +               break;
> +       default:
> +               return NULL;
> +       }
> +
> +       scoped_guard(rcu) {
> +               ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
> +               if (!ns)
> +                       return NULL;
> +
> +               VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
> +               VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type);
> +               VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum);
> +
> +               if (!refcount_inc_not_zero(&ns->count))
> +                       return NULL;
> +       }
> +
> +       switch (ns->ops->type) {
> +#ifdef CONFIG_CGROUPS
> +       case CLONE_NEWCGROUP:
> +               if (!current_in_namespace(to_cg_ns(ns)))
> +                       owning_ns = to_cg_ns(ns)->user_ns;
> +               break;
> +#endif
> +#ifdef CONFIG_IPC_NS
> +       case CLONE_NEWIPC:
> +               if (!current_in_namespace(to_ipc_ns(ns)))
> +                       owning_ns = to_ipc_ns(ns)->user_ns;
> +               break;
> +#endif
> +       case CLONE_NEWNS:
> +               if (!current_in_namespace(to_mnt_ns(ns)))
> +                       owning_ns = to_mnt_ns(ns)->user_ns;
> +               break;
> +#ifdef CONFIG_NET_NS
> +       case CLONE_NEWNET:
> +               if (!current_in_namespace(to_net_ns(ns)))
> +                       owning_ns = to_net_ns(ns)->user_ns;
> +               break;
> +#endif
> +#ifdef CONFIG_PID_NS
> +       case CLONE_NEWPID:
> +               if (!current_in_namespace(to_pid_ns(ns))) {
> +                       owning_ns = to_pid_ns(ns)->user_ns;
> +               } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) {
> +                       ns->ops->put(ns);
> +                       return ERR_PTR(-EPERM);
> +               }
> +               break;
> +#endif
> +#ifdef CONFIG_TIME_NS
> +       case CLONE_NEWTIME:
> +               if (!current_in_namespace(to_time_ns(ns)))
> +                       owning_ns = to_time_ns(ns)->user_ns;
> +               break;
> +#endif
> +#ifdef CONFIG_USER_NS
> +       case CLONE_NEWUSER:
> +               if (!current_in_namespace(to_user_ns(ns)))
> +                       owning_ns = to_user_ns(ns);
> +               break;
> +#endif
> +#ifdef CONFIG_UTS_NS
> +       case CLONE_NEWUTS:
> +               if (!current_in_namespace(to_uts_ns(ns)))
> +                       owning_ns = to_uts_ns(ns)->user_ns;
> +               break;
> +#endif
> +       default:
> +               return ERR_PTR(-EOPNOTSUPP);
> +       }
> +
> +       if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) {
> +               ns->ops->put(ns);
> +               return ERR_PTR(-EPERM);
> +       }
> +
> +       /* path_from_stashed() unconditionally consumes the reference. */
> +       ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
> +       if (ret)
> +               return ERR_PTR(ret);
> +
> +       return no_free_ptr(path.dentry);
> +}
> +
> +/*
> + * Make sure that we reject any nonsensical flags that users pass via
> + * open_by_handle_at().
> + */
> +#define VALID_FILE_HANDLE_OPEN_FLAGS \
> +       (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
> +
> +static int nsfs_export_permission(struct handle_to_path_ctx *ctx,
> +                                  unsigned int oflags)
> +{
> +       if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
> +               return -EINVAL;
> +
> +       /* nsfs_fh_to_dentry() is performs further permission checks. */
> +       return 0;
> +}
> +
> +static struct file *nsfs_export_open(struct path *path, unsigned int oflags)
> +{
> +       /* Clear O_LARGEFILE as open_by_handle_at() forces it. */
> +       oflags &= ~O_LARGEFILE;
> +       return file_open_root(path, "", oflags, 0);
> +}
> +
> +static const struct export_operations nsfs_export_operations = {
> +       .encode_fh      = nsfs_encode_fh,
> +       .fh_to_dentry   = nsfs_fh_to_dentry,
> +       .open           = nsfs_export_open,
> +       .permission     = nsfs_export_permission,
> +};
> +
>  static int nsfs_init_fs_context(struct fs_context *fc)
>  {
>         struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
>         if (!ctx)
>                 return -ENOMEM;
>         ctx->ops = &nsfs_ops;
> +       ctx->eops = &nsfs_export_operations;
>         ctx->dops = &ns_dentry_operations;
>         fc->s_fs_info = (void *)&nsfs_stashed_ops;
>         return 0;
> diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
> index cfb0dd1ea49c..3aac58a520c7 100644
> --- a/include/linux/exportfs.h
> +++ b/include/linux/exportfs.h
> @@ -122,6 +122,12 @@ enum fid_type {
>         FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1,
>         FILEID_BCACHEFS_WITH_PARENT = 0xb2,
>
> +       /*
> +        *
> +        * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number.
> +        */
> +       FILEID_NSFS = 0xf1,
> +
>         /*
>          * 64 bit unique kernfs id
>          */
>
> --
> 2.47.3
>

Re: [PATCH 27/32] nsfs: support file handles

Posted by Christian Brauner 4 months, 4 weeks ago

On Wed, Sep 10, 2025 at 07:21:22PM +0200, Amir Goldstein wrote:
> On Wed, Sep 10, 2025 at 4:39 PM Christian Brauner <brauner@kernel.org> wrote:
> >
> > A while ago we added support for file handles to pidfs so pidfds can be
> > encoded and decoded as file handles. Userspace has adopted this quickly
> > and it's proven very useful.
> 
> > Pidfd file handles are exhaustive meaning
> > they don't require a handle on another pidfd to pass to
> > open_by_handle_at() so it can derive the filesystem to decode in.
> >
> > Implement the exhaustive file handles for namespaces as well.
> 
> I think you decide to split the "exhaustive" part to another patch,
> so better drop this paragraph?

Yes, good point. I've dont that.

> I am missing an explanation about the permissions for
> opening these file handles.
> 
> My understanding of the code is that the opener needs to meet one of
> the conditions:
> 1. user has CAP_SYS_ADMIN in the userns owning the opened namespace
> 2. current task is in the opened namespace

Yes.

> 
> But I do not fully understand the rationale behind the 2nd condition,
> that is, when is it useful?

A caller is always able to open a file descriptor to it's own set of
namespaces. File handles will behave the same way.

> And as far as I can tell, your selftest does not cover this condition
> (only both true or both false)?

I've added this now.

> 
> I suggest to start with allowing only the useful and important
> cases, so if cond #1 is useful enough, drop cond #2 and we can add
> it later if needed and then your selftests already cover cond #1 true and false.
> 
> >
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> 
> After documenting the permissions, with ot without dropping cond #2
> feel free to add:
> 
> Reviewed-by: Amir Goldstein <amir73il@gmail.com>

Thanks!

> 
> > ---
> >  fs/nsfs.c                | 176 +++++++++++++++++++++++++++++++++++++++++++++++
> >  include/linux/exportfs.h |   6 ++
> >  2 files changed, 182 insertions(+)
> >
> > diff --git a/fs/nsfs.c b/fs/nsfs.c
> > index 6f8008177133..a1585a2f4f03 100644
> > --- a/fs/nsfs.c
> > +++ b/fs/nsfs.c
> > @@ -13,6 +13,12 @@
> >  #include <linux/nsfs.h>
> >  #include <linux/uaccess.h>
> >  #include <linux/mnt_namespace.h>
> > +#include <linux/ipc_namespace.h>
> > +#include <linux/time_namespace.h>
> > +#include <linux/utsname.h>
> > +#include <linux/exportfs.h>
> > +#include <linux/nstree.h>
> > +#include <net/net_namespace.h>
> >
> >  #include "mount.h"
> >  #include "internal.h"
> > @@ -417,12 +423,182 @@ static const struct stashed_operations nsfs_stashed_ops = {
> >         .put_data = nsfs_put_data,
> >  };
> >
> > +struct nsfs_fid {
> > +       u64 ns_id;
> > +       u32 ns_type;
> > +       u32 ns_inum;
> > +} __attribute__ ((packed));
> > +
> > +#define NSFS_FID_SIZE (sizeof(struct nsfs_fid) / sizeof(u32))
> > +
> > +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
> > +                         struct inode *parent)
> > +{
> > +       struct nsfs_fid *fid = (struct nsfs_fid *)fh;
> > +       struct ns_common *ns = inode->i_private;
> > +       int len = *max_len;
> > +
> > +       /*
> > +        * TODO:
> > +        * For hierarchical namespaces we should start to encode the
> > +        * parent namespace. Then userspace can walk a namespace
> > +        * hierarchy purely based on file handles.
> > +        */
> > +       if (parent)
> > +               return FILEID_INVALID;
> > +
> > +       if (len < NSFS_FID_SIZE) {
> > +               *max_len = NSFS_FID_SIZE;
> > +               return FILEID_INVALID;
> > +       }
> > +
> > +       len  = NSFS_FID_SIZE;
> > +
> > +       fid->ns_id = ns->ns_id;
> > +       fid->ns_type = ns->ops->type;
> > +       fid->ns_inum = inode->i_ino;
> > +       *max_len = len;
> > +       return FILEID_NSFS;
> > +}
> > +
> > +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
> > +                                       int fh_len, int fh_type)
> > +{
> > +       struct path path __free(path_put) = {};
> > +       struct nsfs_fid *fid = (struct nsfs_fid *)fh;
> > +       struct user_namespace *owning_ns = NULL;
> > +       struct ns_common *ns;
> > +       int ret;
> > +
> > +       if (fh_len < NSFS_FID_SIZE)
> > +               return NULL;
> > +
> > +       switch (fh_type) {
> > +       case FILEID_NSFS:
> > +               break;
> > +       default:
> > +               return NULL;
> > +       }
> > +
> > +       scoped_guard(rcu) {
> > +               ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
> > +               if (!ns)
> > +                       return NULL;
> > +
> > +               VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
> > +               VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type);
> > +               VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum);
> > +
> > +               if (!refcount_inc_not_zero(&ns->count))
> > +                       return NULL;
> > +       }
> > +
> > +       switch (ns->ops->type) {
> > +#ifdef CONFIG_CGROUPS
> > +       case CLONE_NEWCGROUP:
> > +               if (!current_in_namespace(to_cg_ns(ns)))
> > +                       owning_ns = to_cg_ns(ns)->user_ns;
> > +               break;
> > +#endif
> > +#ifdef CONFIG_IPC_NS
> > +       case CLONE_NEWIPC:
> > +               if (!current_in_namespace(to_ipc_ns(ns)))
> > +                       owning_ns = to_ipc_ns(ns)->user_ns;
> > +               break;
> > +#endif
> > +       case CLONE_NEWNS:
> > +               if (!current_in_namespace(to_mnt_ns(ns)))
> > +                       owning_ns = to_mnt_ns(ns)->user_ns;
> > +               break;
> > +#ifdef CONFIG_NET_NS
> > +       case CLONE_NEWNET:
> > +               if (!current_in_namespace(to_net_ns(ns)))
> > +                       owning_ns = to_net_ns(ns)->user_ns;
> > +               break;
> > +#endif
> > +#ifdef CONFIG_PID_NS
> > +       case CLONE_NEWPID:
> > +               if (!current_in_namespace(to_pid_ns(ns))) {
> > +                       owning_ns = to_pid_ns(ns)->user_ns;
> > +               } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) {
> > +                       ns->ops->put(ns);
> > +                       return ERR_PTR(-EPERM);
> > +               }
> > +               break;
> > +#endif
> > +#ifdef CONFIG_TIME_NS
> > +       case CLONE_NEWTIME:
> > +               if (!current_in_namespace(to_time_ns(ns)))
> > +                       owning_ns = to_time_ns(ns)->user_ns;
> > +               break;
> > +#endif
> > +#ifdef CONFIG_USER_NS
> > +       case CLONE_NEWUSER:
> > +               if (!current_in_namespace(to_user_ns(ns)))
> > +                       owning_ns = to_user_ns(ns);
> > +               break;
> > +#endif
> > +#ifdef CONFIG_UTS_NS
> > +       case CLONE_NEWUTS:
> > +               if (!current_in_namespace(to_uts_ns(ns)))
> > +                       owning_ns = to_uts_ns(ns)->user_ns;
> > +               break;
> > +#endif
> > +       default:
> > +               return ERR_PTR(-EOPNOTSUPP);
> > +       }
> > +
> > +       if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) {
> > +               ns->ops->put(ns);
> > +               return ERR_PTR(-EPERM);
> > +       }
> > +
> > +       /* path_from_stashed() unconditionally consumes the reference. */
> > +       ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
> > +       if (ret)
> > +               return ERR_PTR(ret);
> > +
> > +       return no_free_ptr(path.dentry);
> > +}
> > +
> > +/*
> > + * Make sure that we reject any nonsensical flags that users pass via
> > + * open_by_handle_at().
> > + */
> > +#define VALID_FILE_HANDLE_OPEN_FLAGS \
> > +       (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
> > +
> > +static int nsfs_export_permission(struct handle_to_path_ctx *ctx,
> > +                                  unsigned int oflags)
> > +{
> > +       if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
> > +               return -EINVAL;
> > +
> > +       /* nsfs_fh_to_dentry() is performs further permission checks. */
> > +       return 0;
> > +}
> > +
> > +static struct file *nsfs_export_open(struct path *path, unsigned int oflags)
> > +{
> > +       /* Clear O_LARGEFILE as open_by_handle_at() forces it. */
> > +       oflags &= ~O_LARGEFILE;
> > +       return file_open_root(path, "", oflags, 0);
> > +}
> > +
> > +static const struct export_operations nsfs_export_operations = {
> > +       .encode_fh      = nsfs_encode_fh,
> > +       .fh_to_dentry   = nsfs_fh_to_dentry,
> > +       .open           = nsfs_export_open,
> > +       .permission     = nsfs_export_permission,
> > +};
> > +
> >  static int nsfs_init_fs_context(struct fs_context *fc)
> >  {
> >         struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
> >         if (!ctx)
> >                 return -ENOMEM;
> >         ctx->ops = &nsfs_ops;
> > +       ctx->eops = &nsfs_export_operations;
> >         ctx->dops = &ns_dentry_operations;
> >         fc->s_fs_info = (void *)&nsfs_stashed_ops;
> >         return 0;
> > diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
> > index cfb0dd1ea49c..3aac58a520c7 100644
> > --- a/include/linux/exportfs.h
> > +++ b/include/linux/exportfs.h
> > @@ -122,6 +122,12 @@ enum fid_type {
> >         FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1,
> >         FILEID_BCACHEFS_WITH_PARENT = 0xb2,
> >
> > +       /*
> > +        *
> > +        * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number.
> > +        */
> > +       FILEID_NSFS = 0xf1,
> > +
> >         /*
> >          * 64 bit unique kernfs id
> >          */
> >
> > --
> > 2.47.3
> >

Re: [PATCH 27/32] nsfs: support file handles

Posted by Amir Goldstein 4 months, 4 weeks ago

On Thu, Sep 11, 2025 at 11:31 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Wed, Sep 10, 2025 at 07:21:22PM +0200, Amir Goldstein wrote:
> > On Wed, Sep 10, 2025 at 4:39 PM Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > A while ago we added support for file handles to pidfs so pidfds can be
> > > encoded and decoded as file handles. Userspace has adopted this quickly
> > > and it's proven very useful.
> >
> > > Pidfd file handles are exhaustive meaning
> > > they don't require a handle on another pidfd to pass to
> > > open_by_handle_at() so it can derive the filesystem to decode in.
> > >
> > > Implement the exhaustive file handles for namespaces as well.
> >
> > I think you decide to split the "exhaustive" part to another patch,
> > so better drop this paragraph?
>
> Yes, good point. I've dont that.
>
> > I am missing an explanation about the permissions for
> > opening these file handles.
> >
> > My understanding of the code is that the opener needs to meet one of
> > the conditions:
> > 1. user has CAP_SYS_ADMIN in the userns owning the opened namespace
> > 2. current task is in the opened namespace
>
> Yes.
>
> >
> > But I do not fully understand the rationale behind the 2nd condition,
> > that is, when is it useful?
>
> A caller is always able to open a file descriptor to it's own set of
> namespaces. File handles will behave the same way.
>

I understand why it's safe, and I do not object to it at all,
I just feel that I do not fully understand the use case of how ns file handles
are expected to be used.
A process can always open /proc/self/ns/mnt
What's the use case where a process may need to open its own ns by handle?

I will explain. For CAP_SYS_ADMIN I can see why keeping handles that
do not keep an elevated refcount of ns object could be useful in the same
way that an NFS client keeps file handles without keeping the file object alive.

But if you do not have CAP_SYS_ADMIN and can only open your own ns
by handle, what is the application that could make use of this?
and what's the benefit of such application keeping a file handle instead of
ns fd?

Sorry. I feel that I may be missing something in the big picture.

Thanks,
Amir.

Re: [PATCH 27/32] nsfs: support file handles

Posted by Christian Brauner 4 months, 4 weeks ago

On Thu, Sep 11, 2025 at 01:36:28PM +0200, Amir Goldstein wrote:
> On Thu, Sep 11, 2025 at 11:31 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Wed, Sep 10, 2025 at 07:21:22PM +0200, Amir Goldstein wrote:
> > > On Wed, Sep 10, 2025 at 4:39 PM Christian Brauner <brauner@kernel.org> wrote:
> > > >
> > > > A while ago we added support for file handles to pidfs so pidfds can be
> > > > encoded and decoded as file handles. Userspace has adopted this quickly
> > > > and it's proven very useful.
> > >
> > > > Pidfd file handles are exhaustive meaning
> > > > they don't require a handle on another pidfd to pass to
> > > > open_by_handle_at() so it can derive the filesystem to decode in.
> > > >
> > > > Implement the exhaustive file handles for namespaces as well.
> > >
> > > I think you decide to split the "exhaustive" part to another patch,
> > > so better drop this paragraph?
> >
> > Yes, good point. I've dont that.
> >
> > > I am missing an explanation about the permissions for
> > > opening these file handles.
> > >
> > > My understanding of the code is that the opener needs to meet one of
> > > the conditions:
> > > 1. user has CAP_SYS_ADMIN in the userns owning the opened namespace
> > > 2. current task is in the opened namespace
> >
> > Yes.
> >
> > >
> > > But I do not fully understand the rationale behind the 2nd condition,
> > > that is, when is it useful?
> >
> > A caller is always able to open a file descriptor to it's own set of
> > namespaces. File handles will behave the same way.
> >
> 
> I understand why it's safe, and I do not object to it at all,
> I just feel that I do not fully understand the use case of how ns file handles
> are expected to be used.
> A process can always open /proc/self/ns/mnt
> What's the use case where a process may need to open its own ns by handle?
> 
> I will explain. For CAP_SYS_ADMIN I can see why keeping handles that
> do not keep an elevated refcount of ns object could be useful in the same
> way that an NFS client keeps file handles without keeping the file object alive.
> 
> But if you do not have CAP_SYS_ADMIN and can only open your own ns
> by handle, what is the application that could make use of this?
> and what's the benefit of such application keeping a file handle instead of
> ns fd?

A process is not always able to open /proc/self/ns/. That requires
procfs to be mounted and for /proc/self/ or /proc/self/ns/ to not be
overmounted. However, they can derive a namespace fd from their own
pidfd. And that also always works if it's their own namespace.

There's no need to introduce unnecessary behavioral differences between
/proc/self/ns/, pidfd-derived namespace fs, and file-handle-derived
namespace fds. That's just going to be confusing.

The other thing is that there are legitimate use-case for encoding your
own namespace. For example, you might store file handles to your set of
namespaces in a file on-disk so you can verify when you get rexeced that
they're still valid and so on. This is akin to the pidfd use-case.

Or just plainly for namespace comparison reasons where you keep a file
handle to your own namespaces and can then easily check against others.

Re: [PATCH 27/32] nsfs: support file handles

Posted by Aleksa Sarai 4 months, 3 weeks ago

On 2025-09-12, Christian Brauner <brauner@kernel.org> wrote:
> On Thu, Sep 11, 2025 at 01:36:28PM +0200, Amir Goldstein wrote:
> > On Thu, Sep 11, 2025 at 11:31 AM Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > On Wed, Sep 10, 2025 at 07:21:22PM +0200, Amir Goldstein wrote:
> > > > On Wed, Sep 10, 2025 at 4:39 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > >
> > > > > A while ago we added support for file handles to pidfs so pidfds can be
> > > > > encoded and decoded as file handles. Userspace has adopted this quickly
> > > > > and it's proven very useful.
> > > >
> > > > > Pidfd file handles are exhaustive meaning
> > > > > they don't require a handle on another pidfd to pass to
> > > > > open_by_handle_at() so it can derive the filesystem to decode in.
> > > > >
> > > > > Implement the exhaustive file handles for namespaces as well.
> > > >
> > > > I think you decide to split the "exhaustive" part to another patch,
> > > > so better drop this paragraph?
> > >
> > > Yes, good point. I've dont that.
> > >
> > > > I am missing an explanation about the permissions for
> > > > opening these file handles.
> > > >
> > > > My understanding of the code is that the opener needs to meet one of
> > > > the conditions:
> > > > 1. user has CAP_SYS_ADMIN in the userns owning the opened namespace
> > > > 2. current task is in the opened namespace
> > >
> > > Yes.
> > >
> > > >
> > > > But I do not fully understand the rationale behind the 2nd condition,
> > > > that is, when is it useful?
> > >
> > > A caller is always able to open a file descriptor to it's own set of
> > > namespaces. File handles will behave the same way.
> > >
> > 
> > I understand why it's safe, and I do not object to it at all,
> > I just feel that I do not fully understand the use case of how ns file handles
> > are expected to be used.
> > A process can always open /proc/self/ns/mnt
> > What's the use case where a process may need to open its own ns by handle?
> > 
> > I will explain. For CAP_SYS_ADMIN I can see why keeping handles that
> > do not keep an elevated refcount of ns object could be useful in the same
> > way that an NFS client keeps file handles without keeping the file object alive.
> > 
> > But if you do not have CAP_SYS_ADMIN and can only open your own ns
> > by handle, what is the application that could make use of this?
> > and what's the benefit of such application keeping a file handle instead of
> > ns fd?
> 
> A process is not always able to open /proc/self/ns/. That requires
> procfs to be mounted and for /proc/self/ or /proc/self/ns/ to not be
> overmounted. However, they can derive a namespace fd from their own
> pidfd. And that also always works if it's their own namespace.

It's also important to note that if /proc/self and /proc/thread-self are
overmounted, you can get into scenarios where /proc/$pid will refer to
the wrong process (container runtimes run into this scenario a lot --
when configuring a container there is a point where we are in a new
pidns but still see the host /proc, which leads to lots of fun bugs).

> There's no need to introduce unnecessary behavioral differences between
> /proc/self/ns/, pidfd-derived namespace fs, and file-handle-derived
> namespace fds. That's just going to be confusing.
> 
> The other thing is that there are legitimate use-case for encoding your
> own namespace. For example, you might store file handles to your set of
> namespaces in a file on-disk so you can verify when you get rexeced that
> they're still valid and so on. This is akin to the pidfd use-case.
> 
> Or just plainly for namespace comparison reasons where you keep a file
> handle to your own namespaces and can then easily check against others.

I agree wholeheartedly.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/

Re: [PATCH 27/32] nsfs: support file handles

Posted by Amir Goldstein 4 months, 4 weeks ago

On Fri, Sep 12, 2025 at 10:20 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Thu, Sep 11, 2025 at 01:36:28PM +0200, Amir Goldstein wrote:
> > On Thu, Sep 11, 2025 at 11:31 AM Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > On Wed, Sep 10, 2025 at 07:21:22PM +0200, Amir Goldstein wrote:
> > > > On Wed, Sep 10, 2025 at 4:39 PM Christian Brauner <brauner@kernel.org> wrote:
> > > > >
> > > > > A while ago we added support for file handles to pidfs so pidfds can be
> > > > > encoded and decoded as file handles. Userspace has adopted this quickly
> > > > > and it's proven very useful.
> > > >
> > > > > Pidfd file handles are exhaustive meaning
> > > > > they don't require a handle on another pidfd to pass to
> > > > > open_by_handle_at() so it can derive the filesystem to decode in.
> > > > >
> > > > > Implement the exhaustive file handles for namespaces as well.
> > > >
> > > > I think you decide to split the "exhaustive" part to another patch,
> > > > so better drop this paragraph?
> > >
> > > Yes, good point. I've dont that.
> > >
> > > > I am missing an explanation about the permissions for
> > > > opening these file handles.
> > > >
> > > > My understanding of the code is that the opener needs to meet one of
> > > > the conditions:
> > > > 1. user has CAP_SYS_ADMIN in the userns owning the opened namespace
> > > > 2. current task is in the opened namespace
> > >
> > > Yes.
> > >
> > > >
> > > > But I do not fully understand the rationale behind the 2nd condition,
> > > > that is, when is it useful?
> > >
> > > A caller is always able to open a file descriptor to it's own set of
> > > namespaces. File handles will behave the same way.
> > >
> >
> > I understand why it's safe, and I do not object to it at all,
> > I just feel that I do not fully understand the use case of how ns file handles
> > are expected to be used.
> > A process can always open /proc/self/ns/mnt
> > What's the use case where a process may need to open its own ns by handle?
> >
> > I will explain. For CAP_SYS_ADMIN I can see why keeping handles that
> > do not keep an elevated refcount of ns object could be useful in the same
> > way that an NFS client keeps file handles without keeping the file object alive.
> >
> > But if you do not have CAP_SYS_ADMIN and can only open your own ns
> > by handle, what is the application that could make use of this?
> > and what's the benefit of such application keeping a file handle instead of
> > ns fd?
>
> A process is not always able to open /proc/self/ns/. That requires
> procfs to be mounted and for /proc/self/ or /proc/self/ns/ to not be
> overmounted. However, they can derive a namespace fd from their own
> pidfd. And that also always works if it's their own namespace.
>
> There's no need to introduce unnecessary behavioral differences between
> /proc/self/ns/, pidfd-derived namespace fs, and file-handle-derived
> namespace fds. That's just going to be confusing.
>
> The other thing is that there are legitimate use-case for encoding your
> own namespace. For example, you might store file handles to your set of
> namespaces in a file on-disk so you can verify when you get rexeced that
> they're still valid and so on. This is akin to the pidfd use-case.
>
> Or just plainly for namespace comparison reasons where you keep a file
> handle to your own namespaces and can then easily check against others.

OK. As I said no objections I was just curious about this use case.

FWIW, comparing current ns to a stored file handle does not really require
permission to open_by_handle_at(). name_to_handle_at() the current ns
and binary compare to the stored file handle should be a viable option.

This was exactly the reason for introducing AT_HANDLE_FID, so that fanotify
unprivileged watcher with no permission to open_by_handle_at() could compare
an fid reported in an event with another fid they obtained earlier with
name_to_handle_at() and kept in a map.

Thanks for the explanation!
Amir.