From: Christian Brauner
Date: Wed, 10 Sep 2025 16:37:02 +0200
Subject: [PATCH 17/32] mnt: support iterator
Message-Id: <20250910-work-namespace-v1-17-4dd56e7359d8@kernel.org>
References: <20250910-work-namespace-v1-0-4dd56e7359d8@kernel.org>
In-Reply-To: <20250910-work-namespace-v1-0-4dd56e7359d8@kernel.org>
To: Jan Kara, Amir Goldstein, linux-fsdevel@vger.kernel.org
Cc: Josef Bacik, Jeff Layton, Mike Yuan, Zbigniew Jędrzejewski-Szmek, Lennart Poettering, Daan De Meyer, Aleksa Sarai, Alexander Viro, Jens Axboe, Tejun Heo, Johannes Weiner, Michal Koutný, Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman, Chuck Lever, linux-nfs@vger.kernel.org, linux-kselftest@vger.kernel.org, linux-block@vger.kernel.org, linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, netdev@vger.kernel.org, Christian Brauner

Move the mount namespace to the generic iterator. This allows us to
drop a bunch of members from struct mnt_namespace.

Signed-off-by: Christian Brauner
---
 fs/mount.h     |  10 +---
 fs/namespace.c | 141 +++++++++++++--------------------------------------------
 fs/nsfs.c      |   4 +-
 3 files changed, 35 insertions(+), 120 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 97737051a8b9..76bf863c9ae2 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -17,11 +17,7 @@ struct mnt_namespace {
 	};
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
-	u64 seq; /* Sequence number to prevent loops */
-	union {
-		wait_queue_head_t poll;
-		struct rcu_head mnt_ns_rcu;
-	};
+	wait_queue_head_t poll;
 	u64 seq_origin; /* Sequence number of origin mount namespace */
 	u64 event;
 #ifdef CONFIG_FSNOTIFY
@@ -30,8 +26,6 @@ struct mnt_namespace {
 #endif
 	unsigned int nr_mounts; /* # of mounts in the namespace */
 	unsigned int pending_mounts;
-	struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
-	struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
 	refcount_t passive; /* number references not pinning @mounts */
 } __randomize_layout;

@@ -173,7 +167,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry)

 static inline bool is_anon_ns(struct mnt_namespace *ns)
 {
-	return ns->seq == 0;
+	return ns->ns.ns_id == 0;
 }

 static inline bool anon_ns_root(const struct mount *m)
diff --git a/fs/namespace.c b/fs/namespace.c
index 14c5cdbdd6e1..40a8d75f6b16 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "pnode.h"
 #include "internal.h"
@@ -80,13 +81,10 @@ static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted); /* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
-static DEFINE_SEQLOCK(mnt_ns_tree_lock);

 #ifdef CONFIG_FSNOTIFY
 LIST_HEAD(notify_list); /* protected by namespace_sem */
 #endif
-static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
-static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */

 enum mount_kattr_flags_t {
 	MOUNT_KATTR_RECURSE = (1 << 0),
@@ -119,53 +117,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
+	struct ns_common *ns;
+
 	if (!node)
 		return NULL;
-	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
-}
-
-static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
-{
-	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
-	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
-	u64 seq_a = ns_a->seq;
-	u64 seq_b = ns_b->seq;
-
-	if (seq_a < seq_b)
-		return -1;
-	if (seq_a > seq_b)
-		return 1;
-	return 0;
-}
-
-static inline void mnt_ns_tree_write_lock(void)
-{
-	write_seqlock(&mnt_ns_tree_lock);
-}
-
-static inline void mnt_ns_tree_write_unlock(void)
-{
-	write_sequnlock(&mnt_ns_tree_lock);
-}
-
-static void mnt_ns_tree_add(struct mnt_namespace *ns)
-{
-	struct rb_node *node, *prev;
-
-	mnt_ns_tree_write_lock();
-	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
-	/*
-	 * If there's no previous entry simply add it after the
-	 * head and if there is add it after the previous entry.
-	 */
-	prev = rb_prev(&ns->mnt_ns_tree_node);
-	if (!prev)
-		list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
-	else
-		list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
-	mnt_ns_tree_write_unlock();
-
-	WARN_ON_ONCE(node);
+	ns = rb_entry(node, struct ns_common, ns_tree_node);
+	return container_of(ns, struct mnt_namespace, ns);
 }

 static void mnt_ns_release(struct mnt_namespace *ns)
@@ -181,32 +138,16 @@ DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

 static void mnt_ns_release_rcu(struct rcu_head *rcu)
 {
-	mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+	mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu));
 }

 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
 	/* remove from global mount namespace list */
-	if (!is_anon_ns(ns)) {
-		mnt_ns_tree_write_lock();
-		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
-		list_bidir_del_rcu(&ns->mnt_ns_list);
-		mnt_ns_tree_write_unlock();
-	}
-
-	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
-}
-
-static int mnt_ns_find(const void *key, const struct rb_node *node)
-{
-	const u64 mnt_ns_id = *(u64 *)key;
-	const struct mnt_namespace *ns = node_to_mnt_ns(node);
+	if (!is_anon_ns(ns))
+		ns_tree_remove(ns);

-	if (mnt_ns_id < ns->seq)
-		return -1;
-	if (mnt_ns_id > ns->seq)
-		return 1;
-	return 0;
+	call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu);
 }

 /*
@@ -225,28 +166,21 @@ static int mnt_ns_find(const void *key, const struct rb_node *node)
  */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
-	struct mnt_namespace *ns;
-	struct rb_node *node;
-	unsigned int seq;
+	struct mnt_namespace *mnt_ns;
+	struct ns_common *ns;

 	guard(rcu)();
-	do {
-		seq = read_seqbegin(&mnt_ns_tree_lock);
-		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
-		if (node)
-			break;
-	} while (read_seqretry(&mnt_ns_tree_lock, seq));
-
-	if (!node)
+	ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS);
+	if (!ns)
 		return NULL;

 	/*
 	 * The last reference count is put with RCU delay so we can
 	 * unconditonally acquire a reference here.
 	 */
-	ns = node_to_mnt_ns(node);
-	refcount_inc(&ns->passive);
-	return ns;
+	mnt_ns = container_of(ns, struct mnt_namespace, ns);
+	refcount_inc(&mnt_ns->passive);
+	return mnt_ns;
 }

 static inline void lock_mount_hash(void)
@@ -1017,7 +951,7 @@ static inline bool check_anonymous_mnt(struct mount *mnt)
 		return false;

 	seq = mnt->mnt_ns->seq_origin;
-	return !seq || (seq == current->nsproxy->mnt_ns->seq);
+	return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id);
 }

 /*
@@ -2155,19 +2089,16 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)

 struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
 {
+	struct ns_common *ns;
+
 	guard(rcu)();

 	for (;;) {
-		struct list_head *list;
-
-		if (previous)
-			list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
-		else
-			list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
-		if (list_is_head(list, &mnt_ns_list))
-			return ERR_PTR(-ENOENT);
+		ns = ns_tree_adjoined_rcu(mntns, previous);
+		if (IS_ERR(ns))
+			return ERR_CAST(ns);

-		mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
+		mntns = to_mnt_ns(ns);

 		/*
 		 * The last passive reference count is put with RCU
@@ -2207,7 +2138,7 @@ static bool mnt_ns_loop(struct dentry *dentry)
 	if (!mnt_ns)
 		return false;

-	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
+	return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id;
 }

 struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
@@ -3070,7 +3001,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
 		if (is_anon_ns(src_mnt_ns))
 			ns->seq_origin = src_mnt_ns->seq_origin;
 		else
-			ns->seq_origin = src_mnt_ns->seq;
+			ns->seq_origin = src_mnt_ns->ns.ns_id;
 	}

 	mnt = __do_loopback(path, recursive);
@@ -4153,15 +4084,6 @@ static void free_mnt_ns(struct mnt_namespace *ns)
 	mnt_ns_tree_remove(ns);
 }

-/*
- * Assign a sequence number so we can detect when we attempt to bind
- * mount a reference to an older mount namespace into the current
- * mount namespace, preventing reference counting loops. A 64bit
- * number incrementing at 10Ghz will take 12,427 years to wrap which
- * is effectively never, so we can ignore the possibility.
- */
-static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
-
 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
 {
 	struct mnt_namespace *new_ns;
@@ -4185,11 +4107,11 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
 		return ERR_PTR(ret);
 	}
 	if (!anon)
-		new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
+		ns_tree_gen_id(&new_ns->ns);
+	RB_CLEAR_NODE(&new_ns->ns.ns_tree_node);
+	INIT_LIST_HEAD(&new_ns->ns.ns_list_node);
 	refcount_set(&new_ns->passive, 1);
 	new_ns->mounts = RB_ROOT;
-	INIT_LIST_HEAD(&new_ns->mnt_ns_list);
-	RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
 	init_waitqueue_head(&new_ns->poll);
 	new_ns->user_ns = get_user_ns(user_ns);
 	new_ns->ucounts = ucounts;
@@ -4275,7 +4197,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	if (pwdmnt)
 		mntput(pwdmnt);

-	mnt_ns_tree_add(new_ns);
+	ns_tree_add_raw(new_ns);
 	return new_ns;
 }

@@ -5385,7 +5307,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
 static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
 {
 	s->sm.mask |= STATMOUNT_MNT_NS_ID;
-	s->sm.mnt_ns_id = ns->seq;
+	s->sm.mnt_ns_id = ns->ns.ns_id;
 }

 static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
@@ -6090,7 +6012,6 @@ static void __init init_mount_tree(void)
 	ns = alloc_mnt_ns(&init_user_ns, true);
 	if (IS_ERR(ns))
 		panic("Can't allocate initial namespace");
-	ns->seq = atomic64_inc_return(&mnt_ns_seq);
 	ns->ns.inum = PROC_MNT_INIT_INO;
 	m = real_mount(mnt);
 	ns->root = m;
@@ -6105,7 +6026,7 @@ static void __init init_mount_tree(void)
 	set_fs_pwd(current->fs, &root);
 	set_fs_root(current->fs, &root);

-	mnt_ns_tree_add(ns);
+	ns_tree_add(ns);
 }

 void __init mnt_init(void)
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 34f0b35d3ead..6f8008177133 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -139,7 +139,7 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
 	 * the size value will be set to the size the kernel knows about.
 	 */
 	kinfo->size = min(usize, sizeof(*kinfo));
-	kinfo->mnt_ns_id = mnt_ns->seq;
+	kinfo->mnt_ns_id = mnt_ns->ns.ns_id;
 	kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts);
 	/* Subtract the root mount of the mount namespace. */
 	if (kinfo->nr_mounts)
@@ -221,7 +221,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,

 		mnt_ns = container_of(ns, struct mnt_namespace, ns);
 		idp = (__u64 __user *)arg;
-		id = mnt_ns->seq;
+		id = mnt_ns->ns.ns_id;
 		return put_user(id, idp);
 	}
 	case NS_GET_PID_FROM_PIDNS:
-- 
2.47.3
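
For context: the conversion leans on the generic ns tree helpers used in the hunks above (ns_tree_lookup_rcu(), ns_tree_adjoined_rcu(), ns_tree_add()/ns_tree_add_raw(), ns_tree_remove(), ns_tree_gen_id()). Below is a minimal sketch of the lookup pattern a namespace type follows after the conversion; it is not part of the patch, it simply mirrors the converted lookup_mnt_ns() hunk, and the function name is made up for illustration.

/*
 * Illustrative sketch only (not part of the patch): shows how a mount
 * namespace id is resolved via the shared ns tree instead of the removed
 * mnt_ns_tree rbtree. Mirrors the converted lookup_mnt_ns() above; the
 * function name is hypothetical.
 */
static struct mnt_namespace *lookup_mnt_ns_sketch(u64 mnt_ns_id)
{
	struct mnt_namespace *mnt_ns;
	struct ns_common *ns;

	guard(rcu)();

	/* Resolve the id in the generic namespace tree. */
	ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS);
	if (!ns)
		return NULL;

	/* The embedded ns_common is freed with an RCU delay, so taking a
	 * passive reference under rcu_read_lock() remains safe. */
	mnt_ns = container_of(ns, struct mnt_namespace, ns);
	refcount_inc(&mnt_ns->passive);
	return mnt_ns;
}

The passive reference can be taken unconditionally because the final free is deferred via call_rcu() on ns.ns_rcu, as set up in mnt_ns_tree_remove() above.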