From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id CE5862F0661;
	Tue, 12 Aug 2025 23:53:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042784; cv=none;
 b=SuBPxSoFFvFes6+CvtewIjBJhuDdmmZJJrEYzmyZBRTMkz5+0H+4vNkcPuOGBx6Pe4GwED4+iO7KGnzQelimhJYNHCATWQSqYLd+lMYhVyEGNHoXHQeVCwPsm6FDjZiAacJk+5URAY48h6dmeY/n+M9gqG5K3kJTFWNetTE58m8=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042784; c=relaxed/simple;
	bh=oJDs9cs0PMUxO+Y7vEWjvv4x8YaRwIGuciOUq8hARic=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=sZHpjCJy8s7veVfYkgTQLAj9QMFqP6MA0nfKjAgBsyaKGA9pXGaMM9z1R6n+aLEVvzH9r7Q4zx88Oft/t07RowXyb2wzRDV3Lwd8YIAKV8FzMPHrPFAUOcRZPwyQPbJY3WnseruAE8jt1C2dKgf+oX/bTEJp9NQsMgAw8ohOPWc=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynI-005Y1u-W6;
	Tue, 12 Aug 2025 23:52:42 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 01/11] VFS: discard err2 in filename_create()
Date: Tue, 12 Aug 2025 12:25:04 +1000
Message-ID: <20250812235228.3072318-2-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

During the time when "err2" holds a value, "error" does not hold any
meaningful value.  So there is no need for two different err variables.
This patch discards "err2" and uses "error' throughout.

Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/namei.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index cd43ff89fbaa..62c1e2268942 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4114,7 +4114,6 @@ static struct dentry *filename_create(int dfd, struct=
 filename *name,
 	unsigned int reval_flag =3D lookup_flags & LOOKUP_REVAL;
 	unsigned int create_flags =3D LOOKUP_CREATE | LOOKUP_EXCL;
 	int type;
-	int err2;
 	int error;
=20
 	error =3D filename_parentat(dfd, name, reval_flag, path, &last, &type);
@@ -4129,7 +4128,7 @@ static struct dentry *filename_create(int dfd, struct=
 filename *name,
 		goto out;
=20
 	/* don't fail immediately if it's r/o, at least try to report other error=
s */
-	err2 =3D mnt_want_write(path->mnt);
+	error =3D mnt_want_write(path->mnt);
 	/*
 	 * Do the final lookup.  Suppress 'create' if there is a trailing
 	 * '/', and a directory wasn't requested.
@@ -4142,17 +4141,16 @@ static struct dentry *filename_create(int dfd, stru=
ct filename *name,
 	if (IS_ERR(dentry))
 		goto unlock;
=20
-	if (unlikely(err2)) {
-		error =3D err2;
+	if (unlikely(error))
 		goto fail;
-	}
+
 	return dentry;
 fail:
 	dput(dentry);
 	dentry =3D ERR_PTR(error);
 unlock:
 	inode_unlock(path->dentry->d_inode);
-	if (!err2)
+	if (!error)
 		mnt_drop_write(path->mnt);
 out:
 	path_put(path);
--=20
2.50.0.107.gf914562f5916.dirty
From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id CE5F42D46C6;
	Tue, 12 Aug 2025 23:53:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042784; cv=none;
 b=G7UUeNf3nY1cPVy/GwPbrDJfRVR9hkj8CBS9DEVRUbN/VxExGC38pD6g0EJECSmbd8Wnc2VtZsQEK1SmlX16ZpfPeFPojeSDVE0WftupEPOsroi8WHcB6u6ADozuTFtiWHkVKFvqYse1ZdIy2P3YK2XKM5XwiCMXuyLJzthAXv8=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042784; c=relaxed/simple;
	bh=I90cfJ5hxWQAKE5THF1GA3BmLbMCQgMmZktcIxEsAWo=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=qE78BvY6hRnXd/b+pBTZlTbH/NgiKZUaMMoNXONZHPNY2r1obIHEBue9M1b5cGeqj56mEvNmvssabdEHihL/p+jZpUXcDW3+Awg4NHyBpGLrygW0DP3nnKNFi54C/MJGlqci8ROno8hf8zbaWutGMmtUmum1mP/wHpFwMCB/ocM=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynJ-005Y1w-PX;
	Tue, 12 Aug 2025 23:52:43 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 02/11] VFS: introduce dentry_lookup() and friends
Date: Tue, 12 Aug 2025 12:25:05 +1000
Message-ID: <20250812235228.3072318-3-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

This patch is the first step in introducing a new API for locked
operation on names in directories.  It supports operations that create or
remove names.  Rename operations will also be part of this new API but
require different specific interfaces.

The plan is to lock just the dentry (or dentries), not the whole
directory.  dentry_lookup() combines locking the directory and
performing a lookup prior to a change to the directory.  On success it
returns a dentry which is consider to be locked, though at this stage
the whole parent directory is actually locked.

dentry_lookup_noperm() does the same without needing a mnt_idmap and
without checking permissions.  This is useful for internal filesystem
management (e.g.  creating virtual files in response to events) and in
other cases similar to lookup_noperm().

__dentry_lookup() is a VFS-internal interface which does no permissions
checking and assumes that the hash of the name has already been stored
in the qstr.  This is useful following filename_parentat().

done_dentry_lookup() is provided which performs the inverse of putting
the dentry and unlocking.

Like lookup_one_qstr_excl(), dentry_lookup() returns -ENOENT if
LOOKUP_CREATE was NOT given and the name cannot be found,, and returns
-EEXIST if LOOKUP_EXCL WAS given and the name CAN be found.

These functions replace all uses of lookup_one_qstr_excl() in namei.c
except for those used for rename.

They also allow simple_start_creating() to be simplified into a
static-inline.

A __free() class is provided to allow done_dentry_lookup() to be called
transparently on scope exit.  dget() is extended to ignore ERR_PTR()s
so that "return dget(dentry);" is always safe when dentry was provided
by dentry_lookup() and the variable was declared __free(dentry_lookup).

lookup_noperm_common() and lookup_one_common() are moved earlier in
namei.c.

Note that done_dentry_lookup() can NOT currently be used for vfs_mkdir() as
that function consumes the dentry on error, providing a PTR_ERR instead.
This can be passed to done_dentry_lookup(), but the directory won't be
unlocked.   A future patch will resolve this so done_dentry_lookup() can
be used in done_path_create().

Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/libfs.c             |  25 -----
 fs/namei.c             | 229 +++++++++++++++++++++++++++++------------
 include/linux/dcache.h |  16 +--
 include/linux/fs.h     |   1 -
 include/linux/namei.h  |  15 +++
 5 files changed, 184 insertions(+), 102 deletions(-)

diff --git a/fs/libfs.c b/fs/libfs.c
index ce8c496a6940..f31b80d8de18 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -2288,28 +2288,3 @@ void stashed_dentry_prune(struct dentry *dentry)
 	 */
 	cmpxchg(stashed, dentry, NULL);
 }
-
-/* parent must be held exclusive */
-struct dentry *simple_start_creating(struct dentry *parent, const char *na=
me)
-{
-	struct dentry *dentry;
-	struct inode *dir =3D d_inode(parent);
-
-	inode_lock(dir);
-	if (unlikely(IS_DEADDIR(dir))) {
-		inode_unlock(dir);
-		return ERR_PTR(-ENOENT);
-	}
-	dentry =3D lookup_noperm(&QSTR(name), parent);
-	if (IS_ERR(dentry)) {
-		inode_unlock(dir);
-		return dentry;
-	}
-	if (dentry->d_inode) {
-		dput(dentry);
-		inode_unlock(dir);
-		return ERR_PTR(-EEXIST);
-	}
-	return dentry;
-}
-EXPORT_SYMBOL(simple_start_creating);
diff --git a/fs/namei.c b/fs/namei.c
index 62c1e2268942..85b981248a90 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1714,6 +1714,152 @@ struct dentry *lookup_one_qstr_excl(const struct qs=
tr *name,
 }
 EXPORT_SYMBOL(lookup_one_qstr_excl);
=20
+static int lookup_noperm_common(struct qstr *qname, struct dentry *base)
+{
+	const char *name =3D qname->name;
+	u32 len =3D qname->len;
+
+	qname->hash =3D full_name_hash(base, name, len);
+	if (!len)
+		return -EACCES;
+
+	if (is_dot_dotdot(name, len))
+		return -EACCES;
+
+	while (len--) {
+		unsigned int c =3D *(const unsigned char *)name++;
+		if (c =3D=3D '/' || c =3D=3D '\0')
+			return -EACCES;
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err =3D base->d_op->d_hash(base, qname);
+		if (err < 0)
+			return err;
+	}
+	return 0;
+}
+
+static int lookup_one_common(struct mnt_idmap *idmap,
+			     struct qstr *qname, struct dentry *base)
+{
+	int err;
+	err =3D lookup_noperm_common(qname, base);
+	if (err < 0)
+		return err;
+	return inode_permission(idmap, base->d_inode, MAY_EXEC);
+}
+
+/**
+ * __dentry_lookup - lookup and lock a name prior to dir ops
+ * @last: the name in the given directory
+ * @base: the directory in which the name is to be found
+ * @lookup_flags: %LOOKUP_xxx flags
+ *
+ * The name is looked up and necessary locks are taken so that
+ * the name can be created or removed.
+ * The "necessary locks" are currently the inode lock on @base.
+ * The name @last is expected to already have the hash calculated.
+ * No permission checks are performed.
+ * This function is for VFS-internal use only.
+ *
+ * Returns: the dentry, suitably locked, or an ERR_PTR().
+ */
+static struct dentry *__dentry_lookup(struct qstr *last,
+				      struct dentry *base,
+				      unsigned int lookup_flags)
+{
+	struct dentry *dentry;
+
+	inode_lock_nested(base->d_inode, I_MUTEX_PARENT);
+
+	dentry =3D lookup_one_qstr_excl(last, base, lookup_flags);
+	if (IS_ERR(dentry))
+		inode_unlock(base->d_inode);
+	return dentry;
+}
+
+/**
+ * dentry_lookup_noperm - lookup and lock a name prior to dir ops
+ * @last: the name in the given directory
+ * @base: the directory in which the name is to be found
+ * @lookup_flags: %LOOKUP_xxx flags
+ *
+ * The name is looked up and necessary locks are taken so that
+ * the name can be created or removed.
+ * The "necessary locks" are currently the inode lock on @base.
+ * The name @last is NOT expected to have the hash calculated.
+ * No permission checks are performed.
+ * Returns: the dentry, suitably locked, or an ERR_PTR().
+ */
+struct dentry *dentry_lookup_noperm(struct qstr *last,
+				    struct dentry *base,
+				    unsigned int lookup_flags)
+{
+	int err;
+
+	err =3D lookup_noperm_common(last, base);
+	if (err < 0)
+		return ERR_PTR(err);
+	return __dentry_lookup(last, base, lookup_flags);
+}
+EXPORT_SYMBOL(dentry_lookup_noperm);
+
+/**
+ * dentry_lookup - lookup and lock a name prior to dir ops
+ * @idmap: idmap of the mount the lookup is performed from
+ * @last: the name in the given directory
+ * @base: the directory in which the name is to be found
+ * @lookup_flags: %LOOKUP_xxx flags
+ *
+ * The name is looked up and necessary locks are taken so that
+ * the name can be created or removed.
+ * The "necessary locks" are currently the inode lock on @base.
+ * The name @last is NOT expected to already have the hash calculated.
+ * Permission checks are performed to ensure %MAY_EXEC access to @base.
+ *
+ * Returns: the dentry, suitably locked, or an ERR_PTR().
+ */
+struct dentry *dentry_lookup(struct mnt_idmap *idmap,
+			     struct qstr *last,
+			     struct dentry *base,
+			     unsigned int lookup_flags)
+{
+	int err;
+
+	err =3D lookup_one_common(idmap, last, base);
+	if (err < 0)
+		return ERR_PTR(err);
+	return __dentry_lookup(last, base, lookup_flags);
+}
+EXPORT_SYMBOL(dentry_lookup);
+
+/**
+ * done_dentry_lookup - finish a lookup used for create/delete
+ * @dentry:  the target dentry
+ *
+ * Reverse the effects of dentry_lookup() or similar.  If the
+ * @dentry is not an error, the lock and the reference to the dentry
+ * are dropped.
+ *
+ * This interface allows a smooth transition from parent-dir based
+ * locking to dentry based locking.
+ *
+ * This interface is NOT suficient for vfs_mkdir() usage without
+ * extra care as vfs_mkdir() consumes a dentry withouty dropping the lock.
+ */
+void done_dentry_lookup(struct dentry *dentry)
+{
+	if (!IS_ERR(dentry)) {
+		inode_unlock(dentry->d_parent->d_inode);
+		dput(dentry);
+	}
+}
+EXPORT_SYMBOL(done_dentry_lookup);
+
 /**
  * lookup_fast - do fast lockless (but racy) lookup of a dentry
  * @nd: current nameidata
@@ -2756,12 +2902,9 @@ static struct dentry *__kern_path_locked(int dfd, st=
ruct filename *name, struct
 		return ERR_PTR(error);
 	if (unlikely(type !=3D LAST_NORM))
 		return ERR_PTR(-EINVAL);
-	inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
-	d =3D lookup_one_qstr_excl(&last, parent_path.dentry, 0);
-	if (IS_ERR(d)) {
-		inode_unlock(parent_path.dentry->d_inode);
+	d =3D __dentry_lookup(&last, parent_path.dentry, 0);
+	if (IS_ERR(d))
 		return d;
-	}
 	path->dentry =3D no_free_ptr(parent_path.dentry);
 	path->mnt =3D no_free_ptr(parent_path.mnt);
 	return d;
@@ -2780,12 +2923,9 @@ struct dentry *kern_path_locked_negative(const char =
*name, struct path *path)
 		return ERR_PTR(error);
 	if (unlikely(type !=3D LAST_NORM))
 		return ERR_PTR(-EINVAL);
-	inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
-	d =3D lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE);
-	if (IS_ERR(d)) {
-		inode_unlock(parent_path.dentry->d_inode);
+	d =3D __dentry_lookup(&last, parent_path.dentry, LOOKUP_CREATE);
+	if (IS_ERR(d))
 		return d;
-	}
 	path->dentry =3D no_free_ptr(parent_path.dentry);
 	path->mnt =3D no_free_ptr(parent_path.mnt);
 	return d;
@@ -2863,45 +3003,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vf=
smount *mnt,
 }
 EXPORT_SYMBOL(vfs_path_lookup);
=20
-static int lookup_noperm_common(struct qstr *qname, struct dentry *base)
-{
-	const char *name =3D qname->name;
-	u32 len =3D qname->len;
-
-	qname->hash =3D full_name_hash(base, name, len);
-	if (!len)
-		return -EACCES;
-
-	if (is_dot_dotdot(name, len))
-		return -EACCES;
-
-	while (len--) {
-		unsigned int c =3D *(const unsigned char *)name++;
-		if (c =3D=3D '/' || c =3D=3D '\0')
-			return -EACCES;
-	}
-	/*
-	 * See if the low-level filesystem might want
-	 * to use its own hash..
-	 */
-	if (base->d_flags & DCACHE_OP_HASH) {
-		int err =3D base->d_op->d_hash(base, qname);
-		if (err < 0)
-			return err;
-	}
-	return 0;
-}
-
-static int lookup_one_common(struct mnt_idmap *idmap,
-			     struct qstr *qname, struct dentry *base)
-{
-	int err;
-	err =3D lookup_noperm_common(qname, base);
-	if (err < 0)
-		return err;
-	return inode_permission(idmap, base->d_inode, MAY_EXEC);
-}
-
 /**
  * try_lookup_noperm - filesystem helper to lookup single pathname compone=
nt
  * @name:	qstr storing pathname component to lookup
@@ -4125,7 +4226,7 @@ static struct dentry *filename_create(int dfd, struct=
 filename *name,
 	 * (foo/., foo/.., /////)
 	 */
 	if (unlikely(type !=3D LAST_NORM))
-		goto out;
+		goto put;
=20
 	/* don't fail immediately if it's r/o, at least try to report other error=
s */
 	error =3D mnt_want_write(path->mnt);
@@ -4135,24 +4236,20 @@ static struct dentry *filename_create(int dfd, stru=
ct filename *name,
 	 */
 	if (last.name[last.len] && !want_dir)
 		create_flags &=3D ~LOOKUP_CREATE;
-	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
-	dentry =3D lookup_one_qstr_excl(&last, path->dentry,
-				      reval_flag | create_flags);
+	dentry =3D __dentry_lookup(&last, path->dentry, reval_flag | create_flags=
);
 	if (IS_ERR(dentry))
-		goto unlock;
+		goto drop;
=20
 	if (unlikely(error))
 		goto fail;
-
 	return dentry;
 fail:
-	dput(dentry);
+	done_dentry_lookup(dentry);
 	dentry =3D ERR_PTR(error);
-unlock:
-	inode_unlock(path->dentry->d_inode);
+drop:
 	if (!error)
 		mnt_drop_write(path->mnt);
-out:
+put:
 	path_put(path);
 	return dentry;
 }
@@ -4503,19 +4600,18 @@ int do_rmdir(int dfd, struct filename *name)
 	if (error)
 		goto exit2;
=20
-	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
-	dentry =3D lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
+	dentry =3D __dentry_lookup(&last, path.dentry, lookup_flags);
 	error =3D PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto exit3;
+
 	error =3D security_path_rmdir(&path, dentry);
 	if (error)
 		goto exit4;
 	error =3D vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry);
 exit4:
-	dput(dentry);
+	done_dentry_lookup(dentry);
 exit3:
-	inode_unlock(path.dentry->d_inode);
 	mnt_drop_write(path.mnt);
 exit2:
 	path_put(&path);
@@ -4632,11 +4728,9 @@ int do_unlinkat(int dfd, struct filename *name)
 	if (error)
 		goto exit2;
 retry_deleg:
-	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
-	dentry =3D lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
+	dentry =3D __dentry_lookup(&last, path.dentry, lookup_flags);
 	error =3D PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
-
 		/* Why not before? Because we want correct error value */
 		if (last.name[last.len])
 			goto slashes;
@@ -4648,9 +4742,8 @@ int do_unlinkat(int dfd, struct filename *name)
 		error =3D vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
 				   dentry, &delegated_inode);
 exit3:
-		dput(dentry);
+		done_dentry_lookup(dentry);
 	}
-	inode_unlock(path.dentry->d_inode);
 	if (inode)
 		iput(inode);	/* truncate the inode here */
 	inode =3D NULL;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index cc3e1c1a3454..5d53489e5556 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -327,13 +327,13 @@ static inline struct dentry *dget_dlock(struct dentry=
 *dentry)
  * dget - get a reference to a dentry
  * @dentry: dentry to get a reference to
  *
- * Given a dentry or %NULL pointer increment the reference count
- * if appropriate and return the dentry.  A dentry will not be
- * destroyed when it has references.  Conversely, a dentry with
- * no references can disappear for any number of reasons, starting
- * with memory pressure.  In other words, that primitive is
- * used to clone an existing reference; using it on something with
- * zero refcount is a bug.
+ * Given a dentry, PTR_ERR, or %NULL pointer increment the reference
+ * count if appropriate and return the dentry.  A dentry will not be
+ * destroyed when it has references.  Conversely, a dentry with no
+ * references can disappear for any number of reasons, starting with
+ * memory pressure.  In other words, that primitive is used to clone an
+ * existing reference; using it on something with zero refcount is a
+ * bug.
  *
  * NOTE: it will spin if @dentry->d_lock is held.  From the deadlock
  * avoidance point of view it is equivalent to spin_lock()/increment
@@ -343,7 +343,7 @@ static inline struct dentry *dget_dlock(struct dentry *=
dentry)
  */
 static inline struct dentry *dget(struct dentry *dentry)
 {
-	if (dentry)
+	if (!IS_ERR_OR_NULL(dentry))
 		lockref_get(&dentry->d_lockref);
 	return dentry;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d7ab4f96d705..db644150b58f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3656,7 +3656,6 @@ extern int simple_fill_super(struct super_block *, un=
signed long,
 			     const struct tree_descr *);
 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **moun=
t, int *count);
 extern void simple_release_fs(struct vfsmount **mount, int *count);
-struct dentry *simple_start_creating(struct dentry *, const char *);
=20
 extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
 			loff_t *ppos, const void *from, size_t available);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 5d085428e471..932cb94c3538 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -80,6 +80,21 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idm=
ap,
 struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
 					    struct qstr *name,
 					    struct dentry *base);
+struct dentry *dentry_lookup(struct mnt_idmap *idmap,
+			       struct qstr *last, struct dentry *base,
+			       unsigned int lookup_flags);
+struct dentry *dentry_lookup_noperm(struct qstr *name, struct dentry *base,
+				      unsigned int lookup_flags);
+void done_dentry_lookup(struct dentry *dentry);
+/* no_free_ptr() must not be used here - use dget() */
+DEFINE_FREE(dentry_lookup, struct dentry *, if (_T) done_dentry_lookup(_T))
+
+static inline
+struct dentry *simple_start_creating(struct dentry *parent, const char *na=
me)
+{
+	return dentry_lookup_noperm(&QSTR(name), parent,
+				    LOOKUP_CREATE | LOOKUP_EXCL);
+}
=20
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *path, unsigned int flags);
--=20
2.50.0.107.gf914562f5916.dirty
From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id DFE162D4B6D;
	Tue, 12 Aug 2025 23:53:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042784; cv=none;
 b=iV05z4v0d3gnUvprKJzocxpd0r2v/x7bR9Mh9jt0ryGYbLmOdoHJov/5avMIRmdJVJHHPhrhQSOA+DAWbLBugJ6pahtgJ0P7xSSUUMKfBOnWaWNiLMqkN+jbUW2QuJ/SJK6jB1zYwt4BFy+KzAZZJb825e4kNy7uL4wEoVmkOkc=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042784; c=relaxed/simple;
	bh=wY0rj6H9uNX526KJkK3oQKoJg0U9wk/QJcI6qW2yVlg=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=rcoyDzD/BnJxPRfGx/DD2XPnTaNox/xWGCU137RLX0x7YyI6s/OxVQCkjLycdkjRTW9GfijiVUVyyYMCanZa6tM0qlyrnsdgftyGc3ra0dmSdbGBiE0b6XEdLZe/4NzfWnmQh/XmvybfrPFUXdDY7O8w3ERFsOTluHYDgAUeWp0=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynK-005Y1y-Ic;
	Tue, 12 Aug 2025 23:52:44 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 03/11] VFS: add dentry_lookup_killable()
Date: Tue, 12 Aug 2025 12:25:06 +1000
Message-ID: <20250812235228.3072318-4-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

btrfs/ioctl.c uses a "killable" lock on the directory when creating an
destroying subvols.  overlayfs also does this.

This patch adds dentry_lookup_killable() for these users.

Possibly all dentry_lookup should be killable as there is no down-side,
but that can come in a later patch.

Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/namei.c            | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/namei.h |  3 +++
 2 files changed, 40 insertions(+)

diff --git a/fs/namei.c b/fs/namei.c
index 85b981248a90..7af9b464886a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1837,6 +1837,43 @@ struct dentry *dentry_lookup(struct mnt_idmap *idmap,
 }
 EXPORT_SYMBOL(dentry_lookup);
=20
+/**
+ * dentry_lookup_killable - lookup and lock a name prior to dir ops
+ * @last: the name in the given directory
+ * @base: the directory in which the name is to be found
+ * @lookup_flags: %LOOKUP_xxx flags
+ *
+ * The name is looked up and necessary locks are taken so that
+ * the name can be created or removed.
+ * The "necessary locks" are currently the inode lock on @base.
+ * If a fatal signal arrives, or is already pending, the operation is abor=
ted.
+ * The name @last is NOT expected to already have the hash calculated.
+ * Permission checks are performed to ensure %MAY_EXEC access to @base.
+ * Returns: the dentry, suitably locked, or an ERR_PTR().
+ */
+struct dentry *dentry_lookup_killable(struct mnt_idmap *idmap,
+				      struct qstr *last,
+				      struct dentry *base,
+				      unsigned int lookup_flags)
+{
+	struct dentry *dentry;
+	int err;
+
+	err =3D lookup_one_common(idmap, last, base);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	err =3D down_write_killable_nested(&base->d_inode->i_rwsem, I_MUTEX_PAREN=
T);
+	if (err)
+		return ERR_PTR(err);
+
+	dentry =3D lookup_one_qstr_excl(last, base, lookup_flags);
+	if (IS_ERR(dentry))
+		inode_unlock(base->d_inode);
+	return dentry;
+}
+EXPORT_SYMBOL(dentry_lookup_killable);
+
 /**
  * done_dentry_lookup - finish a lookup used for create/delete
  * @dentry:  the target dentry
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 932cb94c3538..facb5852afa9 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -83,6 +83,9 @@ struct dentry *lookup_one_positive_unlocked(struct mnt_id=
map *idmap,
 struct dentry *dentry_lookup(struct mnt_idmap *idmap,
 			       struct qstr *last, struct dentry *base,
 			       unsigned int lookup_flags);
+struct dentry *dentry_lookup_killable(struct mnt_idmap *idmap,
+				      struct qstr *last, struct dentry *base,
+				      unsigned int lookup_flags);
 struct dentry *dentry_lookup_noperm(struct qstr *name, struct dentry *base,
 				      unsigned int lookup_flags);
 void done_dentry_lookup(struct dentry *dentry);
--=20
2.50.0.107.gf914562f5916.dirty
From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 03AAE2D8375;
	Tue, 12 Aug 2025 23:53:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042784; cv=none;
 b=eX373CJkVVShnR6DqTpS/T04FEIEKW4pTFkI8QiMDH/PsDnVZ0f3ITTG3JZy7GWb1zndCDT0mJz0ELksU7Dzc/AHwuXEMa0tVgFBUMrDG8lQaR/fV+G4vd2olEf1Ad2vTn60CigyNJ3NDLFu8/o3r1Hf/MqpttboAvBFrOkV4mk=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042784; c=relaxed/simple;
	bh=Ws9vdrtNTe7GKpziIopOHzn03qCiXQiZM2MeU+FDdAY=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=Wb1QfPXbiNNaMf8PW8EvvnSKNuUkipKOncn4xZU+iUK55zIMSHEfw13oqC3LQR3Cj4uPkhIdN0sBPu6u2EmiKzlvAnF4I1JMgAy9uL9SwGPlvAUSo4Kq+wm57YY2UdQgl1tK4NSUJor1ht7pIEOSZMXjwZhGymMFM+OF9ViVK0Q=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynL-005Y20-C5;
	Tue, 12 Aug 2025 23:52:45 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 04/11] VFS: introduce dentry_lookup_continue()
Date: Tue, 12 Aug 2025 12:25:07 +1000
Message-ID: <20250812235228.3072318-5-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

A few callers operate on a dentry which they already have - unlike the
normal case where a lookup proceeds an operation.

For these callers dentry_lookup_continue() is provided where other
callers would use dentry_lookup().  The call will fail if, after the
lock was gained, the child is no longer a child of the given parent.

There are a couple of callers that want to lock a dentry in whatever
its current parent is.  For these a NULL parent can be passed, in which
case ->d_parent is used.  In this case the call cannot fail.

The idea behind the name is that the actual lookup occurred some time
ago, and now we are continuing with an operation on the dentry.

When the operation completes done_dentry_lookup() must be called.  An
extra reference is taken when the dentry_lookup_continue() call succeeds
and will be dropped by done_dentry_lookup().

This will be used in smb/server, ecryptfs, and overlayfs, each of which
have their own lock_parent() or parent_lock() or similar; and a few
other places which lock the parent but don't check if the parent is
still correct (often because rename isn't supported so parent cannot be
incorrect).

Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/namei.c            | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/namei.h |  2 ++
 2 files changed, 41 insertions(+)

diff --git a/fs/namei.c b/fs/namei.c
index 7af9b464886a..df21b6fa5a0e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1874,6 +1874,45 @@ struct dentry *dentry_lookup_killable(struct mnt_idm=
ap *idmap,
 }
 EXPORT_SYMBOL(dentry_lookup_killable);
=20
+/**
+ * dentry_lookup_continue: lock a dentry if it is still in the given paren=
t, prior to dir ops
+ * @child: the dentry to lock
+ * @parent: the dentry of the assumed parent
+ *
+ * The child is locked - currently by taking i_rwsem on the parent - to
+ * prepare for create/remove operations.  If the given parent is not
+ * %NULL and is no longer the parent of the dentry after the lock is
+ * gained, the lock is released and the call fails (returns
+ * ERR_PTR(-EINVAL).
+ *
+ * On success a reference to the child is taken and returned.  The lock
+ * and reference must both be dropped by done_dentry_lookup() after the
+ * operation completes.
+ */
+struct dentry *dentry_lookup_continue(struct dentry *child,
+				      struct dentry *parent)
+{
+	struct dentry *p =3D parent;
+
+again:
+	if (!parent)
+		p =3D dget_parent(child);
+	inode_lock_nested(d_inode(p), I_MUTEX_PARENT);
+	if (child->d_parent !=3D p) {
+		inode_unlock(d_inode(p));
+		if (!parent) {
+			dput(p);
+			goto again;
+		}
+		return ERR_PTR(-EINVAL);
+	}
+	if (!parent)
+		dput(p);
+	/* get the child to balance with done_dentry_lookup() which puts it. */
+	return dget(child);
+}
+EXPORT_SYMBOL(dentry_lookup_continue);
+
 /**
  * done_dentry_lookup - finish a lookup used for create/delete
  * @dentry:  the target dentry
diff --git a/include/linux/namei.h b/include/linux/namei.h
index facb5852afa9..67eef91603cc 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -88,6 +88,8 @@ struct dentry *dentry_lookup_killable(struct mnt_idmap *i=
dmap,
 				      unsigned int lookup_flags);
 struct dentry *dentry_lookup_noperm(struct qstr *name, struct dentry *base,
 				      unsigned int lookup_flags);
+struct dentry *dentry_lookup_continue(struct dentry *child,
+				      struct dentry *parent);
 void done_dentry_lookup(struct dentry *dentry);
 /* no_free_ptr() must not be used here - use dget() */
 DEFINE_FREE(dentry_lookup, struct dentry *, if (_T) done_dentry_lookup(_T))
--=20
2.50.0.107.gf914562f5916.dirty
From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1651D2D97B0;
	Tue, 12 Aug 2025 23:53:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042786; cv=none;
 b=d+cW59WnT9Z74PysFhXypjzuP+O6khVNzgj3LilWArd5VVtbpsm86b5A5phZ2K0eOJ9Fk5TAz8d9mo6+Cf2NQO4EtlxsEgCMGxJKWPcKTjotIgDDOgnvxJymo/EmcMZZQWP5F891xkfP+DLLJHOY7TUoFBnDAxLIE0r/IT/+tEs=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042786; c=relaxed/simple;
	bh=Fxzyw9jaYjDcrQDDTrdlviF3rA886DmUXWc9I/mH1Po=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=TgZc7KnYsd303yW2WSubI5lUywODwzN+/y/3yjE6AfqILT8bphRxSU2IRDhIQ5913Oov0KV3KVnN4l9uSsoGk4Cd+bueCsf7HjU2hNyMFihJVF76AnHl6LBfVxSVFo+L5o8I3wv1ekd9sYt5qlY1BPz6/gKeKrP9ZEBmYX5E54Y=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynM-005Y23-70;
	Tue, 12 Aug 2025 23:52:45 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 05/11] VFS: add rename_lookup()
Date: Tue, 12 Aug 2025 12:25:08 +1000
Message-ID: <20250812235228.3072318-6-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

rename_lookup() combines lookup and locking for a rename.

Two names - new_last and old_last - are added to struct renamedata so it
can be passed to rename_lookup() to have the old and new dentries filled
in.

__rename_lookup() in vfs-internal and assumes that the names are already
hashed and skips permission checking.  This is appropriate for use after
filename_parentat().

rename_lookup_noperm() does hash the name but avoids permission
checking.  This will be used by debugfs.

If either old_dentry or new_dentry are not NULL, the corresponding
"last" is ignored and the dentry is used as-is.  This provides similar
functionality to dentry_lookup_continue().  After locks are obtained we
check that the parent is still correct.  If old_parent was not given,
then it is set to the parent of old_dentry which was locked.  new_parent
must never be NULL.

On success new references are geld on old_dentry, new_dentry and old_parent.

done_rename_lookup() unlocks and drops those three references.

No __free() support is provided as done_rename_lookup() cannot be safely
called after rename_lookup() returns an error.

Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/namei.c            | 318 ++++++++++++++++++++++++++++++++++--------
 include/linux/fs.h    |   4 +
 include/linux/namei.h |   3 +
 3 files changed, 263 insertions(+), 62 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index df21b6fa5a0e..cead810d53c6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3507,6 +3507,233 @@ void unlock_rename(struct dentry *p1, struct dentry=
 *p2)
 }
 EXPORT_SYMBOL(unlock_rename);
=20
+/**
+ * __rename_lookup - lookup and lock names for rename
+ * @rd:           rename data containing relevant details
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ *                LOOKUP_NO_SYMLINKS etc).
+ *
+ * Optionally look up two names and ensure locks are in place for
+ * rename.
+ * Normally @rd.old_dentry and @rd.new_dentry are %NULL and the
+ * old and new directories and last names are given in @rd.  In this
+ * case the names are looked up with appropriate locking and the
+ * results stored in @rd.old_dentry and @rd.new_dentry.
+ *
+ * If either are not NULL, then the corresponding lookup is avoided but
+ * the required locks are still taken.  In this case @rd.old_parent may
+ * be %NULL, otherwise @rd.old_dentry must still have @rd.old_parent as
+ * its d_parent after the locks are obtained.  @rd.new_parent must
+ * always be non-NULL, and must always be the correct parent after
+ * locking.
+ *
+ * On success a reference is held on @rd.old_dentry, @rd.new_dentry,
+ * and @rd.old_parent whether they were originally %NULL or not.  These
+ * references are dropped by done_rename_lookup().  @rd.new_parent
+ * must always be non-NULL and no extra reference is taken.
+ *
+ * The passed in qstrs must have the hash calculated, and no permission
+ * checking is performed.
+ *
+ * Returns: zero or an error.
+ */
+static int
+__rename_lookup(struct renamedata *rd, int lookup_flags)
+{
+	struct dentry *p;
+	struct dentry *d1, *d2;
+	int target_flags =3D LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
+	int err;
+
+	if (rd->flags & RENAME_EXCHANGE)
+		target_flags =3D 0;
+	if (rd->flags & RENAME_NOREPLACE)
+		target_flags |=3D LOOKUP_EXCL;
+
+	if (rd->old_dentry) {
+		/* Already have the dentry - need to be sure to lock the correct parent =
*/
+		p =3D lock_rename_child(rd->old_dentry, rd->new_parent);
+		if (IS_ERR(p))
+			return PTR_ERR(p);
+		if (d_unhashed(rd->old_dentry) ||
+		    (rd->old_parent && rd->old_parent !=3D rd->old_dentry->d_parent)) {
+			/* dentry was removed, or moved and explicit parent requested */
+			unlock_rename(rd->old_dentry->d_parent, rd->new_parent);
+			return -EINVAL;
+		}
+		rd->old_parent =3D dget(rd->old_dentry->d_parent);
+		d1 =3D dget(rd->old_dentry);
+	} else {
+		p =3D lock_rename(rd->old_parent, rd->new_parent);
+		if (IS_ERR(p))
+			return PTR_ERR(p);
+		dget(rd->old_parent);
+
+		d1 =3D lookup_one_qstr_excl(&rd->old_last, rd->old_parent,
+					  lookup_flags);
+		if (IS_ERR(d1))
+			goto out_unlock_1;
+	}
+	if (rd->new_dentry) {
+		if (d_unhashed(rd->new_dentry) ||
+		    rd->new_dentry->d_parent !=3D rd->new_parent) {
+			/* new_dentry was moved or removed! */
+			goto out_unlock_2;
+		}
+		d2 =3D dget(rd->new_dentry);
+	} else {
+		d2 =3D lookup_one_qstr_excl(&rd->new_last, rd->new_parent,
+					  lookup_flags | target_flags);
+		if (IS_ERR(d2))
+			goto out_unlock_2;
+	}
+
+	if (d1 =3D=3D p) {
+		/* source is an ancestor of target */
+		err =3D -EINVAL;
+		goto out_unlock_3;
+	}
+
+	if (d2 =3D=3D p) {
+		/* target is an ancestor of source */
+		if (rd->flags & RENAME_EXCHANGE)
+			err =3D -EINVAL;
+		else
+			err =3D -ENOTEMPTY;
+		goto out_unlock_3;
+	}
+
+	rd->old_dentry =3D d1;
+	rd->new_dentry =3D d2;
+	return 0;
+
+out_unlock_3:
+	dput(d2);
+	d2 =3D ERR_PTR(err);
+out_unlock_2:
+	dput(d1);
+	d1 =3D d2;
+out_unlock_1:
+	unlock_rename(rd->old_parent, rd->new_parent);
+	dput(rd->old_parent);
+	return PTR_ERR(d1);
+}
+
+/**
+ * rename_lookup_noperm - lookup and lock names for rename
+ * @rd:           rename data containing relevant details
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ *                LOOKUP_NO_SYMLINKS etc).
+ *
+ * Optionally look up two names and ensure locks are in place for
+ * rename.
+ * Normally @rd.old_dentry and @rd.new_dentry are %NULL and the
+ * old and new directories and last names are given in @rd.  In this
+ * case the names are looked up with appropriate locking and the
+ * results stored in @rd.old_dentry and @rd.new_dentry.
+ *
+ * If either are not NULL, then the corresponding lookup is avoided but
+ * the required locks are still taken.  In this case @rd.old_parent may
+ * be %NULL, otherwise @rd.old_dentry must still have @rd.old_parent as
+ * its d_parent after the locks are obtained.  @rd.new_parent must
+ * always be non-NULL, and must always be the correct parent after
+ * locking.
+ *
+ * On success a reference is held on @rd.old_dentry, @rd.new_dentry,
+ * and @rd.old_parent whether they were originally %NULL or not.  These
+ * references are dropped by done_rename_lookup().  @rd.new_parent
+ * must always be non-NULL and no extra reference is taken.
+ *
+ * The passed in qstrs need not have the hash calculated, and no
+ * permission checking is performed.
+ *
+ * Returns: zero or an error.
+ */
+int rename_lookup_noperm(struct renamedata *rd, int lookup_flags)
+{
+	int err;
+
+	if (!rd->old_dentry) {
+		err =3D lookup_noperm_common(&rd->old_last, rd->old_parent);
+		if (err)
+			return err;
+	}
+	if (!rd->new_dentry) {
+		err =3D lookup_noperm_common(&rd->new_last, rd->new_parent);
+		if (err)
+			return err;
+	}
+	return __rename_lookup(rd, lookup_flags);
+}
+EXPORT_SYMBOL(rename_lookup_noperm);
+
+/**
+ * rename_lookup - lookup and lock names for rename
+ * @rd:           rename data containing relevant details
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ *                LOOKUP_NO_SYMLINKS etc).
+ *
+ * Optionally look up two names and ensure locks are in place for
+ * rename.
+ * Normally @rd.old_dentry and @rd.new_dentry are %NULL and the
+ * old and new directories and last names are given in @rd.  In this
+ * case the names are looked up with appropriate locking and the
+ * results stored in @rd.old_dentry and @rd.new_dentry.
+ *
+ * If either are not NULL, then the corresponding lookup is avoided but
+ * the required locks are still taken.  In this case @rd.old_parent may
+ * be %NULL, otherwise @rd.old_dentry must still have @rd.old_parent as
+ * its d_parent after the locks are obtained.  @rd.new_parent must
+ * always be non-NULL, and must always be the correct parent after
+ * locking.
+ *
+ * On success a reference is held on @rd.old_dentry, @rd.new_dentry,
+ * and @rd.old_parent whether they were originally %NULL or not.  These
+ * references are dropped by done_rename_lookup().  @rd.new_parent
+ * must always be non-NULL and no extra reference is taken.
+ *
+ * The passed in qstrs need not have the hash calculated, and normal
+ * permission checking for MAY_EXEC is performed.
+ *
+ * Returns: zero or an error.
+ */
+int rename_lookup(struct renamedata *rd, int lookup_flags)
+{
+	int err;
+
+	if (!rd->old_dentry) {
+		err =3D lookup_one_common(rd->old_mnt_idmap, &rd->old_last,
+					rd->old_parent);
+		if (err)
+			return err;
+	}
+	if (!rd->new_dentry) {
+		err =3D lookup_one_common(rd->new_mnt_idmap, &rd->new_last,
+					rd->new_parent);
+		if (err)
+			return err;
+	}
+	return __rename_lookup(rd, lookup_flags);
+}
+EXPORT_SYMBOL(rename_lookup);
+
+/**
+ * done_rename_lookup - unlock dentries after rename
+ * @rd: the struct renamedata that was passed to rename_lookup()
+ *
+ * After a successful rename_lookup() (or similar) call, and after
+ * any required renaming is performed, done_rename_rename() must be called
+ * to drop any locks and references that were obtained by the earlier func=
tion.
+ */
+void done_rename_lookup(struct renamedata *rd)
+{
+	unlock_rename(rd->old_parent, rd->new_parent);
+	dput(rd->old_parent);
+	dput(rd->old_dentry);
+	dput(rd->new_dentry);
+}
+EXPORT_SYMBOL(done_rename_lookup);
+
 /**
  * vfs_prepare_mode - prepare the mode to be used for a new inode
  * @idmap:	idmap of the mount the inode was found from
@@ -5336,14 +5563,10 @@ int do_renameat2(int olddfd, struct filename *from,=
 int newdfd,
 		 struct filename *to, unsigned int flags)
 {
 	struct renamedata rd;
-	struct dentry *old_dentry, *new_dentry;
-	struct dentry *trap;
 	struct path old_path, new_path;
-	struct qstr old_last, new_last;
 	int old_type, new_type;
 	struct inode *delegated_inode =3D NULL;
-	unsigned int lookup_flags =3D 0, target_flags =3D
-		LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
+	unsigned int lookup_flags =3D 0;
 	bool should_retry =3D false;
 	int error =3D -EINVAL;
=20
@@ -5354,19 +5577,14 @@ int do_renameat2(int olddfd, struct filename *from,=
 int newdfd,
 	    (flags & RENAME_EXCHANGE))
 		goto put_names;
=20
-	if (flags & RENAME_EXCHANGE)
-		target_flags =3D 0;
-	if (flags & RENAME_NOREPLACE)
-		target_flags |=3D LOOKUP_EXCL;
-
 retry:
 	error =3D filename_parentat(olddfd, from, lookup_flags, &old_path,
-				  &old_last, &old_type);
+				  &rd.old_last, &old_type);
 	if (error)
 		goto put_names;
=20
-	error =3D filename_parentat(newdfd, to, lookup_flags, &new_path, &new_las=
t,
-				  &new_type);
+	error =3D filename_parentat(newdfd, to, lookup_flags, &new_path,
+				  &rd.new_last, &new_type);
 	if (error)
 		goto exit1;
=20
@@ -5388,67 +5606,43 @@ int do_renameat2(int olddfd, struct filename *from,=
 int newdfd,
 		goto exit2;
=20
 retry_deleg:
-	trap =3D lock_rename(new_path.dentry, old_path.dentry);
-	if (IS_ERR(trap)) {
-		error =3D PTR_ERR(trap);
+	rd.old_parent	   =3D old_path.dentry;
+	rd.old_mnt_idmap   =3D mnt_idmap(old_path.mnt);
+	rd.old_dentry	   =3D NULL;
+	rd.new_parent	   =3D new_path.dentry;
+	rd.new_mnt_idmap   =3D mnt_idmap(new_path.mnt);
+	rd.new_dentry	   =3D NULL;
+	rd.delegated_inode =3D &delegated_inode;
+	rd.flags	   =3D flags;
+
+	error =3D __rename_lookup(&rd, lookup_flags);
+	if (error)
 		goto exit_lock_rename;
-	}
=20
-	old_dentry =3D lookup_one_qstr_excl(&old_last, old_path.dentry,
-					  lookup_flags);
-	error =3D PTR_ERR(old_dentry);
-	if (IS_ERR(old_dentry))
-		goto exit3;
-	new_dentry =3D lookup_one_qstr_excl(&new_last, new_path.dentry,
-					  lookup_flags | target_flags);
-	error =3D PTR_ERR(new_dentry);
-	if (IS_ERR(new_dentry))
-		goto exit4;
 	if (flags & RENAME_EXCHANGE) {
-		if (!d_is_dir(new_dentry)) {
+		if (!d_is_dir(rd.new_dentry)) {
 			error =3D -ENOTDIR;
-			if (new_last.name[new_last.len])
-				goto exit5;
+			if (rd.new_last.name[rd.new_last.len])
+				goto exit_unlock;
 		}
 	}
 	/* unless the source is a directory trailing slashes give -ENOTDIR */
-	if (!d_is_dir(old_dentry)) {
+	if (!d_is_dir(rd.old_dentry)) {
 		error =3D -ENOTDIR;
-		if (old_last.name[old_last.len])
-			goto exit5;
-		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
-			goto exit5;
-	}
-	/* source should not be ancestor of target */
-	error =3D -EINVAL;
-	if (old_dentry =3D=3D trap)
-		goto exit5;
-	/* target should not be an ancestor of source */
-	if (!(flags & RENAME_EXCHANGE))
-		error =3D -ENOTEMPTY;
-	if (new_dentry =3D=3D trap)
-		goto exit5;
+		if (rd.old_last.name[rd.old_last.len])
+			goto exit_unlock;
+		if (!(flags & RENAME_EXCHANGE) && rd.new_last.name[rd.new_last.len])
+			goto exit_unlock;
+	}
=20
-	error =3D security_path_rename(&old_path, old_dentry,
-				     &new_path, new_dentry, flags);
+	error =3D security_path_rename(&old_path, rd.old_dentry,
+				     &new_path, rd.new_dentry, flags);
 	if (error)
-		goto exit5;
+		goto exit_unlock;
=20
-	rd.old_parent	   =3D old_path.dentry;
-	rd.old_dentry	   =3D old_dentry;
-	rd.old_mnt_idmap   =3D mnt_idmap(old_path.mnt);
-	rd.new_parent	   =3D new_path.dentry;
-	rd.new_dentry	   =3D new_dentry;
-	rd.new_mnt_idmap   =3D mnt_idmap(new_path.mnt);
-	rd.delegated_inode =3D &delegated_inode;
-	rd.flags	   =3D flags;
 	error =3D vfs_rename(&rd);
-exit5:
-	dput(new_dentry);
-exit4:
-	dput(old_dentry);
-exit3:
-	unlock_rename(new_path.dentry, old_path.dentry);
+exit_unlock:
+	done_rename_lookup(&rd);
 exit_lock_rename:
 	if (delegated_inode) {
 		error =3D break_deleg_wait(&delegated_inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index db644150b58f..b12203afa0da 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2011,9 +2011,11 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, s=
truct dentry *,
  * @old_mnt_idmap:     idmap of the old mount the inode was found from
  * @old_parent:        parent of source
  * @old_dentry:                source
+ * @old_last:          name for old_dentry in old_dir, if old_dentry not g=
iven
  * @new_mnt_idmap:     idmap of the new mount the inode was found from
  * @new_parent:        parent of destination
  * @new_dentry:                destination
+ * @new_last:          name for new_dentry in new_dir, if new_dentry not g=
iven
  * @delegated_inode:   returns an inode needing a delegation break
  * @flags:             rename flags
  */
@@ -2021,9 +2023,11 @@ struct renamedata {
 	struct mnt_idmap *old_mnt_idmap;
 	struct dentry *old_parent;
 	struct dentry *old_dentry;
+	struct qstr old_last;
 	struct mnt_idmap *new_mnt_idmap;
 	struct dentry *new_parent;
 	struct dentry *new_dentry;
+	struct qstr new_last;
 	struct inode **delegated_inode;
 	unsigned int flags;
 } __randomize_layout;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 67eef91603cc..26e5778c665f 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -108,6 +108,9 @@ extern int follow_up(struct path *);
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
+int rename_lookup(struct renamedata *rd, int lookup_flags);
+int rename_lookup_noperm(struct renamedata *rd, int lookup_flags);
+void done_rename_lookup(struct renamedata *rd);
=20
 /**
  * mode_strip_umask - handle vfs umask stripping
--=20
2.50.0.107.gf914562f5916.dirty
From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 03A452D7396;
	Tue, 12 Aug 2025 23:53:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042784; cv=none;
 b=RcuSZ3Wun1Gk5qajZLxMI4BouWWLJ0aROkai0F+QHoe3bLBwszIzETWiLFDt2LTsQH37ufwIJxtSk/jct1TP9jjyqW6rECl8zztD/HhEmHc7Rc7tHjwEXP2w3/u530ontQAfyD/Nzz7RhtFDhsAPsvCOqpRqszj94ua+yh+DIPo=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042784; c=relaxed/simple;
	bh=ZRhRQQI6uo3FxzRK6qWDy6tEgqtjrrXxGIqMJAzbusg=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=JmKu/L0025t/mUsW4VuP9o6sT8Z7XcAhraAyMWW6vCG8gqIw10rwENfEt8vmzFKQfAELbgW8b+Yqr1uPhizGMfmavruDDFuSP6fiJg9cyjjZEL0Xey0iNRSya0Grdet4ypDK7lqr2xPWK0rK/wWM6sykKmL3/8gl9B0O0JfLzSQ=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynN-005Y25-0b;
	Tue, 12 Aug 2025 23:52:46 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 06/11] VFS: unify old_mnt_idmap and new_mnt_idmap in
 renamedata
Date: Tue, 12 Aug 2025 12:25:09 +1000
Message-ID: <20250812235228.3072318-7-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

A rename can only rename with a single mount.  Callers of vfs_rename()
must and do ensure this is the case.

So there is no point in having two mnt_idmaps in renamedata as they are
always the same.  Only of of them is passed to ->rename in any case.

This patch replaces both with a single "mnt_idmap" and changes all
callers.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/cachefiles/namei.c    |  3 +--
 fs/ecryptfs/inode.c      |  3 +--
 fs/namei.c               | 21 ++++++++++-----------
 fs/nfsd/vfs.c            |  3 +--
 fs/overlayfs/overlayfs.h |  3 +--
 fs/smb/server/vfs.c      |  3 +--
 include/linux/fs.h       |  6 ++----
 7 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 91dfd0231877..d1edb2ac3837 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -387,10 +387,9 @@ int cachefiles_bury_object(struct cachefiles_cache *ca=
che,
 		cachefiles_io_error(cache, "Rename security error %d", ret);
 	} else {
 		struct renamedata rd =3D {
-			.old_mnt_idmap	=3D &nop_mnt_idmap,
+			.mnt_idmap	=3D &nop_mnt_idmap,
 			.old_parent	=3D dir,
 			.old_dentry	=3D rep,
-			.new_mnt_idmap	=3D &nop_mnt_idmap,
 			.new_parent	=3D cache->graveyard,
 			.new_dentry	=3D grave,
 		};
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 72fbe1316ab8..abd954c6a14e 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -634,10 +634,9 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode =
*old_dir,
 		goto out_lock;
 	}
=20
-	rd.old_mnt_idmap	=3D &nop_mnt_idmap;
+	rd.mnt_idmap		=3D &nop_mnt_idmap;
 	rd.old_parent		=3D lower_old_dir_dentry;
 	rd.old_dentry		=3D lower_old_dentry;
-	rd.new_mnt_idmap	=3D &nop_mnt_idmap;
 	rd.new_parent		=3D lower_new_dir_dentry;
 	rd.new_dentry		=3D lower_new_dentry;
 	rc =3D vfs_rename(&rd);
diff --git a/fs/namei.c b/fs/namei.c
index cead810d53c6..3f930811e952 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3702,13 +3702,13 @@ int rename_lookup(struct renamedata *rd, int lookup=
_flags)
 	int err;
=20
 	if (!rd->old_dentry) {
-		err =3D lookup_one_common(rd->old_mnt_idmap, &rd->old_last,
+		err =3D lookup_one_common(rd->mnt_idmap, &rd->old_last,
 					rd->old_parent);
 		if (err)
 			return err;
 	}
 	if (!rd->new_dentry) {
-		err =3D lookup_one_common(rd->new_mnt_idmap, &rd->new_last,
+		err =3D lookup_one_common(rd->mnt_idmap, &rd->new_last,
 					rd->new_parent);
 		if (err)
 			return err;
@@ -5418,20 +5418,20 @@ int vfs_rename(struct renamedata *rd)
 	if (source =3D=3D target)
 		return 0;
=20
-	error =3D may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir);
+	error =3D may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir);
 	if (error)
 		return error;
=20
 	if (!target) {
-		error =3D may_create(rd->new_mnt_idmap, new_dir, new_dentry);
+		error =3D may_create(rd->mnt_idmap, new_dir, new_dentry);
 	} else {
 		new_is_dir =3D d_is_dir(new_dentry);
=20
 		if (!(flags & RENAME_EXCHANGE))
-			error =3D may_delete(rd->new_mnt_idmap, new_dir,
+			error =3D may_delete(rd->mnt_idmap, new_dir,
 					   new_dentry, is_dir);
 		else
-			error =3D may_delete(rd->new_mnt_idmap, new_dir,
+			error =3D may_delete(rd->mnt_idmap, new_dir,
 					   new_dentry, new_is_dir);
 	}
 	if (error)
@@ -5446,13 +5446,13 @@ int vfs_rename(struct renamedata *rd)
 	 */
 	if (new_dir !=3D old_dir) {
 		if (is_dir) {
-			error =3D inode_permission(rd->old_mnt_idmap, source,
+			error =3D inode_permission(rd->mnt_idmap, source,
 						 MAY_WRITE);
 			if (error)
 				return error;
 		}
 		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
-			error =3D inode_permission(rd->new_mnt_idmap, target,
+			error =3D inode_permission(rd->mnt_idmap, target,
 						 MAY_WRITE);
 			if (error)
 				return error;
@@ -5520,7 +5520,7 @@ int vfs_rename(struct renamedata *rd)
 		if (error)
 			goto out;
 	}
-	error =3D old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
+	error =3D old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry,
 				      new_dir, new_dentry, flags);
 	if (error)
 		goto out;
@@ -5607,10 +5607,9 @@ int do_renameat2(int olddfd, struct filename *from, =
int newdfd,
=20
 retry_deleg:
 	rd.old_parent	   =3D old_path.dentry;
-	rd.old_mnt_idmap   =3D mnt_idmap(old_path.mnt);
+	rd.mnt_idmap	   =3D mnt_idmap(old_path.mnt);
 	rd.old_dentry	   =3D NULL;
 	rd.new_parent	   =3D new_path.dentry;
-	rd.new_mnt_idmap   =3D mnt_idmap(new_path.mnt);
 	rd.new_dentry	   =3D NULL;
 	rd.delegated_inode =3D &delegated_inode;
 	rd.flags	   =3D flags;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 98ab55ba3ced..5f3e99f956ca 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1943,10 +1943,9 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *f=
fhp, char *fname, int flen,
 		goto out_dput_old;
 	} else {
 		struct renamedata rd =3D {
-			.old_mnt_idmap	=3D &nop_mnt_idmap,
+			.mnt_idmap	=3D &nop_mnt_idmap,
 			.old_parent	=3D fdentry,
 			.old_dentry	=3D odentry,
-			.new_mnt_idmap	=3D &nop_mnt_idmap,
 			.new_parent	=3D tdentry,
 			.new_dentry	=3D ndentry,
 		};
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index bb0d7ded8e76..4f84abaa0d68 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -361,10 +361,9 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, st=
ruct dentry *olddir,
 {
 	int err;
 	struct renamedata rd =3D {
-		.old_mnt_idmap	=3D ovl_upper_mnt_idmap(ofs),
+		.mnt_idmap	=3D ovl_upper_mnt_idmap(ofs),
 		.old_parent	=3D olddir,
 		.old_dentry	=3D olddentry,
-		.new_mnt_idmap	=3D ovl_upper_mnt_idmap(ofs),
 		.new_parent	=3D newdir,
 		.new_dentry	=3D newdentry,
 		.flags		=3D flags,
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 04539037108c..07739055ac9f 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -770,10 +770,9 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const st=
ruct path *old_path,
 		goto out4;
 	}
=20
-	rd.old_mnt_idmap	=3D mnt_idmap(old_path->mnt),
+	rd.mnt_idmap		=3D mnt_idmap(old_path->mnt),
 	rd.old_parent		=3D old_parent,
 	rd.old_dentry		=3D old_child,
-	rd.new_mnt_idmap	=3D mnt_idmap(new_path.mnt),
 	rd.new_parent		=3D new_path.dentry,
 	rd.new_dentry		=3D new_dentry,
 	rd.flags		=3D flags,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b12203afa0da..172c3d703c43 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2008,11 +2008,10 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, =
struct dentry *,
=20
 /**
  * struct renamedata - contains all information required for renaming
- * @old_mnt_idmap:     idmap of the old mount the inode was found from
+ * @mnt_idmap:     idmap of the mount in which the rename is happening.
  * @old_parent:        parent of source
  * @old_dentry:                source
  * @old_last:          name for old_dentry in old_dir, if old_dentry not g=
iven
- * @new_mnt_idmap:     idmap of the new mount the inode was found from
  * @new_parent:        parent of destination
  * @new_dentry:                destination
  * @new_last:          name for new_dentry in new_dir, if new_dentry not g=
iven
@@ -2020,11 +2019,10 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, =
struct dentry *,
  * @flags:             rename flags
  */
 struct renamedata {
-	struct mnt_idmap *old_mnt_idmap;
+	struct mnt_idmap *mnt_idmap;
 	struct dentry *old_parent;
 	struct dentry *old_dentry;
 	struct qstr old_last;
-	struct mnt_idmap *new_mnt_idmap;
 	struct dentry *new_parent;
 	struct dentry *new_dentry;
 	struct qstr new_last;
--=20
2.50.0.107.gf914562f5916.dirty
From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1BF022DA748;
	Tue, 12 Aug 2025 23:53:02 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042785; cv=none;
 b=M6dOi1pshgtrnjE5SLKe8fDVYxbgu+LOf++Gav6duEohtsMQScDFBazZZ42DqrfQOheWtQqjD9OOCNAFK/DmFB6VxoEDPfx8fvksXQOlRirzivNRjabgdKTDL2QgdVZ0JfNPVtBDZstOX7ySAhWvRI/1JLXo+yuA3ER393bxm6M=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042785; c=relaxed/simple;
	bh=HLwZNSV4EuquBD4lmvIGOiNO4VtG3s2BTrwse/wYOcM=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=CMA50OLtucp/WznmF75/2I0WZEnfgvy/khCcIA0TT5dOBhVseTm2XOOhZm2zyiSsB+ZbZJSdGd5MvsjHTbb+w1hqRNqK1haXhui1vFwN0Bv4KKnrnP8NwIcslVyqFdOFkKx8o0QBNMdflPIcDFTEvz/fMtB7v/068eTzLTgZbSk=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynN-005Y27-QZ;
	Tue, 12 Aug 2025 23:52:47 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 07/11] VFS: Change vfs_mkdir() to unlock on failure.
Date: Tue, 12 Aug 2025 12:25:10 +1000
Message-ID: <20250812235228.3072318-8-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Proposed changes to directory-op locking will lock the dentry rather
than the whole directory.  So the dentry will need to be unlocked.

vfs_mkdir() consumes the dentry on error, so there will be no dentry to
be unlocked.

So this patch changes vfs_mkdir() to unlock on error as well as
releasing the dentry.  This requires various other functions in various
callers to also unlock on error - particularly in nfsd and overlayfs.

At present this results in some clumsy code.  Once the transition to
dentry locking is complete the clumsiness will be gone.

Callers of vfs_mkdir() in ecrypytfs, nfsd, xfs, cachefiles, and
overlayfs are changed to make the new behaviour.

The usage in smb/server does not need any direct change as the change
to done_path_create() is sufficient.

Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/cachefiles/namei.c    |  9 +++++----
 fs/ecryptfs/inode.c      |  3 ++-
 fs/namei.c               | 15 ++++++++++-----
 fs/nfsd/nfs4recover.c    | 12 +++++-------
 fs/nfsd/vfs.c            | 12 ++++++++++--
 fs/overlayfs/dir.c       | 17 ++++++++---------
 fs/overlayfs/overlayfs.h |  1 +
 fs/overlayfs/super.c     |  7 +++++--
 fs/xfs/scrub/orphanage.c |  2 +-
 9 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index d1edb2ac3837..732d78911bed 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -131,8 +131,11 @@ struct dentry *cachefiles_get_directory(struct cachefi=
les_cache *cache,
 		ret =3D cachefiles_inject_write_error();
 		if (ret =3D=3D 0)
 			subdir =3D vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
-		else
+		else {
+			/* vfs_mkdir() unlocks on failure so we must too */
+			inode_unlock(d_inode(dir));
 			subdir =3D ERR_PTR(ret);
+		}
 		if (IS_ERR(subdir)) {
 			trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
 						   cachefiles_trace_mkdir_error);
@@ -196,9 +199,7 @@ struct dentry *cachefiles_get_directory(struct cachefil=
es_cache *cache,
 	return ERR_PTR(-EBUSY);
=20
 mkdir_error:
-	inode_unlock(d_inode(dir));
-	if (!IS_ERR(subdir))
-		dput(subdir);
+	done_dentry_lookup(subdir);
 	pr_err("mkdir %s failed with error %d\n", dirname, ret);
 	return ERR_PTR(ret);
=20
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index abd954c6a14e..5d8cb042aa57 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *=
idmap, struct inode *dir,
 				 lower_dentry, mode);
 	rc =3D PTR_ERR(lower_dentry);
 	if (IS_ERR(lower_dentry))
-		goto out;
+		goto out_unlocked;
 	rc =3D 0;
 	if (d_unhashed(lower_dentry))
 		goto out;
@@ -532,6 +532,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *=
idmap, struct inode *dir,
 	set_nlink(dir, lower_dir->i_nlink);
 out:
 	inode_unlock(lower_dir);
+out_unlocked:
 	if (d_really_is_negative(dentry))
 		d_drop(dentry);
 	return ERR_PTR(rc);
diff --git a/fs/namei.c b/fs/namei.c
index 3f930811e952..fb075573157a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1787,6 +1787,9 @@ static struct dentry *__dentry_lookup(struct qstr *la=
st,
  * @last: the name in the given directory
  * @base: the directory in which the name is to be found
  * @lookup_flags: %LOOKUP_xxx flags
+ * If the dentry is an error - as can happen after vfs_mkdir() -
+ * the unlock is skipped as unneeded.
+ *
  *
  * The name is looked up and necessary locks are taken so that
  * the name can be created or removed.
@@ -1921,6 +1924,9 @@ EXPORT_SYMBOL(dentry_lookup_continue);
  * @dentry is not an error, the lock and the reference to the dentry
  * are dropped.
  *
+ * If the dentry is an error - as can happen after vfs_mkdir() -
+ * the unlock is skipped as unneeded.
+ *
  * This interface allows a smooth transition from parent-dir based
  * locking to dentry based locking.
  *
@@ -4570,9 +4576,7 @@ EXPORT_SYMBOL(kern_path_create);
=20
 void done_path_create(struct path *path, struct dentry *dentry)
 {
-	if (!IS_ERR(dentry))
-		dput(dentry);
-	inode_unlock(path->dentry->d_inode);
+	done_dentry_lookup(dentry);
 	mnt_drop_write(path->mnt);
 	path_put(path);
 }
@@ -4735,7 +4739,8 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename,=
 umode_t, mode, unsigned, d
  * negative or unhashes it and possibly splices a different one returning =
it,
  * the original dentry is dput() and the alternate is returned.
  *
- * In case of an error the dentry is dput() and an ERR_PTR() is returned.
+ * In case of an error the dentry is dput(), the parent is unlocked, and
+ * an ERR_PTR() is returned.
  */
 struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 			 struct dentry *dentry, umode_t mode)
@@ -4773,7 +4778,7 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, str=
uct inode *dir,
 	return dentry;
=20
 err:
-	dput(dentry);
+	done_dentry_lookup(dentry);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL(vfs_mkdir);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 2231192ec33f..19f5bc5586bb 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -222,7 +222,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 	dentry =3D lookup_one(&nop_mnt_idmap, &QSTR(dname), dir);
 	if (IS_ERR(dentry)) {
 		status =3D PTR_ERR(dentry);
-		goto out_unlock;
+		inode_unlock(d_inode(dir));
+		goto out;
 	}
 	if (d_really_is_positive(dentry))
 		/*
@@ -233,15 +234,12 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		 * In the 4.0 case, we should never get here; but we may
 		 * as well be forgiving and just succeed silently.
 		 */
-		goto out_put;
+		goto out;
 	dentry =3D vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
 	if (IS_ERR(dentry))
 		status =3D PTR_ERR(dentry);
-out_put:
-	if (!status)
-		dput(dentry);
-out_unlock:
-	inode_unlock(d_inode(dir));
+out:
+	done_dentry_lookup(dentry);
 	if (status =3D=3D 0) {
 		if (nn->in_grace)
 			__nfsd4_create_reclaim_record_grace(clp, dname,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5f3e99f956ca..a13e982e5b91 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1492,7 +1492,9 @@ nfsd_check_ignore_resizing(struct iattr *iap)
 		iap->ia_valid &=3D ~ATTR_SIZE;
 }
=20
-/* The parent directory should already be locked: */
+/* The parent directory should already be locked.  The lock
+ * will be dropped on error.
+ */
 __be32
 nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		   struct nfsd_attrs *attrs,
@@ -1558,8 +1560,11 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct sv=
c_fh *fhp,
 	err =3D nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
=20
 out:
-	if (!IS_ERR(dchild))
+	if (!IS_ERR(dchild)) {
+		if (err)
+			inode_unlock(dirp);
 		dput(dchild);
+	}
 	return err;
=20
 out_nfserr:
@@ -1616,6 +1621,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fh=
p,
 	if (err !=3D nfs_ok)
 		goto out_unlock;
 	err =3D nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp);
+	if (err)
+		/* lock will have been dropped */
+		return err;
 	fh_fill_post_attrs(fhp);
 out_unlock:
 	inode_unlock(dentry->d_inode);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 70b8687dc45e..24f7e28b9a4f 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -162,14 +162,18 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, stru=
ct dentry *dir,
 	goto out;
 }
=20
+/* dir will be unlocked on return */
 struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
-			       struct dentry *newdentry, struct ovl_cattr *attr)
+			       struct dentry *newdentry_arg, struct ovl_cattr *attr)
 {
 	struct inode *dir =3D parent->d_inode;
+	struct dentry *newdentry __free(dentry_lookup) =3D newdentry_arg;
 	int err;
=20
-	if (IS_ERR(newdentry))
+	if (IS_ERR(newdentry)) {
+		inode_unlock(dir);
 		return newdentry;
+	}
=20
 	err =3D -ESTALE;
 	if (newdentry->d_inode)
@@ -213,12 +217,9 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, str=
uct dentry *parent,
 		err =3D -EIO;
 	}
 out:
-	if (err) {
-		if (!IS_ERR(newdentry))
-			dput(newdentry);
+	if (err)
 		return ERR_PTR(err);
-	}
-	return newdentry;
+	return dget(newdentry);
 }
=20
 struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
@@ -228,7 +229,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, stru=
ct dentry *workdir,
 	inode_lock(workdir->d_inode);
 	ret =3D ovl_create_real(ofs, workdir,
 			      ovl_lookup_temp(ofs, workdir), attr);
-	inode_unlock(workdir->d_inode);
 	return ret;
 }
=20
@@ -336,7 +336,6 @@ static int ovl_create_upper(struct dentry *dentry, stru=
ct inode *inode,
 				    ovl_lookup_upper(ofs, dentry->d_name.name,
 						     upperdir, dentry->d_name.len),
 				    attr);
-	inode_unlock(udir);
 	if (IS_ERR(newdentry))
 		return PTR_ERR(newdentry);
=20
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 4f84abaa0d68..238c26142318 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -250,6 +250,7 @@ static inline struct dentry *ovl_do_mkdir(struct ovl_fs=
 *ofs,
=20
 	ret =3D vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
 	pr_debug("mkdir(%pd2, 0%o) =3D %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret));
+	/* Note: dir will have been unlocked on failure */
 	return ret;
 }
=20
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index df85a76597e9..5a4b0a05139c 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -328,11 +328,13 @@ static struct dentry *ovl_workdir_create(struct ovl_f=
s *ofs,
 		}
=20
 		work =3D ovl_do_mkdir(ofs, dir, work, attr.ia_mode);
-		inode_unlock(dir);
 		err =3D PTR_ERR(work);
 		if (IS_ERR(work))
 			goto out_err;
=20
+		dget(work); /* Need to return this */
+
+		done_dentry_lookup(work);
 		/* Weird filesystem returning with hashed negative (kernfs)? */
 		err =3D -EINVAL;
 		if (d_really_is_negative(work))
@@ -623,7 +625,8 @@ static struct dentry *ovl_lookup_or_create(struct ovl_f=
s *ofs,
 	child =3D ovl_lookup_upper(ofs, name, parent, len);
 	if (!IS_ERR(child) && !child->d_inode)
 		child =3D ovl_create_real(ofs, parent, child, OVL_CATTR(mode));
-	inode_unlock(parent->d_inode);
+	else
+		inode_unlock(parent->d_inode);
 	dput(parent);
=20
 	return child;
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index 9c12cb844231..c95bded4e8a7 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -170,7 +170,7 @@ xrep_orphanage_create(
 					     orphanage_dentry, 0750);
 		error =3D PTR_ERR(orphanage_dentry);
 		if (IS_ERR(orphanage_dentry))
-			goto out_unlock_root;
+			goto out_dput_root;
 	}
=20
 	/* Not a directory? Bail out. */
--=20
2.50.0.107.gf914562f5916.dirty
From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id EDFE52D7381;
	Tue, 12 Aug 2025 23:53:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042784; cv=none;
 b=ccwmdm9jKNLcfzDJgZF5lUQv2oJV7v/+eFZ4suQkahlIeH4Nmd+bNH5C7apjo+epOHur+/wLsgQQDM+LqW+Z3R+ZSR2KzMUq7TpPP1sGm1ea+3R9bKWbR/yiMMFXzgqg3qM6Kr3oEL9bwXm2upG3It2MOdpW2nIcXj9jb6IA4fM=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042784; c=relaxed/simple;
	bh=8H0TpV7kB+G3cWPZv8G4Hm2H3l6DX+JGvz2rgwlSirA=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=WY9Gmzt0p62BKOQThcSdgUfqy0QdeMsaXZ7xfpTPZOfpMzs7bl23Q6X+Ya+9DpUWnK7WQuWvvzOLd8WOhjUBmc4dnwFmB4DoM634k/hqyGhKFFiq1HjTgtp+v3aJth6BfwZN24rwi8w3d+yiHJt+thIGKg7tMQ9XlRUMcxQ4UOU=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynO-005Y29-KL;
	Tue, 12 Aug 2025 23:52:48 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 08/11] VFS: allow d_splice_alias() and d_add() to work on
 hashed dentries.
Date: Tue, 12 Aug 2025 12:25:11 +1000
Message-ID: <20250812235228.3072318-9-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Proposed locking changes will require a dentry to remain hashed during
all directory operations which are currently protected by i_rwsem, or
for there to be a controlled transition from one hashed dentry to
another which maintains the lock - which will then be on the dentry.

The current practice of dropping (unhashing) a dentry before calling
d_splice_alias() and d_add() defeats this need.

This patch changes d_splice_alias() and d_add() to accept a hashed
dentry and to only drop it when necessary immediately before an
alternate dentry is hashed.  These functions will, in a subsequent patch,
transfer the dentry locking across so that the name remains locked in
the directory.

Signed-off-by: NeilBrown <neil@brown.name>
---
 Documentation/filesystems/vfs.rst | 4 ++--
 fs/ceph/file.c                    | 2 --
 fs/ceph/inode.c                   | 3 ---
 fs/dcache.c                       | 8 +++++---
 fs/fuse/dir.c                     | 1 -
 fs/hostfs/hostfs_kern.c           | 1 -
 fs/nfs/dir.c                      | 5 +----
 fs/nfs/nfs4proc.c                 | 1 -
 fs/smb/client/dir.c               | 1 -
 9 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/=
vfs.rst
index 486a91633474..642dd6afb139 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -580,8 +580,8 @@ otherwise noted.
 	dentry before the first mkdir returns.
=20
 	If there is any chance this could happen, then the new inode
-	should be d_drop()ed and attached with d_splice_alias().  The
-	returned dentry (if any) should be returned by ->mkdir().
+	should be attached with d_splice_alias().  The returned dentry
+	(if any) should be returned by ->mkdir().
=20
 ``rmdir``
 	called by the rmdir(2) system call.  Only required if you want
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c02f100f8552..27eb1ac06177 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -755,8 +755,6 @@ static int ceph_finish_async_create(struct inode *dir, =
struct inode *inode,
 			unlock_new_inode(inode);
 		}
 		if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
-			if (!d_unhashed(dentry))
-				d_drop(dentry);
 			dn =3D d_splice_alias(inode, dentry);
 			WARN_ON_ONCE(dn && dn !=3D dentry);
 		}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fc543075b827..7acd6ac0d50f 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1480,9 +1480,6 @@ static int splice_dentry(struct dentry **pdn, struct =
inode *in)
 		}
 	}
=20
-	/* dn must be unhashed */
-	if (!d_unhashed(dn))
-		d_drop(dn);
 	realdn =3D d_splice_alias(in, dn);
 	if (IS_ERR(realdn)) {
 		pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n",
diff --git a/fs/dcache.c b/fs/dcache.c
index 60046ae23d51..0db256098adb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2709,7 +2709,11 @@ static inline void __d_add(struct dentry *dentry, st=
ruct inode *inode,
 		raw_write_seqcount_end(&dentry->d_seq);
 		fsnotify_update_flags(dentry);
 	}
-	__d_rehash(dentry);
+	if (d_unhashed(dentry))
+		__d_rehash(dentry);
+	else if (inode && (dentry->d_flags &
+			   (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) =3D=3D DCACHE_LRU_LIST)
+		this_cpu_dec(nr_dentry_negative);
 	if (dir)
 		end_dir_add(dir, n, d_wait);
 	spin_unlock(&dentry->d_lock);
@@ -2990,8 +2994,6 @@ struct dentry *d_splice_alias_ops(struct inode *inode=
, struct dentry *dentry,
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
=20
-	BUG_ON(!d_unhashed(dentry));
-
 	if (!inode)
 		goto out;
=20
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2d817d7cab26..60e7763da8c8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -834,7 +834,6 @@ static struct dentry *create_new_entry(struct mnt_idmap=
 *idmap, struct fuse_moun
 	}
 	kfree(forget);
=20
-	d_drop(entry);
 	d =3D d_splice_alias(inode, entry);
 	if (IS_ERR(d))
 		return d;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 01e516175bcd..8e51fc623301 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -700,7 +700,6 @@ static struct dentry *hostfs_mkdir(struct mnt_idmap *id=
map, struct inode *ino,
 		dentry =3D ERR_PTR(err);
 	} else {
 		inode =3D hostfs_iget(dentry->d_sb, file);
-		d_drop(dentry);
 		dentry =3D d_splice_alias(inode, dentry);
 	}
 	__putname(file);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d81217923936..250a826d5480 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2136,7 +2136,6 @@ int nfs_atomic_open(struct inode *dir, struct dentry =
*dentry,
 		err =3D PTR_ERR(inode);
 		trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
 		put_nfs_open_context(ctx);
-		d_drop(dentry);
 		switch (err) {
 		case -ENOENT:
 			d_splice_alias(NULL, dentry);
@@ -2157,6 +2156,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry =
*dentry,
 		default:
 			break;
 		}
+		d_drop(dentry);
 		goto out;
 	}
 	file->f_mode |=3D FMODE_CAN_ODIRECT;
@@ -2304,8 +2304,6 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_f=
h *fhandle,
 	struct dentry *d;
 	int error;
=20
-	d_drop(dentry);
-
 	if (fhandle->size =3D=3D 0) {
 		error =3D NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name,
 					       fhandle, fattr);
@@ -2652,7 +2650,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir=
, struct dentry *dentry)
 		old_dentry, dentry);
=20
 	trace_nfs_link_enter(inode, dir, dentry);
-	d_drop(dentry);
 	if (S_ISREG(inode->i_mode))
 		nfs_sync_inode(inode);
 	error =3D NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7d2b67e06cc3..d8739c286a99 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3175,7 +3175,6 @@ static int _nfs4_open_and_get_state(struct nfs4_opend=
ata *opendata,
 	dentry =3D opendata->dentry;
 	if (d_really_is_negative(dentry)) {
 		struct dentry *alias;
-		d_drop(dentry);
 		alias =3D d_splice_alias(igrab(state->inode), dentry);
 		/* d_splice_alias() can't fail here - it's a non-directory */
 		if (alias) {
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 5223edf6d11a..8cbc284c5005 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -439,7 +439,6 @@ static int cifs_do_create(struct inode *inode, struct d=
entry *direntry, unsigned
 			goto out_err;
 		}
=20
-	d_drop(direntry);
 	d_add(direntry, newinode);
=20
 out:
--=20
2.50.0.107.gf914562f5916.dirty
From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 14E272D878A;
	Tue, 12 Aug 2025 23:53:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042786; cv=none;
 b=YGwLAwUgbYQHww2qPQQXyfEJeWKRsShnb7Zlx2OCGTgW14Omr/6P/EheDCEHFoGs2DWagtdc5Bc79BpQBRj3vi+ZJYnWlgrLujb/Th9wdKDcqm5Fky7GGpsAORNiWQlc1BEdVF+tF9gTyIlPWNtorysda3T43wqWf8lDExraXLQ=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042786; c=relaxed/simple;
	bh=YEbDMsfF/nlYvvDCki2jzQZCDr4K7nmCD6F0mff6Lz0=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=b9NEoIHrXTSeCTFtd8yp2BUK1T2IGoDXNhhs6ekZ/kTf9GCOGdV7v+WSzAniWWYD9x6Q5w6fE0Jhg8zXZV62GWdqARMmOluCvWBtmHKUlaIyTQsh2fhf0K02xFqbgRrYEqqHtFOzavjxUPALsdisZRS/jD2apPtBqPXE1diKn78=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynP-005Y2B-Eh;
	Tue, 12 Aug 2025 23:52:49 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 09/11] VFS: use global wait-queue table for d_alloc_parallel()
Date: Tue, 12 Aug 2025 12:25:12 +1000
Message-ID: <20250812235228.3072318-10-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

d_alloc_parallel() currently requires a wait_queue_head to be passed in.
This must have a life time which extends until the lookup is completed.

Future proposed patches will use d_alloc_parallel() for names being
created/unlinked etc.  Some filesystems combine lookup with create
making a longer code path that the wq needs to live for.  If it is still
to be allocated on-stack this can be cumbersome.

This patch replaces the on-stack wqs with a global array of wqs which
are used as needed.  A wq is NOT allocated when a dentry is first
created but only when a second thread attempts to use the same name and
so is forced to wait.  At this moment a wq is chosen using a hash of the
dentry pointer and that wq is assigned to ->d_wait.  The ->d_lock is
then dropped and the task waits.

When the dentry is finally moved out of "in_lookup" a wake up is only
sent if ->d_wait is not NULL.  This avoids an (uncontended) spin
lock/unlock which saves a couple of atomic operations in a common case.

The wake up passes the dentry that the wake up is for as the "key" and
the waiter will only wake processes waiting on the same key.  This means
that when these global waitqueues are shared (which is inevitable
though unlikely to be frequent), a task will not be woken prematurely.

Signed-off-by: NeilBrown <neil@brown.name>
---
 Documentation/filesystems/porting.rst |  6 +++
 fs/afs/dir_silly.c                    |  4 +-
 fs/dcache.c                           | 77 ++++++++++++++++++++++-----
 fs/fuse/readdir.c                     |  3 +-
 fs/namei.c                            |  6 +--
 fs/nfs/dir.c                          |  6 +--
 fs/nfs/unlink.c                       |  3 +-
 fs/proc/base.c                        |  3 +-
 fs/proc/proc_sysctl.c                 |  3 +-
 fs/smb/client/readdir.c               |  3 +-
 include/linux/dcache.h                |  3 +-
 include/linux/nfs_xdr.h               |  1 -
 12 files changed, 80 insertions(+), 38 deletions(-)

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesyst=
ems/porting.rst
index 85f590254f07..e4a326e8fa4c 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1285,3 +1285,9 @@ rather than a VMA, as the VMA at this stage is not ye=
t valid.
 The vm_area_desc provides the minimum required information for a filesystem
 to initialise state upon memory mapping of a file-backed region, and output
 parameters for the file system to set this state.
+---
+
+** mandatory**
+
+d_alloc_parallel() no longer requires a waitqueue_head.  It uses one
+from an internal table when needed.
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 0b80eb93fa40..ce76b3b30850 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -237,13 +237,11 @@ int afs_silly_iput(struct dentry *dentry, struct inod=
e *inode)
 	struct dentry *alias;
 	int ret;
=20
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
-
 	_enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode);
=20
 	down_read(&dvnode->rmdir_lock);
=20
-	alias =3D d_alloc_parallel(dentry->d_parent, &dentry->d_name, &wq);
+	alias =3D d_alloc_parallel(dentry->d_parent, &dentry->d_name);
 	if (IS_ERR(alias)) {
 		up_read(&dvnode->rmdir_lock);
 		return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 0db256098adb..5473d906783e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2137,8 +2137,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct=
 inode *inode,
 		return found;
 	}
 	if (d_in_lookup(dentry)) {
-		found =3D d_alloc_parallel(dentry->d_parent, name,
-					dentry->d_wait);
+		found =3D d_alloc_parallel(dentry->d_parent, name);
 		if (IS_ERR(found) || !d_in_lookup(found)) {
 			iput(inode);
 			return found;
@@ -2148,7 +2147,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct=
 inode *inode,
 		if (!found) {
 			iput(inode);
 			return ERR_PTR(-ENOMEM);
-		}=20
+		}
 	}
 	res =3D d_splice_alias(inode, found);
 	if (res) {
@@ -2505,6 +2504,46 @@ void d_rehash(struct dentry * entry)
 }
 EXPORT_SYMBOL(d_rehash);
=20
+#define	PAR_LOOKUP_WQ_BITS	8
+#define PAR_LOOKUP_WQS (1 << PAR_LOOKUP_WQ_BITS)
+static wait_queue_head_t par_wait_table[PAR_LOOKUP_WQS] __cacheline_aligne=
d;
+
+static int __init par_wait_init(void)
+{
+	int i;
+
+	for (i =3D 0; i < PAR_LOOKUP_WQS; i++)
+		init_waitqueue_head(&par_wait_table[i]);
+	return 0;
+}
+fs_initcall(par_wait_init);
+
+struct par_wait_key {
+	struct dentry *de;
+	struct wait_queue_entry wqe;
+};
+
+static int d_wait_wake_fn(struct wait_queue_entry *wq_entry,
+			  unsigned mode, int sync, void *key)
+{
+	struct par_wait_key *pwk =3D container_of(wq_entry,
+						 struct par_wait_key, wqe);
+	if (pwk->de =3D=3D key)
+		return default_wake_function(wq_entry, mode, sync, key);
+	return 0;
+}
+
+static inline void d_wake_waiters(struct wait_queue_head *d_wait,
+				  struct dentry *dentry)
+{
+	/* ->d_wait is only set if some thread is actually waiting.
+	 * If we find it is NULL - the common case - then there was no
+	 * contention and there are no waiters to be woken.
+	 */
+	if (d_wait)
+		__wake_up(d_wait, TASK_NORMAL, 0, dentry);
+}
+
 static inline unsigned start_dir_add(struct inode *dir)
 {
 	preempt_disable_nested();
@@ -2517,31 +2556,41 @@ static inline unsigned start_dir_add(struct inode *=
dir)
 }
=20
 static inline void end_dir_add(struct inode *dir, unsigned int n,
-			       wait_queue_head_t *d_wait)
+			       wait_queue_head_t *d_wait, struct dentry *de)
 {
 	smp_store_release(&dir->i_dir_seq, n + 2);
 	preempt_enable_nested();
-	if (wq_has_sleeper(d_wait))
-		wake_up_all(d_wait);
+	d_wake_waiters(d_wait, de);
 }
=20
 static void d_wait_lookup(struct dentry *dentry)
 {
 	if (d_in_lookup(dentry)) {
-		DECLARE_WAITQUEUE(wait, current);
-		add_wait_queue(dentry->d_wait, &wait);
+		struct par_wait_key wk =3D {
+			.de =3D dentry,
+			.wqe =3D {
+				.private =3D current,
+				.func =3D d_wait_wake_fn,
+			},
+		};
+		struct wait_queue_head *wq;
+		if (!dentry->d_wait)
+			dentry->d_wait =3D &par_wait_table[hash_ptr(dentry,
+								  PAR_LOOKUP_WQ_BITS)];
+		wq =3D dentry->d_wait;
+		add_wait_queue(wq, &wk.wqe);
 		do {
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			spin_unlock(&dentry->d_lock);
 			schedule();
 			spin_lock(&dentry->d_lock);
 		} while (d_in_lookup(dentry));
+		remove_wait_queue(wq, &wk.wqe);
 	}
 }
=20
 struct dentry *d_alloc_parallel(struct dentry *parent,
-				const struct qstr *name,
-				wait_queue_head_t *wq)
+				const struct qstr *name)
 {
 	unsigned int hash =3D name->hash;
 	struct hlist_bl_head *b =3D in_lookup_hash(parent, hash);
@@ -2554,6 +2603,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
 		return ERR_PTR(-ENOMEM);
=20
 	new->d_flags |=3D DCACHE_PAR_LOOKUP;
+	new->d_wait =3D NULL;
 	spin_lock(&parent->d_lock);
 	new->d_parent =3D dget_dlock(parent);
 	hlist_add_head(&new->d_sib, &parent->d_children);
@@ -2642,7 +2692,6 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
 		return dentry;
 	}
 	rcu_read_unlock();
-	new->d_wait =3D wq;
 	hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b);
 	hlist_bl_unlock(b);
 	return new;
@@ -2680,7 +2729,7 @@ static wait_queue_head_t *__d_lookup_unhash(struct de=
ntry *dentry)
 void __d_lookup_unhash_wake(struct dentry *dentry)
 {
 	spin_lock(&dentry->d_lock);
-	wake_up_all(__d_lookup_unhash(dentry));
+	d_wake_waiters(__d_lookup_unhash(dentry), dentry);
 	spin_unlock(&dentry->d_lock);
 }
 EXPORT_SYMBOL(__d_lookup_unhash_wake);
@@ -2715,7 +2764,7 @@ static inline void __d_add(struct dentry *dentry, str=
uct inode *inode,
 			   (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) =3D=3D DCACHE_LRU_LIST)
 		this_cpu_dec(nr_dentry_negative);
 	if (dir)
-		end_dir_add(dir, n, d_wait);
+		end_dir_add(dir, n, d_wait, dentry);
 	spin_unlock(&dentry->d_lock);
 	if (inode)
 		spin_unlock(&inode->i_lock);
@@ -2881,7 +2930,7 @@ static void __d_move(struct dentry *dentry, struct de=
ntry *target,
 	write_seqcount_end(&dentry->d_seq);
=20
 	if (dir)
-		end_dir_add(dir, n, d_wait);
+		end_dir_add(dir, n, d_wait, target);
=20
 	if (dentry->d_parent !=3D old_parent)
 		spin_unlock(&dentry->d_parent->d_lock);
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index c2aae2eef086..f588252891af 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -160,7 +160,6 @@ static int fuse_direntplus_link(struct file *file,
 	struct inode *dir =3D d_inode(parent);
 	struct fuse_conn *fc;
 	struct inode *inode;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 	int epoch;
=20
 	if (!o->nodeid) {
@@ -197,7 +196,7 @@ static int fuse_direntplus_link(struct file *file,
 	dentry =3D d_lookup(parent, &name);
 	if (!dentry) {
 retry:
-		dentry =3D d_alloc_parallel(parent, &name, &wq);
+		dentry =3D d_alloc_parallel(parent, &name);
 		if (IS_ERR(dentry))
 			return PTR_ERR(dentry);
 	}
diff --git a/fs/namei.c b/fs/namei.c
index fb075573157a..2c98672fdb6a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2012,13 +2012,12 @@ static struct dentry *__lookup_slow(const struct qs=
tr *name,
 {
 	struct dentry *dentry, *old;
 	struct inode *inode =3D dir->d_inode;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
=20
 	/* Don't go there if it's already dead */
 	if (unlikely(IS_DEADDIR(inode)))
 		return ERR_PTR(-ENOENT);
 again:
-	dentry =3D d_alloc_parallel(dir, name, &wq);
+	dentry =3D d_alloc_parallel(dir, name);
 	if (IS_ERR(dentry))
 		return dentry;
 	if (unlikely(!d_in_lookup(dentry))) {
@@ -4028,7 +4027,6 @@ static struct dentry *lookup_open(struct nameidata *n=
d, struct file *file,
 	struct dentry *dentry;
 	int error, create_error =3D 0;
 	umode_t mode =3D op->mode;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
=20
 	if (unlikely(IS_DEADDIR(dir_inode)))
 		return ERR_PTR(-ENOENT);
@@ -4037,7 +4035,7 @@ static struct dentry *lookup_open(struct nameidata *n=
d, struct file *file,
 	dentry =3D d_lookup(dir, &nd->last);
 	for (;;) {
 		if (!dentry) {
-			dentry =3D d_alloc_parallel(dir, &nd->last, &wq);
+			dentry =3D d_alloc_parallel(dir, &nd->last);
 			if (IS_ERR(dentry))
 				return dentry;
 		}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 250a826d5480..bbeb24805a0e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -727,7 +727,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs=
_entry *entry,
 		unsigned long dir_verifier)
 {
 	struct qstr filename =3D QSTR_INIT(entry->name, entry->len);
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 	struct dentry *dentry;
 	struct dentry *alias;
 	struct inode *inode;
@@ -756,7 +755,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs=
_entry *entry,
 	dentry =3D d_lookup(parent, &filename);
 again:
 	if (!dentry) {
-		dentry =3D d_alloc_parallel(parent, &filename, &wq);
+		dentry =3D d_alloc_parallel(parent, &filename);
 		if (IS_ERR(dentry))
 			return;
 	}
@@ -2060,7 +2059,6 @@ int nfs_atomic_open(struct inode *dir, struct dentry =
*dentry,
 		    struct file *file, unsigned open_flags,
 		    umode_t mode)
 {
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 	struct nfs_open_context *ctx;
 	struct dentry *res;
 	struct iattr attr =3D { .ia_valid =3D ATTR_OPEN };
@@ -2116,7 +2114,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry =
*dentry,
 		d_drop(dentry);
 		switched =3D true;
 		dentry =3D d_alloc_parallel(dentry->d_parent,
-					  &dentry->d_name, &wq);
+					  &dentry->d_name);
 		if (IS_ERR(dentry))
 			return PTR_ERR(dentry);
 		if (unlikely(!d_in_lookup(dentry)))
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index b55467911648..894af85830fa 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -124,7 +124,7 @@ static int nfs_call_unlink(struct dentry *dentry, struc=
t inode *inode, struct nf
 	struct dentry *alias;
=20
 	down_read_non_owner(&NFS_I(dir)->rmdir_sem);
-	alias =3D d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
+	alias =3D d_alloc_parallel(dentry->d_parent, &data->args.name);
 	if (IS_ERR(alias)) {
 		up_read_non_owner(&NFS_I(dir)->rmdir_sem);
 		return 0;
@@ -185,7 +185,6 @@ nfs_async_unlink(struct dentry *dentry, const struct qs=
tr *name)
=20
 	data->cred =3D get_current_cred();
 	data->res.dir_attr =3D &data->dir_attr;
-	init_waitqueue_head(&data->wq);
=20
 	status =3D -EBUSY;
 	spin_lock(&dentry->d_lock);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 62d35631ba8c..0b296c94000e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2129,8 +2129,7 @@ bool proc_fill_cache(struct file *file, struct dir_co=
ntext *ctx,
=20
 	child =3D try_lookup_noperm(&qname, dir);
 	if (!child) {
-		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
-		child =3D d_alloc_parallel(dir, &qname, &wq);
+		child =3D d_alloc_parallel(dir, &qname);
 		if (IS_ERR(child))
 			goto end_instantiate;
 		if (d_in_lookup(child)) {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 49ab74e0bfde..04a382178c65 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -692,8 +692,7 @@ static bool proc_sys_fill_cache(struct file *file,
=20
 	child =3D d_lookup(dir, &qname);
 	if (!child) {
-		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
-		child =3D d_alloc_parallel(dir, &qname, &wq);
+		child =3D d_alloc_parallel(dir, &qname);
 		if (IS_ERR(child))
 			return false;
 		if (d_in_lookup(child)) {
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 4e5460206397..5a92a1ad317d 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -74,7 +74,6 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *nam=
e,
 	struct cifs_sb_info *cifs_sb =3D CIFS_SB(sb);
 	bool posix =3D cifs_sb_master_tcon(cifs_sb)->posix_extensions;
 	bool reparse_need_reval =3D false;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 	int rc;
=20
 	cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
@@ -106,7 +105,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *n=
ame,
 		    (fattr->cf_flags & CIFS_FATTR_NEED_REVAL))
 			return;
=20
-		dentry =3D d_alloc_parallel(parent, name, &wq);
+		dentry =3D d_alloc_parallel(parent, name);
 	}
 	if (IS_ERR(dentry))
 		return;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 5d53489e5556..996259d1bc88 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -241,8 +241,7 @@ extern void d_delete(struct dentry *);
 /* allocate/de-allocate */
 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
 extern struct dentry * d_alloc_anon(struct super_block *);
-extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr=
 *,
-					wait_queue_head_t *);
+extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr=
 *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 /* weird procfs mess; *NOT* exported */
 extern struct dentry * d_splice_alias_ops(struct inode *, struct dentry *,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index ac4bff6e9913..197c9b30dfdf 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1735,7 +1735,6 @@ struct nfs_unlinkdata {
 	struct nfs_removeargs args;
 	struct nfs_removeres res;
 	struct dentry *dentry;
-	wait_queue_head_t wq;
 	const struct cred *cred;
 	struct nfs_fattr dir_attr;
 	long timeout;
--=20
2.50.0.107.gf914562f5916.dirty
From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id DFE902D6417;
	Tue, 12 Aug 2025 23:53:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042784; cv=none;
 b=dAziRD8LyXXnRJQj1klx2NSfcGpOb6fAAJ12RSPzjJtcq3SnHArEBx1lw+djbN0yRk5cWlg/gzCwDTqLfbw7h9juXF1EtapoEoGyVPHCv7kzgGSTa0mFQ0bYAZMe3Jn11SMPId1ghP/zXlH+0pnH0GtStByjcAweAK9aJDv7m2A=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042784; c=relaxed/simple;
	bh=aU9DH3a24782eit3wuuN3qUFohwYCIfssw2h5DYMqhw=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=PZS4VWv8+LH53nXcRq7Fa5YW6o+6ueeIrsbwnpj1tLmx9fZ2uaM2gaYGNWqK6YPQwHxVD103RHwR1YXXGpg1D27guA3T/Rstz9hDqNmXDxMSnsnSrLBkKPG0WZmyK3AjpX+nZoOh1FpLjr8sNFhB7r+o/1p7SyiC1YcvG/9iLx0=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynQ-005Y2D-85;
	Tue, 12 Aug 2025 23:52:49 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 10/11] VFS: use d_alloc_parallel() in lookup_one_qstr_excl().
Date: Tue, 12 Aug 2025 12:25:13 +1000
Message-ID: <20250812235228.3072318-11-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

lookup_one_qstr_excl() is used for lookups prior to directory
modifications, whether create, unlink, rename, or whatever.

To prepare for allowing modification to happen in parallel, change
lookup_one_qstr_excl() to use d_alloc_parallel().

As a result, ->lookup is now only ever called with a d_in_lookup()
dentry.  Consequently we can remove the d_in_lookup() check from
d_add_ci() which is only used in ->lookup.

If LOOKUP_EXCL or LOOKUP_RENAME_TARGET is passed, the caller must ensure
d_lookup_done() is called at an appropriate time, and must not assume
that it can test for positive or negative dentries without confirming
that the dentry is no longer d_in_lookup() - unless it is filesystem
code acting on itself and *knows* that ->lookup() always completes the
lookup (currently true for all filesystems other than NFS).

Signed-off-by: NeilBrown <neil@brown.name>
---
 Documentation/filesystems/porting.rst | 14 +++++++++
 fs/dcache.c                           | 16 +++-------
 fs/namei.c                            | 45 +++++++++++++++++++++------
 3 files changed, 53 insertions(+), 22 deletions(-)

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesyst=
ems/porting.rst
index e4a326e8fa4c..c9210d3bd313 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1291,3 +1291,17 @@ parameters for the file system to set this state.
=20
 d_alloc_parallel() no longer requires a waitqueue_head.  It uses one
 from an internal table when needed.
+
+---
+
+** mandatory**
+
+All dentry_lookup* functions may return a d_in_lookup() dentry if passed
+"O_CREATE|O_EXCL" or "O_RENAME_TARGET".  done_dentry_lookup() calls the
+necessary d_lookup_done().  If the caller *knows* which filesystem is
+being used, it may know that this is not possible.  Otherwise it must be
+careful testing if the dentry is positive or negative as the lookup may
+not have been performed yet.
+
+inode_operations.lookup() is now only ever called with a d_in_lookup()
+dentry.
diff --git a/fs/dcache.c b/fs/dcache.c
index 5473d906783e..7e3eb5576fa4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2136,18 +2136,10 @@ struct dentry *d_add_ci(struct dentry *dentry, stru=
ct inode *inode,
 		iput(inode);
 		return found;
 	}
-	if (d_in_lookup(dentry)) {
-		found =3D d_alloc_parallel(dentry->d_parent, name);
-		if (IS_ERR(found) || !d_in_lookup(found)) {
-			iput(inode);
-			return found;
-		}
-	} else {
-		found =3D d_alloc(dentry->d_parent, name);
-		if (!found) {
-			iput(inode);
-			return ERR_PTR(-ENOMEM);
-		}
+	found =3D d_alloc_parallel(dentry->d_parent, name);
+	if (IS_ERR(found) || !d_in_lookup(found)) {
+		iput(inode);
+		return found;
 	}
 	res =3D d_splice_alias(inode, found);
 	if (res) {
diff --git a/fs/namei.c b/fs/namei.c
index 2c98672fdb6a..6a645f3a2b20 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1666,13 +1666,14 @@ static struct dentry *lookup_dcache(const struct qs=
tr *name,
 }
=20
 /*
- * Parent directory has inode locked exclusive.  This is one
- * and only case when ->lookup() gets called on non in-lookup
- * dentries - as the matter of fact, this only gets called
- * when directory is guaranteed to have no in-lookup children
- * at all.
- * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
- * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
+ * Parent directory has inode locked.
+ * d_lookup_done() must be called before the dentry is dput()
+ * if LOOKUP_EXCL or LOOKUP_RENAME_TARGET is set.
+ * If the dentry is not d_in_lookup():
+ *   Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't pass=
ed.
+ *   Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
+ * If it is d_in_lookup() then these conditions can only be checked by the
+ * file system when carrying out the intent (create or rename).
  */
 struct dentry *lookup_one_qstr_excl(const struct qstr *name,
 				    struct dentry *base, unsigned int flags)
@@ -1690,18 +1691,27 @@ struct dentry *lookup_one_qstr_excl(const struct qs=
tr *name,
 	if (unlikely(IS_DEADDIR(dir)))
 		return ERR_PTR(-ENOENT);
=20
-	dentry =3D d_alloc(base, name);
-	if (unlikely(!dentry))
-		return ERR_PTR(-ENOMEM);
+	dentry =3D d_alloc_parallel(base, name);
+	if (unlikely(IS_ERR(dentry)))
+		return dentry;
+	if (unlikely(!d_in_lookup(dentry)))
+		/* Raced with another thread which did the lookup */
+		goto found;
=20
 	old =3D dir->i_op->lookup(dir, dentry, flags);
 	if (unlikely(old)) {
+		d_lookup_done(dentry);
 		dput(dentry);
 		dentry =3D old;
 	}
 found:
 	if (IS_ERR(dentry))
 		return dentry;
+	if (d_in_lookup(dentry))
+		/* We cannot check for errors - the caller will have to
+		 * wait for any create-etc attempt to get relevant errors.
+		 */
+		return dentry;
 	if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
 		dput(dentry);
 		return ERR_PTR(-ENOENT);
@@ -1767,6 +1777,8 @@ static int lookup_one_common(struct mnt_idmap *idmap,
  * This function is for VFS-internal use only.
  *
  * Returns: the dentry, suitably locked, or an ERR_PTR().
+ *    The dentry may be d_in_lookup() if LOOKUP_EXCL or LOOKUP_RENAME_TARG=
ET
+ *    is given, depending on the filesystem.
  */
 static struct dentry *__dentry_lookup(struct qstr *last,
 				      struct dentry *base,
@@ -1796,7 +1808,10 @@ static struct dentry *__dentry_lookup(struct qstr *l=
ast,
  * The "necessary locks" are currently the inode lock on @base.
  * The name @last is NOT expected to have the hash calculated.
  * No permission checks are performed.
+ *
  * Returns: the dentry, suitably locked, or an ERR_PTR().
+ *    The dentry may be d_in_lookup() if LOOKUP_EXCL or LOOKUP_RENAME_TARG=
ET
+ *    is given, depending on the filesystem.
  */
 struct dentry *dentry_lookup_noperm(struct qstr *last,
 				    struct dentry *base,
@@ -1825,6 +1840,8 @@ EXPORT_SYMBOL(dentry_lookup_noperm);
  * Permission checks are performed to ensure %MAY_EXEC access to @base.
  *
  * Returns: the dentry, suitably locked, or an ERR_PTR().
+ *    The dentry may be d_in_lookup() if LOOKUP_EXCL or LOOKUP_RENAME_TARG=
ET
+ *    is given, depending on the filesystem.
  */
 struct dentry *dentry_lookup(struct mnt_idmap *idmap,
 			     struct qstr *last,
@@ -1852,7 +1869,10 @@ EXPORT_SYMBOL(dentry_lookup);
  * If a fatal signal arrives, or is already pending, the operation is abor=
ted.
  * The name @last is NOT expected to already have the hash calculated.
  * Permission checks are performed to ensure %MAY_EXEC access to @base.
+ *
  * Returns: the dentry, suitably locked, or an ERR_PTR().
+ *    The dentry may be d_in_lookup() if LOOKUP_EXCL or LOOKUP_RENAME_TARG=
ET
+ *    is given, depending on the filesystem.
  */
 struct dentry *dentry_lookup_killable(struct mnt_idmap *idmap,
 				      struct qstr *last,
@@ -1937,6 +1957,7 @@ void done_dentry_lookup(struct dentry *dentry)
 {
 	if (!IS_ERR(dentry)) {
 		inode_unlock(dentry->d_parent->d_inode);
+		d_lookup_done(dentry);
 		dput(dentry);
 	}
 }
@@ -3613,9 +3634,11 @@ __rename_lookup(struct renamedata *rd, int lookup_fl=
ags)
 	return 0;
=20
 out_unlock_3:
+	d_lookup_done(d2);
 	dput(d2);
 	d2 =3D ERR_PTR(err);
 out_unlock_2:
+	d_lookup_done(d1);
 	dput(d1);
 	d1 =3D d2;
 out_unlock_1:
@@ -3732,6 +3755,8 @@ EXPORT_SYMBOL(rename_lookup);
  */
 void done_rename_lookup(struct renamedata *rd)
 {
+	d_lookup_done(rd->old_dentry);
+	d_lookup_done(rd->new_dentry);
 	unlock_rename(rd->old_parent, rd->new_parent);
 	dput(rd->old_parent);
 	dput(rd->old_dentry);
--=20
2.50.0.107.gf914562f5916.dirty
From nobody Sat Oct  4 22:34:43 2025
Received: from neil.brown.name (neil.brown.name [103.29.64.221])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id DFDAA2D4B5C;
	Tue, 12 Aug 2025 23:53:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=103.29.64.221
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1755042785; cv=none;
 b=B7mK3ajcd4qWz7fGcKi/NmHjnNI5gvYN/6mswx+X/9UJmtkTz02nz95U3AucYH09V/MrQsDnLQcpPSIFYQu9Xb51Ws+uD+tfeWo6NkXi2OAFrbob7yGOcTlOsegJoNeJCJrbHrdhU9kyz0vqwnZOukJrjL86YiJIMaK0C6bnUkQ=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1755042785; c=relaxed/simple;
	bh=9qcyWdG+FPc6f3BpLvDkzYF4KvXRp3c65nSRlmO6sGk=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=hGwXVlzMaUCbPagmjpVsZe0cLUg3G+KEVhUdWNGAW3B4LBEfdhXYc+eXWIu2oIwY+G5UfRFXFujSqac/vxLK8ETdRWuJ7meBNkYg1R9VwIoh3OoA2/fNqQw2sC5jbWHHK9CcKQVjKECCaSpudlK9NdYUnzkcMgCkW8hRQ0LDVyI=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name;
 spf=pass smtp.mailfrom=neil.brown.name; arc=none smtp.client-ip=103.29.64.221
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=none (p=none dis=none) header.from=brown.name
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=neil.brown.name
Received: from 196.186.233.220.static.exetel.com.au ([220.233.186.196]
 helo=home.neil.brown.name)
	by neil.brown.name with esmtp (Exim 4.95)
	(envelope-from <mr@neil.brown.name>)
	id 1ulynR-005Y2F-21;
	Tue, 12 Aug 2025 23:52:50 +0000
From: NeilBrown <neil@brown.name>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jan Kara <jack@suse.cz>
Cc: David Howells <dhowells@redhat.com>,
	Marc Dionne <marc.dionne@auristor.com>,
	Xiubo Li <xiubli@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Tyler Hicks <code@tyhicks.com>,
	Miklos Szeredi <miklos@szeredi.hu>,
	Richard Weinberger <richard@nod.at>,
	Anton Ivanov <anton.ivanov@cambridgegreys.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>,
	Amir Goldstein <amir73il@gmail.com>,
	Steve French <sfrench@samba.org>,
	Namjae Jeon <linkinjeon@kernel.org>,
	Carlos Maiolino <cem@kernel.org>,
	linux-fsdevel@vger.kernel.org,
	linux-afs@lists.infradead.org,
	netfs@lists.linux.dev,
	ceph-devel@vger.kernel.org,
	ecryptfs@vger.kernel.org,
	linux-um@lists.infradead.org,
	linux-nfs@vger.kernel.org,
	linux-unionfs@vger.kernel.org,
	linux-cifs@vger.kernel.org,
	linux-xfs@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 11/11] VFS: introduce d_alloc_noblock() and d_alloc_locked()
Date: Tue, 12 Aug 2025 12:25:14 +1000
Message-ID: <20250812235228.3072318-12-neil@brown.name>
X-Mailer: git-send-email 2.50.0.107.gf914562f5916.dirty
In-Reply-To: <20250812235228.3072318-1-neil@brown.name>
References: <20250812235228.3072318-1-neil@brown.name>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Several filesystems use the results of readdir to prime the dcache.
These filesystems use d_alloc_parallel() which can block if there is a
concurrent lookup.  Blocking in that case is pointless as the lookup
will add info to the dcache and there is no value in the readdir waiting
to see if it should add the info too.

Also these calls to d_alloc_parallel() are made while the parent
directory is locked.  A proposed change to locking will lock the parent
later, after d_alloc_parallel().  This means it won't be safe to wait in
d_alloc_parallel() while holding the directory lock.

So this patch introduces d_alloc_noblock() which doesn't block
but instead returns ERR_PTR(-EWOULDBLOCK).  Filesystems that prime the
dcache now use that and ignore -EWOULDBLOCK errors as harmless.

A few filesystems need more than -EWOULDBLOCK - they need to be able to
create the missing dentry within the readdir.  procfs is a good example
as the inode number is not known until the lookup completes, so readdir
must perform a full lookup.

For these filesystems d_alloc_locked() is provided.  It will return a
dentry which is already d_in_lookup() but will also lock it against
concurrent lookup.  The filesystem's ->lookup function must co-operate
by calling lock_lookup() before proceeding with the lookup.  This way we
can ensure exclusion between a lookup performed in ->iterate_shared and
a lookup performed in ->lookup.  Currently this exclusion is provided by
waiting in d_wait_lookup().  The proposed changed to dir locking will
mean that calling d_wait_lookup() (in readdir) while already holding
i_rwsem could deadlock.

Any filesystem (i.e.  xfs) that uses d_add_ci() faces a similar problem
as d_add_ci() calls d_alloc_parallel() while holding i_rwsem and so
will risk deadlock.  d_add_ci() is changed to use d_alloc_locked() and
any filesystem using it must call lock_lookup() in the ->lookup
function, at least when configured for ci.

After the changes to directory locking are complete, filesystems which
opt out of using i_rwsem (which will be for all member-related
activities except ->iterate_shared) lookups will no longer wait for
i_rwsem so d_alloc_locked() will no longer be needed: d_alloc_parallel()
won't deadlock.

Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/dcache.c             | 147 ++++++++++++++++++++++++++++++++++++++--
 fs/fuse/readdir.c       |  14 ++--
 fs/namei.c              |  57 ++++++++++++++--
 fs/nfs/dir.c            |  13 ++--
 fs/smb/client/readdir.c |   2 +-
 fs/xfs/xfs_iops.c       |   5 ++
 include/linux/dcache.h  |   7 +-
 include/linux/namei.h   |   2 +
 8 files changed, 225 insertions(+), 22 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 7e3eb5576fa4..ca96518f21f1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2121,6 +2121,10 @@ EXPORT_SYMBOL(d_obtain_root);
  *
  * If no entry exists with the exact case name, allocate new dentry with
  * the exact case, and return the spliced entry.
+ *
+ * Any filesystem which calls this in its ->lookup function must
+ * call lock_lookup() at the start of that function and return the
+ * result if IS_ERR_OR_NULL()
  */
 struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 			struct qstr *name)
@@ -2136,7 +2140,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct=
 inode *inode,
 		iput(inode);
 		return found;
 	}
-	found =3D d_alloc_parallel(dentry->d_parent, name);
+	found =3D d_alloc_locked(dentry->d_parent, name);
 	if (IS_ERR(found) || !d_in_lookup(found)) {
 		iput(inode);
 		return found;
@@ -2581,8 +2585,16 @@ static void d_wait_lookup(struct dentry *dentry)
 	}
 }
=20
-struct dentry *d_alloc_parallel(struct dentry *parent,
-				const struct qstr *name)
+/* What to do when __d_alloc_parallel finds a d_in_lookup dentry */
+enum alloc_para {
+	ALLOC_PARA_WAIT,
+	ALLOC_PARA_FAIL,
+	ALLOC_PARA_RETURN,
+};
+
+static inline struct dentry *__d_alloc_parallel(struct dentry *parent,
+						const struct qstr *name,
+						enum alloc_para how)
 {
 	unsigned int hash =3D name->hash;
 	struct hlist_bl_head *b =3D in_lookup_hash(parent, hash);
@@ -2596,6 +2608,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
=20
 	new->d_flags |=3D DCACHE_PAR_LOOKUP;
 	new->d_wait =3D NULL;
+	new->d_lookup_locked =3D false;
 	spin_lock(&parent->d_lock);
 	new->d_parent =3D dget_dlock(parent);
 	hlist_add_head(&new->d_sib, &parent->d_children);
@@ -2663,7 +2676,22 @@ struct dentry *d_alloc_parallel(struct dentry *paren=
t,
 		 * wait for them to finish
 		 */
 		spin_lock(&dentry->d_lock);
-		d_wait_lookup(dentry);
+		if (d_in_lookup(dentry))
+			switch (how) {
+			case ALLOC_PARA_FAIL:
+				spin_unlock(&dentry->d_lock);
+				dput(new);
+				dput(dentry);
+				return ERR_PTR(-EWOULDBLOCK);
+			case ALLOC_PARA_RETURN:
+				spin_unlock(&dentry->d_lock);
+				dput(new);
+				return dentry;
+			case ALLOC_PARA_WAIT:
+				d_wait_lookup(dentry);
+				/* ... and continue */
+			}
+
 		/*
 		 * it's not in-lookup anymore; in principle we should repeat
 		 * everything from dcache lookup, but it's likely to be what
@@ -2692,8 +2720,116 @@ struct dentry *d_alloc_parallel(struct dentry *pare=
nt,
 	dput(dentry);
 	goto retry;
 }
+
+/**
+ * d_alloc_parallel() - allocate a new dentry and ensure uniqueness
+ * @parent - dentry of the parent
+ * @name   - name of the dentry within that parent.
+ *
+ * A new dentry is allocated and, providing it is unique, added to the
+ * relevant index.
+ * If an existing dentry is found with the same parent/name that is
+ * not d_in_lookup(), then that is returned instead.
+ * If the existing dentry is d_in_lookup(), d_alloc_parallel() waits for
+ * that lookup to complete before returning the dentry and then ensures the
+ * match is still valid.
+ * Thus if the returned dentry is d_in_lookup() then the caller has
+ * exclusive access until it completes the lookup.  (but see
+ * d_alloc_locked() below) If the returned dentry is not
+ * d_in_lookup() then a lookup has already completed.
+ *
+ * The @name must already have ->hash set, as can be achieved
+ * by e.g. try_lookup_noperm().
+ *
+ * Returns: the dentry, whether found or allocated, or an error %-ENOMEM.
+ */
+struct dentry *d_alloc_parallel(struct dentry *parent,
+				const struct qstr *name)
+{
+	return __d_alloc_parallel(parent, name, ALLOC_PARA_WAIT);
+}
 EXPORT_SYMBOL(d_alloc_parallel);
=20
+/**
+ * d_alloc_noblock() - find or allocate a new dentry
+ * @parent - dentry of the parent
+ * @name   - name of the dentry within that parent.
+ *
+ * A new dentry is allocated and, providing it is unique, added to the
+ * relevant index.
+ * If an existing dentry is found with the same parent/name that is
+ * not d_in_lookup() then that is returned instead.
+ * If the existing dentry is d_in_lookup(), d_alloc_noblock()
+ * returns with error %-EWOULDBLOCK.
+ * Thus if the returned dentry is d_in_lookup() then the caller has
+ * exclusive access until it completes the lookup.  (but see
+ * d_alloc_locked() below) If the returned dentry is not
+ * d_in_lookup() then a lookup has already completed.
+ *
+ * The @name must already have ->hash set, as can be achieved
+ * by e.g. try_lookup_noperm().
+ *
+ * Returns: the dentry, whether found or allocated, or an error
+ *    %-ENOMEM or %-EWOULDBLOCK.
+ */
+struct dentry *d_alloc_noblock(struct dentry *parent,
+					struct qstr *name)
+{
+	return __d_alloc_parallel(parent, name, ALLOC_PARA_FAIL);
+}
+EXPORT_SYMBOL(d_alloc_noblock);
+
+/**
+ * d_alloc_locked() - allocate a new dentry and ensure uniqueness
+ * @parent - dentry of the parent
+ * @name   - name of the dentry within that parent.
+ *
+ * A new dentry is allocated and, providing it is unique, added to the
+ * relevant index.
+ * If an existing dentry is found with the same parent/name that is not
+ * d_in_lookup() then that is returned instead.  If the existing dentry
+ * is d_in_lookup(), d_alloc_locked() will return it and the caller
+ * should instantiate it.  This requires particular care on the part of
+ * the caller.
+ *
+ * This may only be used by a filesystem on its own dentrys.  Any filesyst=
em
+ * using it must have a lookup inode_operation which first calls
+ * lock_lookup() on the dentry and returns the result if it IS_ERR_OR_NULL=
().
+ * This will ensure only one of the callers of d_alloc_locked() or ->looku=
p()
+ * will instantiate the dentry, but not both.
+ *
+ * Returns: the dentry, whether found or allocated, or an error
+ *    -ENOMEM.
+ */
+struct dentry *d_alloc_locked(struct dentry *parent,
+			      struct qstr *name)
+{
+	struct dentry *dentry;
+again:
+	dentry =3D __d_alloc_parallel(parent, name, ALLOC_PARA_RETURN);
+	if (IS_ERR(dentry))
+		return dentry;
+	if (d_in_lookup(dentry)) {
+		spin_lock(&dentry->d_lock);
+		wait_var_event_spinlock(&dentry->d_lookup_locked,
+					!d_in_lookup(dentry) ||
+					!dentry->d_lookup_locked,
+					&dentry->d_lock);
+		if (d_in_lookup(dentry)) {
+			dentry->d_lookup_locked =3D true;
+			spin_unlock(&dentry->d_lock);
+			return dentry;
+		}
+		spin_unlock(&dentry->d_lock);
+	}
+	if (d_unhashed(dentry)) {
+		dput(dentry);
+		goto again;
+	}
+	return dentry;
+}
+EXPORT_SYMBOL(d_alloc_locked);
+
 /*
  * - Unhash the dentry
  * - Retrieve and clear the waitqueue head in dentry
@@ -2706,6 +2842,9 @@ static wait_queue_head_t *__d_lookup_unhash(struct de=
ntry *dentry)
=20
 	lockdep_assert_held(&dentry->d_lock);
=20
+	if (dentry->d_lookup_locked)
+		wake_up_var_locked(&dentry->d_lookup_locked, &dentry->d_lock);
+
 	b =3D in_lookup_hash(dentry->d_parent, dentry->d_name.hash);
 	hlist_bl_lock(b);
 	dentry->d_flags &=3D ~DCACHE_PAR_LOOKUP;
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index f588252891af..d566db29c51a 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -6,12 +6,12 @@
   See the file COPYING.
 */
=20
-
 #include "fuse_i.h"
 #include <linux/iversion.h>
 #include <linux/posix_acl.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/namei.h>
=20
 static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ct=
x)
 {
@@ -192,14 +192,18 @@ static int fuse_direntplus_link(struct file *file,
 	fc =3D get_fuse_conn(dir);
 	epoch =3D atomic_read(&fc->epoch);
=20
-	name.hash =3D full_name_hash(parent, name.name, name.len);
-	dentry =3D d_lookup(parent, &name);
+	dentry =3D try_lookup_noperm(&name, parent);
 	if (!dentry) {
 retry:
-		dentry =3D d_alloc_parallel(parent, &name);
-		if (IS_ERR(dentry))
+		dentry =3D d_alloc_noblock(parent, &name);
+		if (IS_ERR(dentry)) {
+			if (PTR_ERR(dentry) =3D=3D -EWOULDBLOCK)
+				/* harmless */
+				return 0;
 			return PTR_ERR(dentry);
+		}
 	}
+
 	if (!d_in_lookup(dentry)) {
 		struct fuse_inode *fi;
 		inode =3D d_inode(dentry);
diff --git a/fs/namei.c b/fs/namei.c
index 6a645f3a2b20..5e03458f503c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1665,6 +1665,49 @@ static struct dentry *lookup_dcache(const struct qst=
r *name,
 	return dentry;
 }
=20
+/*
+ * lock_lookup: prepare to lookup exclusively with d_alloc_locked()
+ * @dentry: the dentry that needs to be locked for lookup
+ *
+ * Any filesystem which uses d_alloc_locked() internally must
+ * commense the lookup() inode_operation with a called to lock_lookup(),
+ * and must immediately return the result if it is %NULL or an error.
+ * This protects against races so that only one thread will proceed
+ * to create the relevant inode and instantiate it to the dentry.
+ *
+ * The lock is achieved by setting ->d_lookup_locked while
+ * %DCACHE_PAR_LOOKUP is set.  d_lookup_done() and other functions
+ * which clear %DCACHE_PAR_LOOKUP will wake up any waiters if
+ * ->d_lookup_locked was set.
+ *
+ * Returns: @dentry if the lock was gained, else a suitable return value
+ * for ->lookup, either %NULL if lookup is already compete or  %-EAGAIN
+ * indicating that the dcache lookup needs to be repeated.
+ */
+struct dentry *lock_lookup(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	wait_var_event_spinlock(&dentry->d_lookup_locked,
+				!d_in_lookup(dentry) ||
+				!dentry->d_lookup_locked,
+				&dentry->d_lock);
+	if (d_in_lookup(dentry)) {
+		dentry->d_lookup_locked =3D true;
+		spin_unlock(&dentry->d_lock);
+		/* Continue with normal lookup */
+		return dentry;
+	}
+	spin_unlock(&dentry->d_lock);
+	if (!d_unhashed(dentry))
+		/* Just return dentry */
+		return NULL;
+	/* lookup didn't hash dentry, maybe it substituted a dentry.
+	 * Need to retry
+	 */
+	return ERR_PTR(-EAGAIN);
+}
+EXPORT_SYMBOL(lock_lookup);
+
 /*
  * Parent directory has inode locked.
  * d_lookup_done() must be called before the dentry is dput()
@@ -1682,6 +1725,7 @@ struct dentry *lookup_one_qstr_excl(const struct qstr=
 *name,
 	struct dentry *old;
 	struct inode *dir;
=20
+again:
 	dentry =3D lookup_dcache(name, base, flags);
 	if (dentry)
 		goto found;
@@ -1702,6 +1746,8 @@ struct dentry *lookup_one_qstr_excl(const struct qstr=
 *name,
 	if (unlikely(old)) {
 		d_lookup_done(dentry);
 		dput(dentry);
+		if (old =3D=3D ERR_PTR(-EAGAIN))
+			goto again;
 		dentry =3D old;
 	}
 found:
@@ -2057,6 +2103,8 @@ static struct dentry *__lookup_slow(const struct qstr=
 *name,
 		d_lookup_done(dentry);
 		if (unlikely(old)) {
 			dput(dentry);
+			if (old =3D=3D ERR_PTR(-EAGAIN))
+			    goto again;
 			dentry =3D old;
 		}
 	}
@@ -4057,6 +4105,7 @@ static struct dentry *lookup_open(struct nameidata *n=
d, struct file *file,
 		return ERR_PTR(-ENOENT);
=20
 	file->f_mode &=3D ~FMODE_CREATED;
+again:
 	dentry =3D d_lookup(dir, &nd->last);
 	for (;;) {
 		if (!dentry) {
@@ -4120,11 +4169,11 @@ static struct dentry *lookup_open(struct nameidata =
*nd, struct file *file,
 							     nd->flags);
 		d_lookup_done(dentry);
 		if (unlikely(res)) {
-			if (IS_ERR(res)) {
-				error =3D PTR_ERR(res);
-				goto out_dput;
-			}
 			dput(dentry);
+			if (res =3D=3D ERR_PTR(-EAGAIN))
+				goto again;
+			if (IS_ERR(res))
+				return res;
 			dentry =3D res;
 		}
 	}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index bbeb24805a0e..4293754cdec6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -750,15 +750,14 @@ void nfs_prime_dcache(struct dentry *parent, struct n=
fs_entry *entry,
 		if (filename.len =3D=3D 2 && filename.name[1] =3D=3D '.')
 			return;
 	}
-	filename.hash =3D full_name_hash(parent, filename.name, filename.len);
=20
-	dentry =3D d_lookup(parent, &filename);
+	dentry =3D try_lookup_noperm(&filename, parent);
 again:
-	if (!dentry) {
-		dentry =3D d_alloc_parallel(parent, &filename);
-		if (IS_ERR(dentry))
-			return;
-	}
+	if (!dentry)
+		dentry =3D d_alloc_noblock(parent, &filename);
+	if (IS_ERR(dentry))
+		return;
+
 	if (!d_in_lookup(dentry)) {
 		/* Is there a mountpoint here? If so, just exit */
 		if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid,
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 5a92a1ad317d..d47fc8ab7879 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -105,7 +105,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *n=
ame,
 		    (fattr->cf_flags & CIFS_FATTR_NEED_REVAL))
 			return;
=20
-		dentry =3D d_alloc_parallel(parent, name);
+		dentry =3D d_alloc_noblock(parent, name);
 	}
 	if (IS_ERR(dentry))
 		return;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 149b5460fbfd..07a4a8116f08 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -35,6 +35,7 @@
 #include <linux/security.h>
 #include <linux/iversion.h>
 #include <linux/fiemap.h>
+#include <linux/namei.h>
=20
 /*
  * Directories have different lock order w.r.t. mmap_lock compared to regu=
lar
@@ -349,6 +350,10 @@ xfs_vn_ci_lookup(
 	if (dentry->d_name.len >=3D MAXNAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
=20
+	dentry =3D lock_lookup(dentry);
+	if (IS_ERR_OR_NULL(dentry))
+		return dentry;
+
 	xfs_dentry_to_name(&xname, dentry);
 	error =3D xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
 	if (unlikely(error)) {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 996259d1bc88..cfccd5c2fa5b 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -114,7 +114,10 @@ struct dentry {
=20
 	union {
 		struct list_head d_lru;		/* LRU list */
-		wait_queue_head_t *d_wait;	/* in-lookup ones only */
+		struct {			/* in-lookup ones only */
+			wait_queue_head_t *d_wait;
+			bool		   d_lookup_locked;
+		};
 	};
 	struct hlist_node d_sib;	/* child of parent list */
 	struct hlist_head d_children;	/* our children */
@@ -242,6 +245,8 @@ extern void d_delete(struct dentry *);
 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
 extern struct dentry * d_alloc_anon(struct super_block *);
 extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr=
 *);
+extern struct dentry * d_alloc_noblock(struct dentry *, struct qstr *);
+extern struct dentry * d_alloc_locked(struct dentry *, struct qstr *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 /* weird procfs mess; *NOT* exported */
 extern struct dentry * d_splice_alias_ops(struct inode *, struct dentry *,
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 26e5778c665f..a39dca19375b 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -101,6 +101,8 @@ struct dentry *simple_start_creating(struct dentry *par=
ent, const char *name)
 				    LOOKUP_CREATE | LOOKUP_EXCL);
 }
=20
+struct dentry *lock_lookup(struct dentry *dentry);
+
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *path, unsigned int flags);
 extern int follow_up(struct path *);
--=20
2.50.0.107.gf914562f5916.dirty