fs/nfs/dir.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-)
A race condition occurs when both unlink() and link() are running
concurrently on the same inode, and the nlink count from the nfs server
received in link()->nfs_do_access() clobbers the nlink count of the
inode in nfs_safe_remove() after the "remove" RPC is made to the server
but before we decrement the link count. If the nlink value from
nfs_do_access() reflects the decremented nlink of the "remove" RPC, a
double decrement occurs, which can lead to the dropping of the client
side inode, causing the link call to return ENOENT. To fix this, we
record an expected nlink value before the "remove" RPC and compare it
with the value afterwards---if these two are the same, the drop is
performed. Note that this does not take into account nlink values that
are a result of multi-client (un)link operations as these are not
guaranteed to be atomic by the NFS spec.
Signed-off-by: Aiden Lambert <alambert48@gatech.edu>
---
fs/nfs/dir.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 46d9c65d50f..965787a8eee 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1892,19 +1892,24 @@ static int nfs_dentry_delete(const struct dentry *dentry)
return 0;
}
-/* Ensure that we revalidate inode->i_nlink */
-static void nfs_drop_nlink(struct inode *inode)
+static void nfs_drop_nlink_locked(struct inode *inode)
{
- spin_lock(&inode->i_lock);
/* drop the inode if we're reasonably sure this is the last link */
if (inode->i_nlink > 0)
drop_nlink(inode);
NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
nfs_set_cache_invalid(
inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
NFS_INO_INVALID_NLINK);
+}
+
+/* Ensure that we revalidate inode->i_nlink */
+static void nfs_drop_nlink(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs_drop_nlink_locked(inode);
spin_unlock(&inode->i_lock);
}
/*
@@ -2505,11 +2510,18 @@ static int nfs_safe_remove(struct dentry *dentry)
}
trace_nfs_remove_enter(dir, dentry);
if (inode != NULL) {
+ spin_lock(&inode->i_lock);
+ unsigned int expected_nlink = inode->i_nlink;
+
+ spin_unlock(&inode->i_lock);
+
error = NFS_PROTO(dir)->remove(dir, dentry);
- if (error == 0)
- nfs_drop_nlink(inode);
+
+ spin_lock(&inode->i_lock);
+ if (error == 0 && expected_nlink == inode->i_nlink)
+ nfs_drop_nlink_locked(inode);
+ spin_unlock(&inode->i_lock);
} else
error = NFS_PROTO(dir)->remove(dir, dentry);
if (error == -ENOENT)
nfs_dentry_handle_enoent(dentry);
--
2.51.1
On Mon, 2025-11-17 at 13:03 -0500, Aiden Lambert wrote:
> A race condition occurs when both unlink() and link() are running
> concurrently on the same inode, and the nlink count from the nfs
> server
> received in link()->nfs_do_access() clobbers the nlink count of the
> inode in nfs_safe_remove() after the "remove" RPC is made to the
> server
> but before we decrement the link count. If the nlink value from
> nfs_do_access() reflects the decremented nlink of the "remove" RPC, a
> double decrement occurs, which can lead to the dropping of the client
> side inode, causing the link call to return ENOENT. To fix this, we
> record an expected nlink value before the "remove" RPC and compare it
> with the value afterwards---if these two are the same, the drop is
> performed. Note that this does not take into account nlink values
> that
> are a result of multi-client (un)link operations as these are not
> guaranteed to be atomic by the NFS spec.

Why do we end up running nfs_do_access() at all in the above test? That
sounds like a bug. We shouldn't ever need to validate if we can create
or delete things using ACCESS. That just ends up producing an
unnecessary TOCTOU race.

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trondmy@kernel.org, trond.myklebust@hammerspace.com
On Mon, Nov 17, 2025 at 01:24:54PM -0500, Trond Myklebust wrote:
> On Mon, 2025-11-17 at 13:03 -0500, Aiden Lambert wrote:
> > A race condition occurs when both unlink() and link() are running
> > concurrently on the same inode, and the nlink count from the nfs
> > server
> > received in link()->nfs_do_access() clobbers the nlink count of the
> > inode in nfs_safe_remove() after the "remove" RPC is made to the
> > server
> > but before we decrement the link count. If the nlink value from
> > nfs_do_access() reflects the decremented nlink of the "remove" RPC, a
> > double decrement occurs, which can lead to the dropping of the client
> > side inode, causing the link call to return ENOENT. To fix this, we
> > record an expected nlink value before the "remove" RPC and compare it
> > with the value afterwards---if these two are the same, the drop is
> > performed. Note that this does not take into account nlink values
> > that
> > are a result of multi-client (un)link operations as these are not
> > guaranteed to be atomic by the NFS spec.
>
>
> Why do we end up running nfs_do_access() at all in the above test? That
> sounds like a bug. We shouldn't ever need to validate if we can create
> or delete things using ACCESS. That just ends up producing an
> unnecessary TOCTOU race.
>
>
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trondmy@kernel.org, trond.myklebust@hammerspace.com

It seems that the call chain is
1. vfs_link()
2. may_create()
3. inode_permission()/nfs_permission() which fails to get a cached value as the cache is
invalidated by the (un)link operations
4. nfs_do_access()

My initial thought for the fix was to elevate the inode rwsem to exclusive write access
in nfs_do_access if the thread does not already have it, but I figured this design of not
locking was a conscious one. A second thought was to add a new lock around all operations
talking to nfsd and their corresponding client side metadata updates, but that may not be
the way to go.
I have a proof of concept script that catches the symptoms of this race if you would like me to upload it.
On Mon, 2025-11-17 at 13:57 -0500, Aiden Lambert wrote:
> On Mon, Nov 17, 2025 at 01:24:54PM -0500, Trond Myklebust wrote:
> > On Mon, 2025-11-17 at 13:03 -0500, Aiden Lambert wrote:
> > > A race condition occurs when both unlink() and link() are running
> > > concurrently on the same inode, and the nlink count from the nfs
> > > server
> > > received in link()->nfs_do_access() clobbers the nlink count of
> > > the
> > > inode in nfs_safe_remove() after the "remove" RPC is made to the
> > > server
> > > but before we decrement the link count. If the nlink value from
> > > nfs_do_access() reflects the decremented nlink of the "remove"
> > > RPC, a
> > > double decrement occurs, which can lead to the dropping of the
> > > client
> > > side inode, causing the link call to return ENOENT. To fix this,
> > > we
> > > record an expected nlink value before the "remove" RPC and
> > > compare it
> > > with the value afterwards---if these two are the same, the drop
> > > is
> > > performed. Note that this does not take into account nlink values
> > > that
> > > are a result of multi-client (un)link operations as these are not
> > > guaranteed to be atomic by the NFS spec.
> >
> >
> > Why do we end up running nfs_do_access() at all in the above test?
> > That
> > sounds like a bug. We shouldn't ever need to validate if we can
> > create
> > or delete things using ACCESS. That just ends up producing an
> > unnecessary TOCTOU race.
> >
> >
>
> It seems that the call chain is
> 1. vfs_link()
> 2. may_create()
> 3. inode_permission()/nfs_permission() which fails to get a cached
> value as the cache is
> invalidated by the (un)link operations
> 4. nfs_do_access()
I'm still confused. Why wouldn't the S_IFDIR case in nfs_permission()
be filtering away the call to nfs_do_access()?
Either way, I think this cannot be fixed without treating it as a
generic attribute race. Your patch is sort of correct, but there are
loopholes that it won't catch either, so let's just fix it the way that
we fix all attribute races: by using the generation counter to tell us
when such a race may have occurred.
8<---------------------------------------
From bd4928ec799b31c492eb63f9f4a0c1e0bb4bb3f7 Mon Sep 17 00:00:00 2001
Message-ID: <bd4928ec799b31c492eb63f9f4a0c1e0bb4bb3f7.1763418720.git.trond.myklebust@hammerspace.com>
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 17 Nov 2025 15:28:17 -0500
Subject: [PATCH] NFS: Avoid changing nlink when file removes and attribute
updates race
If a file removal races with another operation that updates its
attributes, then skip the change to nlink, and just mark the attributes
as being stale.
Reported-by: Aiden Lambert <alambert48@gatech.edu>
Fixes: 59a707b0d42e ("NFS: Ensure we revalidate the inode correctly after remove or rename")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
fs/nfs/dir.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ea9f6ca8f30f..d557b0443e8b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1894,13 +1894,15 @@ static int nfs_dentry_delete(const struct dentry *dentry)
}
/* Ensure that we revalidate inode->i_nlink */
-static void nfs_drop_nlink(struct inode *inode)
+static void nfs_drop_nlink(struct inode *inode, unsigned long gencount)
{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
spin_lock(&inode->i_lock);
/* drop the inode if we're reasonably sure this is the last link */
- if (inode->i_nlink > 0)
+ if (inode->i_nlink > 0 && gencount == nfsi->attr_gencount)
drop_nlink(inode);
- NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
+ nfsi->attr_gencount = nfs_inc_attr_generation_counter();
nfs_set_cache_invalid(
inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
NFS_INO_INVALID_NLINK);
@@ -1914,8 +1916,9 @@ static void nfs_drop_nlink(struct inode *inode)
static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
{
if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ unsigned long gencount = READ_ONCE(NFS_I(inode)->attr_gencount);
nfs_complete_unlink(dentry, inode);
- nfs_drop_nlink(inode);
+ nfs_drop_nlink(inode, gencount);
}
iput(inode);
}
@@ -2507,9 +2510,11 @@ static int nfs_safe_remove(struct dentry *dentry)
trace_nfs_remove_enter(dir, dentry);
if (inode != NULL) {
+ unsigned long gencount = READ_ONCE(NFS_I(inode)->attr_gencount);
+
error = NFS_PROTO(dir)->remove(dir, dentry);
if (error == 0)
- nfs_drop_nlink(inode);
+ nfs_drop_nlink(inode, gencount);
} else
error = NFS_PROTO(dir)->remove(dir, dentry);
if (error == -ENOENT)
@@ -2709,6 +2714,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
+ unsigned long new_gencount = 0;
struct dentry *dentry = NULL;
struct rpc_task *task;
bool must_unblock = false;
@@ -2761,6 +2767,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
} else {
block_revalidate(new_dentry);
must_unblock = true;
+ new_gencount = NFS_I(new_inode)->attr_gencount;
spin_unlock(&new_dentry->d_lock);
}
@@ -2800,7 +2807,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
new_dir, new_dentry, error);
if (!error) {
if (new_inode != NULL)
- nfs_drop_nlink(new_inode);
+ nfs_drop_nlink(new_inode, new_gencount);
/*
* The d_move() should be here instead of in an async RPC completion
* handler because we need the proper locks to move the dentry. If
--
2.51.1
--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trondmy@kernel.org, trond.myklebust@hammerspace.com
© 2016 - 2025 Red Hat, Inc.