[PATCH 16/53] ovl: drop dir lock for lookups in impure readdir

NeilBrown posted 53 patches 3 weeks, 4 days ago
[PATCH 16/53] ovl: drop dir lock for lookups in impure readdir
Posted by NeilBrown 3 weeks, 4 days ago
From: NeilBrown <neil@brown.name>

When performing an "impure" readdir, ovl needs to perform a lookup on some
of the names that it found.
With proposed locking changes it will not be possible to perform this
lookup (in particular, not safe to wait for d_alloc_parallel()) while
holding a lock on the directory.

ovl doesn't really need the lock at this point.  It has already iterated
the directory and has cached a list of the contents.  It now needs to
gather extra information about some contents.  It can do this without
the lock.

After gathering that info it needs to retake the lock for API
correctness.  After doing this it must check IS_DEADDIR() again to
ensure readdir always returns -ENOENT on a removed directory.

Note that while ->iterate_shared is called with a shared lock, ovl uses
WRAP_DIR_ITER() so an exclusive lock is held and so we drop and retake
that exclusive lock.

As the directory is no longer locked in ovl_cache_update() we need
dget_parent() to get a reference to the parent.

Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/overlayfs/readdir.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 1dcc75b3a90f..d5123b37921c 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -568,13 +568,12 @@ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p,
 			goto get;
 		}
 		if (p->len == 2) {
-			/* we shall not be moved */
-			this = dget(dir->d_parent);
+			this = dget_parent(dir);
 			goto get;
 		}
 	}
 	/* This checks also for xwhiteouts */
-	this = lookup_one(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir);
+	this = lookup_one_unlocked(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir);
 	if (IS_ERR_OR_NULL(this) || !this->d_inode) {
 		/* Mark a stale entry */
 		p->is_whiteout = true;
@@ -666,11 +665,12 @@ static int ovl_dir_read_impure(const struct path *path,  struct list_head *list,
 	if (err)
 		return err;
 
+	inode_unlock(path->dentry->d_inode);
 	list_for_each_entry_safe(p, n, list, l_node) {
 		if (!name_is_dot_dotdot(p->name, p->len)) {
 			err = ovl_cache_update(path, p, true);
 			if (err)
-				return err;
+				break;
 		}
 		if (p->ino == p->real_ino) {
 			list_del(&p->l_node);
@@ -680,14 +680,19 @@ static int ovl_dir_read_impure(const struct path *path,  struct list_head *list,
 			struct rb_node *parent = NULL;
 
 			if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len,
-							      &newp, &parent)))
-				return -EIO;
+							      &newp, &parent))) {
+				err = -EIO;
+				break;
+			}
 
 			rb_link_node(&p->node, parent, newp);
 			rb_insert_color(&p->node, root);
 		}
 	}
-	return 0;
+	inode_lock(path->dentry->d_inode);
+	if (IS_DEADDIR(path->dentry->d_inode))
+		err = -ENOENT;
+	return err;
 }
 
 static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
-- 
2.50.0.107.gf914562f5916.dirty
Re: [PATCH 16/53] ovl: drop dir lock for lookups in impure readdir
Posted by Amir Goldstein 3 weeks, 2 days ago
On Thu, Mar 12, 2026 at 10:49 PM NeilBrown <neilb@ownmail.net> wrote:
>
> From: NeilBrown <neil@brown.name>
>
> When performing an "impure" readdir, ovl needs to perform a lookup on some
> of the names that it found.
> With proposed locking changes it will not be possible to perform this
> lookup (in particular, not safe to wait for d_alloc_parallel()) while
> holding a lock on the directory.
>
> ovl doesn't really need the lock at this point.

Not exactly. see below.

> It has already iterated
> the directory and has cached a list of the contents.  It now needs to
> gather extra information about some contents.  It can do this without
> the lock.
>
> After gathering that info it needs to retake the lock for API
> correctness.  After doing this it must check IS_DEADDIR() again to
> ensure readdir always returns -ENOENT on a removed directory.
>
> Note that while ->iterate_shared is called with a shared lock, ovl uses
> WRAP_DIR_ITER() so an exclusive lock is held and so we drop and retake
> that exclusive lock.
>
> As the directory is no longer locked in ovl_cache_update() we need
> dget_parent() to get a reference to the parent.
>
> Signed-off-by: NeilBrown <neil@brown.name>
> ---
>  fs/overlayfs/readdir.c | 19 ++++++++++++-------
>  1 file changed, 12 insertions(+), 7 deletions(-)
>
> diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
> index 1dcc75b3a90f..d5123b37921c 100644
> --- a/fs/overlayfs/readdir.c
> +++ b/fs/overlayfs/readdir.c
> @@ -568,13 +568,12 @@ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p,
>                         goto get;
>                 }
>                 if (p->len == 2) {
> -                       /* we shall not be moved */
> -                       this = dget(dir->d_parent);
> +                       this = dget_parent(dir);
>                         goto get;
>                 }
>         }
>         /* This checks also for xwhiteouts */
> -       this = lookup_one(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir);
> +       this = lookup_one_unlocked(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir);

ovl_cache_update() is also called from ovl_iterate_merged() where inode
is locked.

>         if (IS_ERR_OR_NULL(this) || !this->d_inode) {
>                 /* Mark a stale entry */
>                 p->is_whiteout = true;
> @@ -666,11 +665,12 @@ static int ovl_dir_read_impure(const struct path *path,  struct list_head *list,
>         if (err)
>                 return err;
>
> +       inode_unlock(path->dentry->d_inode);
>         list_for_each_entry_safe(p, n, list, l_node) {
>                 if (!name_is_dot_dotdot(p->name, p->len)) {
>                         err = ovl_cache_update(path, p, true);
>                         if (err)
> -                               return err;
> +                               break;
>                 }
>                 if (p->ino == p->real_ino) {
>                         list_del(&p->l_node);
> @@ -680,14 +680,19 @@ static int ovl_dir_read_impure(const struct path *path,  struct list_head *list,
>                         struct rb_node *parent = NULL;
>
>                         if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len,
> -                                                             &newp, &parent)))
> -                               return -EIO;
> +                                                             &newp, &parent))) {
> +                               err = -EIO;
> +                               break;
> +                       }
>
>                         rb_link_node(&p->node, parent, newp);
>                         rb_insert_color(&p->node, root);
>                 }
>         }
> -       return 0;
> +       inode_lock(path->dentry->d_inode);
> +       if (IS_DEADDIR(path->dentry->d_inode))
> +               err = -ENOENT;
> +       return err;
>  }
>
>  static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
> --

You missed the fact that overlayfs uses the dir inode lock
to protect the readdir inode cache, so your patch introduces
a risk for storing a stale readdir cache when dir modify operations
invalidate the readdir cache version while lock is dropped
and also introduces memory leak when cache is stomped
without freeing cache created by a competing thread.
I think something like the untested patch below should fix this.

I did not look into ovl_iterate_merged() to see if it has a simple
fix and I am not 100% sure that this fix for impure dir is enough.

Thanks,
Amir.

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index d5123b37921c8..9e90064b252ce 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -702,15 +702,13 @@ static struct ovl_dir_cache
*ovl_cache_get_impure(const struct path *path)
        struct inode *inode = d_inode(dentry);
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        struct ovl_dir_cache *cache;
+       /* Snapshot version before ovl_dir_read_impure() drops i_rwsem */
+       u64 version = ovl_inode_version_get(inode);

        cache = ovl_dir_cache(inode);
-       if (cache && ovl_inode_version_get(inode) == cache->version)
+       if (cache && version == cache->version)
                return cache;

-       /* Impure cache is not refcounted, free it here */
-       ovl_dir_cache_free(inode);
-       ovl_set_dir_cache(inode, NULL);
-
        cache = kzalloc_obj(struct ovl_dir_cache);
        if (!cache)
                return ERR_PTR(-ENOMEM);
@@ -721,6 +719,14 @@ static struct ovl_dir_cache
*ovl_cache_get_impure(const struct path *path)
                kfree(cache);
                return ERR_PTR(res);
        }
+
+       /*
+        * Impure cache is not refcounted, free it here.
+        * Also frees cache stored by concurrent readdir during i_rwsem drop.
+        */
+       ovl_dir_cache_free(inode);
+       ovl_set_dir_cache(inode, NULL);
+
        if (list_empty(&cache->entries)) {
                /*
                 * A good opportunity to get rid of an unneeded "impure" flag.
@@ -736,7 +742,7 @@ static struct ovl_dir_cache
*ovl_cache_get_impure(const struct path *path)
                return NULL;
        }

-       cache->version = ovl_inode_version_get(inode);
+       cache->version = version;
        ovl_set_dir_cache(inode, cache);

        return cache;