fs/fuse/dev.c | 16 ++++++++++++++++ fs/fuse/dir.c | 22 +++++++++++++++++++--- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 1 + fs/fuse/readdir.c | 3 +++ include/uapi/linux/fuse.h | 6 +++++- 6 files changed, 47 insertions(+), 4 deletions(-)
Currently userspace is able to notify the kernel to invalidate the cache
for an inode. This means that, if all the inodes in a filesystem need to
be invalidated, then userspace needs to iterate through all of them and do
this kernel notification separately.
This patch adds the concept of 'epoch': each fuse connection will have the
current epoch initialized and every new dentry will have it's d_time set to
the current epoch value. A new operation will then allow userspace to
increment the epoch value. Every time a dentry is d_revalidate()'ed, it's
epoch is compared with the current connection epoch and invalidated if it's
value is different.
Signed-off-by: Luis Henriques <luis@igalia.com>
---
Nothing huge since v7, I just realized I forgot to bump the version number
while cleaning up a patch to prepare a libfuse MR.
* Changes since v7
- Bump FUSE interface minor number
* Changes since v6
- Major patch re-write, following a different approach suggested by Miklos
and Dave.
* Changes since v5
- Added missing iput() in function fuse_reverse_inval_all()
* Changes since v4
- Replaced superblock inodes iteration by a single call to
invalidate_inodes(). Also do the shrink_dcache_sb() first. (Dave Chinner)
* Changes since v3
- Added comments to clarify semantic changes in fuse_reverse_inval_inode()
when called with FUSE_INVAL_ALL_INODES (suggested by Bernd).
- Added comments to inodes iteration loop to clarify __iget/iput usage
(suggested by Joanne)
- Dropped get_fuse_mount() call -- fuse_mount can be obtained from
fuse_ilookup() directly (suggested by Joanne)
(Also dropped the RFC from the subject.)
* Changes since v2
- Use the new helper from fuse_reverse_inval_inode(), as suggested by Bernd.
- Also updated patch description as per checkpatch.pl suggestion.
* Changes since v1
As suggested by Bernd, this patch v2 simply adds an helper function that
will make it easier to replace most of it's code by a call to function
super_iter_inodes() when Dave Chinner's patch[1] eventually gets merged.
[1] https://lore.kernel.org/r/20241002014017.3801899-3-david@fromorbit.com
fs/fuse/dev.c | 16 ++++++++++++++++
fs/fuse/dir.c | 22 +++++++++++++++++++---
fs/fuse/fuse_i.h | 3 +++
fs/fuse/inode.c | 1 +
fs/fuse/readdir.c | 3 +++
include/uapi/linux/fuse.h | 6 +++++-
6 files changed, 47 insertions(+), 4 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 5b5f789b37eb..e31a55ac3887 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1902,6 +1902,19 @@ static int fuse_notify_resend(struct fuse_conn *fc)
return 0;
}
+/*
+ * Increments the fuse connection epoch. This will result of dentries from
+ * previous epochs to be invalidated.
+ *
+ * XXX optimization: add call to shrink_dcache_sb()?
+ */
+static int fuse_notify_inc_epoch(struct fuse_conn *fc)
+{
+ atomic_inc(&fc->epoch);
+
+ return 0;
+}
+
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
@@ -1930,6 +1943,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_RESEND:
return fuse_notify_resend(fc);
+ case FUSE_NOTIFY_INC_EPOCH:
+ return fuse_notify_inc_epoch(fc);
+
default:
fuse_copy_finish(cs);
return -EINVAL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 198862b086ff..5291deeb191f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -200,9 +200,14 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name,
{
struct inode *inode;
struct fuse_mount *fm;
+ struct fuse_conn *fc;
struct fuse_inode *fi;
int ret;
+ fc = get_fuse_conn_super(dir->i_sb);
+ if (entry->d_time < atomic_read(&fc->epoch))
+ goto invalid;
+
inode = d_inode_rcu(entry);
if (inode && fuse_is_bad(inode))
goto invalid;
@@ -415,16 +420,20 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
unsigned int flags)
{
- int err;
struct fuse_entry_out outarg;
+ struct fuse_conn *fc;
struct inode *inode;
struct dentry *newent;
+ int err, epoch;
bool outarg_valid = true;
bool locked;
if (fuse_is_bad(dir))
return ERR_PTR(-EIO);
+ fc = get_fuse_conn_super(dir->i_sb);
+ epoch = atomic_read(&fc->epoch);
+
locked = fuse_lock_inode(dir);
err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
&outarg, &inode);
@@ -446,6 +455,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
goto out_err;
entry = newent ? newent : entry;
+ entry->d_time = epoch;
if (outarg_valid)
fuse_change_entry_timeout(entry, &outarg);
else
@@ -619,7 +629,6 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *entry, struct file *file,
unsigned int flags, umode_t mode, u32 opcode)
{
- int err;
struct inode *inode;
struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
@@ -629,11 +638,13 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
struct fuse_entry_out outentry;
struct fuse_inode *fi;
struct fuse_file *ff;
+ int epoch, err;
bool trunc = flags & O_TRUNC;
/* Userspace expects S_IFREG in create mode */
BUG_ON((mode & S_IFMT) != S_IFREG);
+ epoch = atomic_read(&fm->fc->epoch);
forget = fuse_alloc_forget();
err = -ENOMEM;
if (!forget)
@@ -702,6 +713,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
}
kfree(forget);
d_instantiate(entry, inode);
+ entry->d_time = epoch;
fuse_change_entry_timeout(entry, &outentry);
fuse_dir_changed(dir);
err = generic_file_open(inode, file);
@@ -788,12 +800,14 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
struct fuse_entry_out outarg;
struct inode *inode;
struct dentry *d;
- int err;
struct fuse_forget_link *forget;
+ int epoch, err;
if (fuse_is_bad(dir))
return -EIO;
+ epoch = atomic_read(&fm->fc->epoch);
+
forget = fuse_alloc_forget();
if (!forget)
return -ENOMEM;
@@ -836,9 +850,11 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
return PTR_ERR(d);
if (d) {
+ d->d_time = epoch;
fuse_change_entry_timeout(d, &outarg);
dput(d);
} else {
+ entry->d_time = epoch;
fuse_change_entry_timeout(entry, &outarg);
}
fuse_dir_changed(dir);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fee96fe7887b..06eecc125f89 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -611,6 +611,9 @@ struct fuse_conn {
/** Number of fuse_dev's */
atomic_t dev_count;
+ /** Current epoch for up-to-date dentries */
+ atomic_t epoch;
+
struct rcu_head rcu;
/** The user id for this mount */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e9db2cb8c150..5d2d29fad658 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -959,6 +959,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
init_rwsem(&fc->killsb);
refcount_set(&fc->count, 1);
atomic_set(&fc->dev_count, 1);
+ atomic_set(&fc->epoch, 1);
init_waitqueue_head(&fc->blocked_waitq);
fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
INIT_LIST_HEAD(&fc->bg_queue);
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 17ce9636a2b1..46b7146f2c0d 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -161,6 +161,7 @@ static int fuse_direntplus_link(struct file *file,
struct fuse_conn *fc;
struct inode *inode;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ int epoch;
if (!o->nodeid) {
/*
@@ -190,6 +191,7 @@ static int fuse_direntplus_link(struct file *file,
return -EIO;
fc = get_fuse_conn(dir);
+ epoch = atomic_read(&fc->epoch);
name.hash = full_name_hash(parent, name.name, name.len);
dentry = d_lookup(parent, &name);
@@ -256,6 +258,7 @@ static int fuse_direntplus_link(struct file *file,
}
if (fc->readdirplus_auto)
set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
+ dentry->d_time = epoch;
fuse_change_entry_timeout(dentry, o);
dput(dentry);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 5e0eb41d967e..15991728a894 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -229,6 +229,9 @@
* - FUSE_URING_IN_OUT_HEADER_SZ
* - FUSE_URING_OP_IN_OUT_SZ
* - enum fuse_uring_cmd
+ *
+ * 7.43
+ * - add FUSE_NOTIFY_INC_EPOCH
*/
#ifndef _LINUX_FUSE_H
@@ -264,7 +267,7 @@
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 42
+#define FUSE_KERNEL_MINOR_VERSION 43
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
@@ -666,6 +669,7 @@ enum fuse_notify_code {
FUSE_NOTIFY_RETRIEVE = 5,
FUSE_NOTIFY_DELETE = 6,
FUSE_NOTIFY_RESEND = 7,
+ FUSE_NOTIFY_INC_EPOCH = 8,
FUSE_NOTIFY_CODE_MAX,
};
Hi Miklos,
On Wed, Feb 26 2025, Luis Henriques wrote:
> Currently userspace is able to notify the kernel to invalidate the cache
> for an inode. This means that, if all the inodes in a filesystem need to
> be invalidated, then userspace needs to iterate through all of them and do
> this kernel notification separately.
>
> This patch adds the concept of 'epoch': each fuse connection will have the
> current epoch initialized and every new dentry will have it's d_time set to
> the current epoch value. A new operation will then allow userspace to
> increment the epoch value. Every time a dentry is d_revalidate()'ed, it's
> epoch is compared with the current connection epoch and invalidated if it's
> value is different.
Any further feedback on this patch, or is it already OK for being merged?
And what about the extra call to shrink_dcache_sb(), do you think that
would that be acceptable? Maybe that could be conditional, by for example
setting a flag.
Cheers,
--
Luís
> Signed-off-by: Luis Henriques <luis@igalia.com>
> ---
> Nothing huge since v7, I just realized I forgot to bump the version number
> while cleaning up a patch to prepare a libfuse MR.
>
> * Changes since v7
> - Bump FUSE interface minor number
>
> * Changes since v6
> - Major patch re-write, following a different approach suggested by Miklos
> and Dave.
>
> * Changes since v5
> - Added missing iput() in function fuse_reverse_inval_all()
>
> * Changes since v4
> - Replaced superblock inodes iteration by a single call to
> invalidate_inodes(). Also do the shrink_dcache_sb() first. (Dave Chinner)
>
> * Changes since v3
> - Added comments to clarify semantic changes in fuse_reverse_inval_inode()
> when called with FUSE_INVAL_ALL_INODES (suggested by Bernd).
> - Added comments to inodes iteration loop to clarify __iget/iput usage
> (suggested by Joanne)
> - Dropped get_fuse_mount() call -- fuse_mount can be obtained from
> fuse_ilookup() directly (suggested by Joanne)
>
> (Also dropped the RFC from the subject.)
>
> * Changes since v2
> - Use the new helper from fuse_reverse_inval_inode(), as suggested by Bernd.
> - Also updated patch description as per checkpatch.pl suggestion.
>
> * Changes since v1
> As suggested by Bernd, this patch v2 simply adds an helper function that
> will make it easier to replace most of it's code by a call to function
> super_iter_inodes() when Dave Chinner's patch[1] eventually gets merged.
>
> [1] https://lore.kernel.org/r/20241002014017.3801899-3-david@fromorbit.com
>
> fs/fuse/dev.c | 16 ++++++++++++++++
> fs/fuse/dir.c | 22 +++++++++++++++++++---
> fs/fuse/fuse_i.h | 3 +++
> fs/fuse/inode.c | 1 +
> fs/fuse/readdir.c | 3 +++
> include/uapi/linux/fuse.h | 6 +++++-
> 6 files changed, 47 insertions(+), 4 deletions(-)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 5b5f789b37eb..e31a55ac3887 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -1902,6 +1902,19 @@ static int fuse_notify_resend(struct fuse_conn *fc)
> return 0;
> }
>
> +/*
> + * Increments the fuse connection epoch. This will result of dentries from
> + * previous epochs to be invalidated.
> + *
> + * XXX optimization: add call to shrink_dcache_sb()?
> + */
> +static int fuse_notify_inc_epoch(struct fuse_conn *fc)
> +{
> + atomic_inc(&fc->epoch);
> +
> + return 0;
> +}
> +
> static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
> unsigned int size, struct fuse_copy_state *cs)
> {
> @@ -1930,6 +1943,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
> case FUSE_NOTIFY_RESEND:
> return fuse_notify_resend(fc);
>
> + case FUSE_NOTIFY_INC_EPOCH:
> + return fuse_notify_inc_epoch(fc);
> +
> default:
> fuse_copy_finish(cs);
> return -EINVAL;
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index 198862b086ff..5291deeb191f 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -200,9 +200,14 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name,
> {
> struct inode *inode;
> struct fuse_mount *fm;
> + struct fuse_conn *fc;
> struct fuse_inode *fi;
> int ret;
>
> + fc = get_fuse_conn_super(dir->i_sb);
> + if (entry->d_time < atomic_read(&fc->epoch))
> + goto invalid;
> +
> inode = d_inode_rcu(entry);
> if (inode && fuse_is_bad(inode))
> goto invalid;
> @@ -415,16 +420,20 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
> static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
> unsigned int flags)
> {
> - int err;
> struct fuse_entry_out outarg;
> + struct fuse_conn *fc;
> struct inode *inode;
> struct dentry *newent;
> + int err, epoch;
> bool outarg_valid = true;
> bool locked;
>
> if (fuse_is_bad(dir))
> return ERR_PTR(-EIO);
>
> + fc = get_fuse_conn_super(dir->i_sb);
> + epoch = atomic_read(&fc->epoch);
> +
> locked = fuse_lock_inode(dir);
> err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
> &outarg, &inode);
> @@ -446,6 +455,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
> goto out_err;
>
> entry = newent ? newent : entry;
> + entry->d_time = epoch;
> if (outarg_valid)
> fuse_change_entry_timeout(entry, &outarg);
> else
> @@ -619,7 +629,6 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
> struct dentry *entry, struct file *file,
> unsigned int flags, umode_t mode, u32 opcode)
> {
> - int err;
> struct inode *inode;
> struct fuse_mount *fm = get_fuse_mount(dir);
> FUSE_ARGS(args);
> @@ -629,11 +638,13 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
> struct fuse_entry_out outentry;
> struct fuse_inode *fi;
> struct fuse_file *ff;
> + int epoch, err;
> bool trunc = flags & O_TRUNC;
>
> /* Userspace expects S_IFREG in create mode */
> BUG_ON((mode & S_IFMT) != S_IFREG);
>
> + epoch = atomic_read(&fm->fc->epoch);
> forget = fuse_alloc_forget();
> err = -ENOMEM;
> if (!forget)
> @@ -702,6 +713,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
> }
> kfree(forget);
> d_instantiate(entry, inode);
> + entry->d_time = epoch;
> fuse_change_entry_timeout(entry, &outentry);
> fuse_dir_changed(dir);
> err = generic_file_open(inode, file);
> @@ -788,12 +800,14 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
> struct fuse_entry_out outarg;
> struct inode *inode;
> struct dentry *d;
> - int err;
> struct fuse_forget_link *forget;
> + int epoch, err;
>
> if (fuse_is_bad(dir))
> return -EIO;
>
> + epoch = atomic_read(&fm->fc->epoch);
> +
> forget = fuse_alloc_forget();
> if (!forget)
> return -ENOMEM;
> @@ -836,9 +850,11 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
> return PTR_ERR(d);
>
> if (d) {
> + d->d_time = epoch;
> fuse_change_entry_timeout(d, &outarg);
> dput(d);
> } else {
> + entry->d_time = epoch;
> fuse_change_entry_timeout(entry, &outarg);
> }
> fuse_dir_changed(dir);
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index fee96fe7887b..06eecc125f89 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -611,6 +611,9 @@ struct fuse_conn {
> /** Number of fuse_dev's */
> atomic_t dev_count;
>
> + /** Current epoch for up-to-date dentries */
> + atomic_t epoch;
> +
> struct rcu_head rcu;
>
> /** The user id for this mount */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index e9db2cb8c150..5d2d29fad658 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -959,6 +959,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
> init_rwsem(&fc->killsb);
> refcount_set(&fc->count, 1);
> atomic_set(&fc->dev_count, 1);
> + atomic_set(&fc->epoch, 1);
> init_waitqueue_head(&fc->blocked_waitq);
> fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
> INIT_LIST_HEAD(&fc->bg_queue);
> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> index 17ce9636a2b1..46b7146f2c0d 100644
> --- a/fs/fuse/readdir.c
> +++ b/fs/fuse/readdir.c
> @@ -161,6 +161,7 @@ static int fuse_direntplus_link(struct file *file,
> struct fuse_conn *fc;
> struct inode *inode;
> DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
> + int epoch;
>
> if (!o->nodeid) {
> /*
> @@ -190,6 +191,7 @@ static int fuse_direntplus_link(struct file *file,
> return -EIO;
>
> fc = get_fuse_conn(dir);
> + epoch = atomic_read(&fc->epoch);
>
> name.hash = full_name_hash(parent, name.name, name.len);
> dentry = d_lookup(parent, &name);
> @@ -256,6 +258,7 @@ static int fuse_direntplus_link(struct file *file,
> }
> if (fc->readdirplus_auto)
> set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
> + dentry->d_time = epoch;
> fuse_change_entry_timeout(dentry, o);
>
> dput(dentry);
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 5e0eb41d967e..15991728a894 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -229,6 +229,9 @@
> * - FUSE_URING_IN_OUT_HEADER_SZ
> * - FUSE_URING_OP_IN_OUT_SZ
> * - enum fuse_uring_cmd
> + *
> + * 7.43
> + * - add FUSE_NOTIFY_INC_EPOCH
> */
>
> #ifndef _LINUX_FUSE_H
> @@ -264,7 +267,7 @@
> #define FUSE_KERNEL_VERSION 7
>
> /** Minor version number of this interface */
> -#define FUSE_KERNEL_MINOR_VERSION 42
> +#define FUSE_KERNEL_MINOR_VERSION 43
>
> /** The node ID of the root inode */
> #define FUSE_ROOT_ID 1
> @@ -666,6 +669,7 @@ enum fuse_notify_code {
> FUSE_NOTIFY_RETRIEVE = 5,
> FUSE_NOTIFY_DELETE = 6,
> FUSE_NOTIFY_RESEND = 7,
> + FUSE_NOTIFY_INC_EPOCH = 8,
> FUSE_NOTIFY_CODE_MAX,
> };
>
>
On Fri, 7 Mar 2025 at 16:31, Luis Henriques <luis@igalia.com> wrote: > Any further feedback on this patch, or is it already OK for being merged? The patch looks okay. I have ideas about improving the name, but that can wait. What I think is still needed is an actual use case with performance numbers. > And what about the extra call to shrink_dcache_sb(), do you think that > would that be acceptable? Maybe that could be conditional, by for example > setting a flag. My wish would be a more generic "garbage collection" mechanism that would collect stale cache entries and get rid of them in the background. Doing that synchronously doesn't really make sense, IMO. But that can be done independently of this patch, obviously. Thanks, Miklos
Hi Miklos, [ adding Laura to CC, something I should have done before ] On Mon, Mar 10 2025, Miklos Szeredi wrote: > On Fri, 7 Mar 2025 at 16:31, Luis Henriques <luis@igalia.com> wrote: > >> Any further feedback on this patch, or is it already OK for being merged? > > The patch looks okay. I have ideas about improving the name, but that can wait. > > What I think is still needed is an actual use case with performance numbers. As requested, I've run some tests on CVMFS using this kernel patch[1]. For reference, I'm also sharing the changes I've done to libfuse[2] and CVMFS[3] in order to use this new FUSE operation. The changes to these two repositories are in a branch named 'wip-notify-inc-epoch'. As for the details, basically what I've done was to hack the CVMFS loop in FuseInvalidator::MainInvalidator() so that it would do a single call to the libfuse operation fuse_lowlevel_notify_increment_epoch() instead of cycling through the inodes list. The CVMFS patch is ugly, it just short-circuiting the loop, but I didn't want to spend any more time with it at this stage. The real patch will be slightly more complex in order to deal with both approaches, in case the NOTIFY_INC_EPOCH isn't available. Anyway, my test environment was a small VM, where I have two scenarios: a small file-system with just a few inodes, and a larger one with around 8000 inodes. The test approach was to simply mount the filesystem, load the caches with 'find /mnt' and force a flush using the cvmfs_swissknife tool, with the 'ingest' command. [ Disclosure: my test environment actually uses a fork of upstream cvmfs, but for the purposes of these tests that shouldn't really make any difference. ] The numbers in the table below represent the average time (tests were run 100 times) it takes to run the MainInvalidator() function. As expected, using the NOTIFY_INC_EPOCH is much faster, as it's a single operation, a single call into FUSE. Using the NOTIFY_INVAL_* is much more expensive -- it requires calling into the kernel several times, depending on the number of inodes on the list. |------------------+------------------+----------------| | | small filesystem | "big" fs | | | (~20 inodes) | (~8000 inodes) | |------------------+------------------+----------------| | NOTIFY_INVAL_* | 330 us | 4300 us | | NOTIFY_INC_EPOCH | 40 us | 45 us | |------------------+------------------+----------------| Hopefully these results help answering Miklos questions regarding the cvmfs use-case. [1] https://lore.kernel.org/all/20250226091451.11899-1-luis@igalia.com/ [2] https://github.com/luis-henrix/libfuse [3] https://github.com/luis-henrix/cvmfs Cheers, -- Luís
Hi Miklos,
On Mon, Mar 10 2025, Miklos Szeredi wrote:
> On Fri, 7 Mar 2025 at 16:31, Luis Henriques <luis@igalia.com> wrote:
>
>> Any further feedback on this patch, or is it already OK for being merged?
>
> The patch looks okay. I have ideas about improving the name, but that can wait.
Naming suggestions are always welcome!
> What I think is still needed is an actual use case with performance numbers.
Well, the use-case I had in mind is, as I mentioned before, CVMFS. I
think this file system could benefit from using this mechanism.
However, I don't think that measuring the direct benefits is something
easily done. At the moment, it uses a thread that tries to drain the
cache using the FUSE_NOTIFY_INVAL_{INODE,ENTRY} operations. These are,
obviously, operations that are much more expensive than the proposed
FUSE_NOTIFY_INC_EPOCH. But, on the other hand, they have *immediate*
effect while the new operation does not: without the call to
shrink_dcache_sb() it's effect can only be observed in the long run.
I can try to come up with some artificial test case for this, but
comparing these operations will always need to be done indirectly. And I
wonder how useful that would be.
>> And what about the extra call to shrink_dcache_sb(), do you think that
>> would that be acceptable? Maybe that could be conditional, by for example
>> setting a flag.
>
> My wish would be a more generic "garbage collection" mechanism that
> would collect stale cache entries and get rid of them in the
> background. Doing that synchronously doesn't really make sense, IMO.
So, you're proposing something like having a workqueue that would walk
through the entries. And this workqueue would be triggered when the epoch
is increased.
> But that can be done independently of this patch, obviously.
OK, cool! I'm adding this to my TODO list, I can have a look into it once
we're done this patch.
Cheers,
--
Luís
On Tue, 11 Mar 2025 at 12:08, Luis Henriques <luis@igalia.com> wrote:
> Well, the use-case I had in mind is, as I mentioned before, CVMFS. I
> think this file system could benefit from using this mechanism.
We need more than just a hunch that this will work. Having code out
there that actually uses the new feature is a hard requirement.
It does not need to be actually committed to the cvmfs repo, but some
indication that the code will be accepted by the maintainers once the
kernel part is upstream is needed.
> However, I don't think that measuring the direct benefits is something
> easily done. At the moment, it uses a thread that tries to drain the
> cache using the FUSE_NOTIFY_INVAL_{INODE,ENTRY} operations. These are,
> obviously, operations that are much more expensive than the proposed
> FUSE_NOTIFY_INC_EPOCH. But, on the other hand, they have *immediate*
> effect while the new operation does not: without the call to
> shrink_dcache_sb() it's effect can only be observed in the long run.
How so? Isn't the advantage of FUSE_NOTIFY_INC_EPOCH that it spares
the server of having to send out FUSE_NOTIFY_INVAL_ENTRY for *all* of
the currently looked up dentries?
> I can try to come up with some artificial test case for this, but
> comparing these operations will always need to be done indirectly. And I
> wonder how useful that would be.
Any test is better than no test.
> So, you're proposing something like having a workqueue that would walk
> through the entries. And this workqueue would be triggered when the epoch
> is increased.
Not just. Also should periodically clean up expired dentries.
Thanks,
Miklos
On Thu, Mar 13 2025, Miklos Szeredi wrote:
> On Tue, 11 Mar 2025 at 12:08, Luis Henriques <luis@igalia.com> wrote:
>
>> Well, the use-case I had in mind is, as I mentioned before, CVMFS. I
>> think this file system could benefit from using this mechanism.
>
> We need more than just a hunch that this will work. Having code out
> there that actually uses the new feature is a hard requirement.
>
> It does not need to be actually committed to the cvmfs repo, but some
> indication that the code will be accepted by the maintainers once the
> kernel part is upstream is needed.
OK, makes sense. I do have a local cvmfs patch to use this new
notification. For now it's just a hack to replace the current code. It
has to be cleaned-up so that it uses FUSE_NOTIFY_INC_EPOCH only when it's
available in libfuse. My plan was to do this only after the kernel patch
was merged, but I can try to share an earlier version of it.
>> However, I don't think that measuring the direct benefits is something
>> easily done. At the moment, it uses a thread that tries to drain the
>> cache using the FUSE_NOTIFY_INVAL_{INODE,ENTRY} operations. These are,
>> obviously, operations that are much more expensive than the proposed
>> FUSE_NOTIFY_INC_EPOCH. But, on the other hand, they have *immediate*
>> effect while the new operation does not: without the call to
>> shrink_dcache_sb() it's effect can only be observed in the long run.
>
> How so? Isn't the advantage of FUSE_NOTIFY_INC_EPOCH that it spares
> the server of having to send out FUSE_NOTIFY_INVAL_ENTRY for *all* of
> the currently looked up dentries?
Well, I guess I misunderstood you. I can use my hacked cvmfs to measure
the improvement of removing this loop and replace it with a single
FUSE_NOTIFY_INC_EPOCH. Obviously, the performance improvements will
depend on how many dentries were cached.
>> I can try to come up with some artificial test case for this, but
>> comparing these operations will always need to be done indirectly. And I
>> wonder how useful that would be.
>
> Any test is better than no test.
>
>> So, you're proposing something like having a workqueue that would walk
>> through the entries. And this workqueue would be triggered when the epoch
>> is increased.
>
> Not just. Also should periodically clean up expired dentries.
Hmmm... And would you like this to be done in fuse? Or do you expect this
to me a more generic mechanism in dcache, available for other filesystems
as well?
Cheers,
--
Luís
On Thu, 13 Mar 2025 at 12:25, Luis Henriques <luis@igalia.com> wrote: > Hmmm... And would you like this to be done in fuse? Or do you expect this > to me a more generic mechanism in dcache, available for other filesystems > as well? I don't know if this would be useful for anything else, so initially fuse specific. An list sorted by expiry time (implemented with an rbtree) would work, I think. Thanks, Miklos
On Thu, Mar 13 2025, Miklos Szeredi wrote: > On Thu, 13 Mar 2025 at 12:25, Luis Henriques <luis@igalia.com> wrote: > >> Hmmm... And would you like this to be done in fuse? Or do you expect this >> to me a more generic mechanism in dcache, available for other filesystems >> as well? > > I don't know if this would be useful for anything else, so initially > fuse specific. > > An list sorted by expiry time (implemented with an rbtree) would work, > I think. Great, thanks. As I said, I've added this to my TODO; once we have this patch sorted out, I'll move into that. Thanks, Miklos. Cheers, -- Luís
On 3/10/25 17:42, Miklos Szeredi wrote: > On Fri, 7 Mar 2025 at 16:31, Luis Henriques <luis@igalia.com> wrote: > >> Any further feedback on this patch, or is it already OK for being merged? > > The patch looks okay. I have ideas about improving the name, but that can wait. > > What I think is still needed is an actual use case with performance numbers. > >> And what about the extra call to shrink_dcache_sb(), do you think that >> would that be acceptable? Maybe that could be conditional, by for example >> setting a flag. > > My wish would be a more generic "garbage collection" mechanism that > would collect stale cache entries and get rid of them in the > background. Doing that synchronously doesn't really make sense, IMO. > > But that can be done independently of this patch, obviously. Can't that be done in fuse-server? Maybe we should improve notifications to allow a batch of invalidations? I'm a bit thinking about https://github.com/libfuse/libfuse/issues/1131 I.e. userspace got out of FDs and my guess is it happens because of dentry/inode cache in the kernel. Here userspace could basically need to create its own LRU and then send invalidations. It also could be done in kernel, but kernel does not know amount of max open userspace FDs. We could add it into init-reply, but wouldn't be better to keep what we can in userspace? Thanks, Bernd
On Mon, 10 Mar 2025 at 21:11, Bernd Schubert <bernd@bsbernd.com> wrote: > Can't that be done in fuse-server? Maybe we should improve > notifications to allow a batch of invalidations? > > I'm a bit thinking about > https://github.com/libfuse/libfuse/issues/1131 > > I.e. userspace got out of FDs and my guess is it happens > because of dentry/inode cache in the kernel. Here userspace > could basically need to create its own LRU and then send > invalidations. It also could be done in kernel, > but kernel does not know amount of max open userspace FDs. > We could add it into init-reply, but wouldn't be better > to keep what we can in userspace? Two different things: 1) trimming the cache: this is what you are taking about above. I don't think it's possible to move the LRU to userspace since it doesn't see cache accesses and also does not have information about some references (e.g. cwd). This can be solved by adding a notification (FUSE_NOTIFY_TRIM) that tell the kernel to evict N *unused* dentries (valid or invalid). 2) cleaning up of invalid dentries. Dentries can become invalid by explicit invalidation or by expiring the timeout. The latter is a bit of a challenge to clean up, as we don't want to start a timer for each dentry. This is what I was suggesting instead of an explicit shrink_dcache_sb(). Thanks, Miklos
© 2016 - 2025 Red Hat, Inc.