Upon completion of a LOOKUP, if we're in famfs-mode we do a GET_FMAP to
retrieve and cache up the file-to-dax map in the kernel. If this
succeeds, read/write/mmap are resolved direct-to-dax with no upcalls.
Signed-off-by: John Groves <john@groves.net>
---
fs/fuse/dir.c | 69 +++++++++++++++++++++++++++++++++++++++
fs/fuse/fuse_i.h | 36 +++++++++++++++++++-
fs/fuse/inode.c | 15 +++++++++
include/uapi/linux/fuse.h | 4 +++
4 files changed, 123 insertions(+), 1 deletion(-)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bc29db0117f4..ae135c55b9f6 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -359,6 +359,56 @@ bool fuse_invalid_attr(struct fuse_attr *attr)
return !fuse_valid_type(attr->mode) || !fuse_valid_size(attr->size);
}
+#define FMAP_BUFSIZE 4096
+
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+static void
+fuse_get_fmap_init(
+ struct fuse_conn *fc,
+ struct fuse_args *args,
+ u64 nodeid,
+ void *outbuf,
+ size_t outbuf_size)
+{
+ memset(outbuf, 0, outbuf_size);
+ args->opcode = FUSE_GET_FMAP;
+ args->nodeid = nodeid;
+
+ args->in_numargs = 0;
+
+ args->out_numargs = 1;
+ args->out_args[0].size = FMAP_BUFSIZE;
+ args->out_args[0].value = outbuf;
+}
+
+static int
+fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid)
+{
+ size_t fmap_size;
+ void *fmap_buf;
+ int err;
+
+ pr_notice("%s: nodeid=%lld, inode=%llx\n", __func__,
+ nodeid, (u64)inode);
+ fmap_buf = kcalloc(1, FMAP_BUFSIZE, GFP_KERNEL);
+ FUSE_ARGS(args);
+ fuse_get_fmap_init(fm->fc, &args, nodeid, fmap_buf, FMAP_BUFSIZE);
+
+ /* Send GET_FMAP command */
+ err = fuse_simple_request(fm, &args);
+ if (err) {
+ pr_err("%s: err=%d from fuse_simple_request()\n",
+ __func__, err);
+ return err;
+ }
+
+ fmap_size = args.out_args[0].size;
+ pr_notice("%s: nodei=%lld fmap_size=%ld\n", __func__, nodeid, fmap_size);
+
+ return 0;
+}
+#endif
+
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
@@ -404,6 +454,25 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
goto out;
}
+
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ if (fm->fc->famfs_iomap) {
+ if (S_ISREG((*inode)->i_mode)) {
+ /* Note Lookup returns the looked-up inode in the attr
+ * struct, but not in outarg->nodeid !
+ */
+ pr_notice("%s: outarg: size=%d nodeid=%lld attr.ino=%lld\n",
+ __func__, args.out_args[0].size, outarg->nodeid,
+ outarg->attr.ino);
+ /* Get the famfs fmap */
+ fuse_get_fmap(fm, *inode, outarg->attr.ino);
+ } else
+ pr_notice("%s: no get_fmap for non-regular file\n",
+ __func__);
+ } else
+ pr_notice("%s: fc->dax_iomap is not set\n", __func__);
+#endif
+
err = 0;
out_put_forget:
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 931613102d32..437177c2f092 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -193,6 +193,10 @@ struct fuse_inode {
/** Reference to backing file in passthrough mode */
struct fuse_backing *fb;
#endif
+
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ void *famfs_meta;
+#endif
};
/** FUSE inode state bits */
@@ -942,6 +946,8 @@ struct fuse_conn {
#endif
#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ struct rw_semaphore famfs_devlist_sem;
+ struct famfs_dax_devlist *dax_devlist;
char *shadow;
#endif
};
@@ -1432,11 +1438,14 @@ void fuse_free_conn(struct fuse_conn *fc);
/* dax.c */
+static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */
+
/* This macro is used by virtio_fs, but now it also needs to filter for
* "not famfs"
*/
#define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \
- && IS_DAX(&fuse_inode->inode))
+ && IS_DAX(&fuse_inode->inode) \
+ && !fuse_file_famfs(fuse_inode))
ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
@@ -1547,4 +1556,29 @@ extern void fuse_sysctl_unregister(void);
#define fuse_sysctl_unregister() do { } while (0)
#endif /* CONFIG_SYSCTL */
+/* famfs.c */
+static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
+ void *meta)
+{
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ return xchg(&fi->famfs_meta, meta);
+#else
+ return NULL;
+#endif
+}
+
+static inline void famfs_meta_free(struct fuse_inode *fi)
+{
+ /* Stub wil be connected in a subsequent commit */
+}
+
+static inline int fuse_file_famfs(struct fuse_inode *fi)
+{
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ return (fi->famfs_meta != NULL);
+#else
+ return 0;
+#endif
+}
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7f4b73e739cb..848c8818e6f7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_inode_backing_set(fi, NULL);
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+ famfs_meta_set(fi, NULL);
+
return &fi->inode;
out_free_forget:
@@ -138,6 +141,13 @@ static void fuse_free_inode(struct inode *inode)
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_put(fuse_inode_backing(fi));
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ if (S_ISREG(inode->i_mode) && fi->famfs_meta) {
+ famfs_meta_free(fi);
+ famfs_meta_set(fi, NULL);
+ }
+#endif
+
kmem_cache_free(fuse_inode_cachep, fi);
}
@@ -1002,6 +1012,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_files_init(fc);
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) {
+ pr_notice("%s: Kernel is FUSE_FAMFS_DAX capable\n", __func__);
+ init_rwsem(&fc->famfs_devlist_sem);
+ }
+
INIT_LIST_HEAD(&fc->mounts);
list_add(&fm->fc_entry, &fc->mounts);
fm->fc = fc;
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index f9e14180367a..d85fb692cf3b 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -652,6 +652,10 @@ enum fuse_opcode {
FUSE_TMPFILE = 51,
FUSE_STATX = 52,
+ /* Famfs / devdax opcodes */
+ FUSE_GET_FMAP = 53,
+ FUSE_GET_DAXDEV = 54,
+
/* CUSE specific operations */
CUSE_INIT = 4096,
--
2.49.0
On Sun, Apr 20, 2025 at 6:34 PM John Groves <John@groves.net> wrote:
>
> Upon completion of a LOOKUP, if we're in famfs-mode we do a GET_FMAP to
> retrieve and cache up the file-to-dax map in the kernel. If this
> succeeds, read/write/mmap are resolved direct-to-dax with no upcalls.
>
> Signed-off-by: John Groves <john@groves.net>
> ---
> fs/fuse/dir.c | 69 +++++++++++++++++++++++++++++++++++++++
> fs/fuse/fuse_i.h | 36 +++++++++++++++++++-
> fs/fuse/inode.c | 15 +++++++++
> include/uapi/linux/fuse.h | 4 +++
> 4 files changed, 123 insertions(+), 1 deletion(-)
>
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index bc29db0117f4..ae135c55b9f6 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -359,6 +359,56 @@ bool fuse_invalid_attr(struct fuse_attr *attr)
> return !fuse_valid_type(attr->mode) || !fuse_valid_size(attr->size);
> }
>
> +#define FMAP_BUFSIZE 4096
> +
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> +static void
> +fuse_get_fmap_init(
> + struct fuse_conn *fc,
> + struct fuse_args *args,
> + u64 nodeid,
> + void *outbuf,
> + size_t outbuf_size)
> +{
> + memset(outbuf, 0, outbuf_size);
I think we can skip the memset here since kcalloc will zero out the
memory automatically when the fmap_buf gets allocated
> + args->opcode = FUSE_GET_FMAP;
> + args->nodeid = nodeid;
> +
> + args->in_numargs = 0;
> +
> + args->out_numargs = 1;
> + args->out_args[0].size = FMAP_BUFSIZE;
> + args->out_args[0].value = outbuf;
> +}
> +
> +static int
> +fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid)
> +{
> + size_t fmap_size;
> + void *fmap_buf;
> + int err;
> +
> + pr_notice("%s: nodeid=%lld, inode=%llx\n", __func__,
> + nodeid, (u64)inode);
> + fmap_buf = kcalloc(1, FMAP_BUFSIZE, GFP_KERNEL);
> + FUSE_ARGS(args);
> + fuse_get_fmap_init(fm->fc, &args, nodeid, fmap_buf, FMAP_BUFSIZE);
> +
> + /* Send GET_FMAP command */
> + err = fuse_simple_request(fm, &args);
I'm assuming the fmap_buf gets freed in a later patch, but for this
one we'll probably need a kfree(fmap_buf) here in the meantime?
> + if (err) {
> + pr_err("%s: err=%d from fuse_simple_request()\n",
> + __func__, err);
> + return err;
> + }
> +
> + fmap_size = args.out_args[0].size;
> + pr_notice("%s: nodei=%lld fmap_size=%ld\n", __func__, nodeid, fmap_size);
> +
> + return 0;
> +}
> +#endif
> +
> int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
> struct fuse_entry_out *outarg, struct inode **inode)
> {
> @@ -404,6 +454,25 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
> fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
> goto out;
> }
> +
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + if (fm->fc->famfs_iomap) {
> + if (S_ISREG((*inode)->i_mode)) {
> + /* Note Lookup returns the looked-up inode in the attr
> + * struct, but not in outarg->nodeid !
> + */
> + pr_notice("%s: outarg: size=%d nodeid=%lld attr.ino=%lld\n",
> + __func__, args.out_args[0].size, outarg->nodeid,
> + outarg->attr.ino);
> + /* Get the famfs fmap */
> + fuse_get_fmap(fm, *inode, outarg->attr.ino);
I agree with Darrick's comment about fetching the mappings only if the
file gets opened. I wonder though if we could bundle the open with the
get_fmap so that we don't have to do an additional request / incur 2
extra context switches. This seems feasible to me. When we send the
open request, we could check if fc->famfs_iomap is set and if so, set
inarg.open_flags to include FUSE_OPEN_GET_FMAP and set outarg.value to
an allocated buffer that holds both struct fuse_open_out and the
fmap_buf and adjust outarg.size accordingly. Then the server could
send both the open and corresponding fmap data in the reply.
> + } else
> + pr_notice("%s: no get_fmap for non-regular file\n",
> + __func__);
> + } else
> + pr_notice("%s: fc->dax_iomap is not set\n", __func__);
> +#endif
> +
> err = 0;
>
> out_put_forget:
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 931613102d32..437177c2f092 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -193,6 +193,10 @@ struct fuse_inode {
> /** Reference to backing file in passthrough mode */
> struct fuse_backing *fb;
> #endif
> +
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + void *famfs_meta;
> +#endif
> };
>
> /** FUSE inode state bits */
> @@ -942,6 +946,8 @@ struct fuse_conn {
> #endif
>
> #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + struct rw_semaphore famfs_devlist_sem;
> + struct famfs_dax_devlist *dax_devlist;
> char *shadow;
> #endif
> };
> @@ -1432,11 +1438,14 @@ void fuse_free_conn(struct fuse_conn *fc);
>
> /* dax.c */
>
> +static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */
> +
> /* This macro is used by virtio_fs, but now it also needs to filter for
> * "not famfs"
> */
> #define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \
> - && IS_DAX(&fuse_inode->inode))
> + && IS_DAX(&fuse_inode->inode) \
> + && !fuse_file_famfs(fuse_inode))
>
> ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
> ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
> @@ -1547,4 +1556,29 @@ extern void fuse_sysctl_unregister(void);
> #define fuse_sysctl_unregister() do { } while (0)
> #endif /* CONFIG_SYSCTL */
>
> +/* famfs.c */
> +static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
> + void *meta)
> +{
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + return xchg(&fi->famfs_meta, meta);
> +#else
> + return NULL;
> +#endif
> +}
> +
> +static inline void famfs_meta_free(struct fuse_inode *fi)
> +{
> + /* Stub wil be connected in a subsequent commit */
> +}
> +
> +static inline int fuse_file_famfs(struct fuse_inode *fi)
> +{
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + return (fi->famfs_meta != NULL);
Does this need to be "return READ_ONCE(fi->famfs_meta) != NULL"?
> +#else
> + return 0;
> +#endif
> +}
> +
> #endif /* _FS_FUSE_I_H */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 7f4b73e739cb..848c8818e6f7 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
> if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> fuse_inode_backing_set(fi, NULL);
>
> + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
> + famfs_meta_set(fi, NULL);
"fi->famfs_meta = NULL;" looks simpler here
> +
> return &fi->inode;
>
> out_free_forget:
> @@ -138,6 +141,13 @@ static void fuse_free_inode(struct inode *inode)
> if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> fuse_backing_put(fuse_inode_backing(fi));
>
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> + if (S_ISREG(inode->i_mode) && fi->famfs_meta) {
> + famfs_meta_free(fi);
> + famfs_meta_set(fi, NULL);
> + }
> +#endif
> +
> kmem_cache_free(fuse_inode_cachep, fi);
> }
>
> @@ -1002,6 +1012,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
> if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> fuse_backing_files_init(fc);
>
> + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) {
> + pr_notice("%s: Kernel is FUSE_FAMFS_DAX capable\n", __func__);
> + init_rwsem(&fc->famfs_devlist_sem);
> + }
Should we only init this if the server chooses to opt into famfs (eg
if their init reply sets the FUSE_DAX_FMAP flag)? This imo seems to
belong more in process_init_reply().
Thanks,
Joanne
> +
> INIT_LIST_HEAD(&fc->mounts);
> list_add(&fm->fc_entry, &fc->mounts);
> fm->fc = fc;
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index f9e14180367a..d85fb692cf3b 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -652,6 +652,10 @@ enum fuse_opcode {
> FUSE_TMPFILE = 51,
> FUSE_STATX = 52,
>
> + /* Famfs / devdax opcodes */
> + FUSE_GET_FMAP = 53,
> + FUSE_GET_DAXDEV = 54,
> +
> /* CUSE specific operations */
> CUSE_INIT = 4096,
>
> --
> 2.49.0
>
On 25/05/01 10:48PM, Joanne Koong wrote:
> On Sun, Apr 20, 2025 at 6:34 PM John Groves <John@groves.net> wrote:
> >
> > Upon completion of a LOOKUP, if we're in famfs-mode we do a GET_FMAP to
> > retrieve and cache up the file-to-dax map in the kernel. If this
> > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls.
> >
> > Signed-off-by: John Groves <john@groves.net>
> > ---
> > fs/fuse/dir.c | 69 +++++++++++++++++++++++++++++++++++++++
> > fs/fuse/fuse_i.h | 36 +++++++++++++++++++-
> > fs/fuse/inode.c | 15 +++++++++
> > include/uapi/linux/fuse.h | 4 +++
> > 4 files changed, 123 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> > index bc29db0117f4..ae135c55b9f6 100644
> > --- a/fs/fuse/dir.c
> > +++ b/fs/fuse/dir.c
> > @@ -359,6 +359,56 @@ bool fuse_invalid_attr(struct fuse_attr *attr)
> > return !fuse_valid_type(attr->mode) || !fuse_valid_size(attr->size);
> > }
> >
> > +#define FMAP_BUFSIZE 4096
> > +
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > +static void
> > +fuse_get_fmap_init(
> > + struct fuse_conn *fc,
> > + struct fuse_args *args,
> > + u64 nodeid,
> > + void *outbuf,
> > + size_t outbuf_size)
> > +{
> > + memset(outbuf, 0, outbuf_size);
>
> I think we can skip the memset here since kcalloc will zero out the
> memory automatically when the fmap_buf gets allocated
Good catch, thanks. Queued to -next.
>
> > + args->opcode = FUSE_GET_FMAP;
> > + args->nodeid = nodeid;
> > +
> > + args->in_numargs = 0;
> > +
> > + args->out_numargs = 1;
> > + args->out_args[0].size = FMAP_BUFSIZE;
> > + args->out_args[0].value = outbuf;
> > +}
> > +
> > +static int
> > +fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid)
> > +{
> > + size_t fmap_size;
> > + void *fmap_buf;
> > + int err;
> > +
> > + pr_notice("%s: nodeid=%lld, inode=%llx\n", __func__,
> > + nodeid, (u64)inode);
> > + fmap_buf = kcalloc(1, FMAP_BUFSIZE, GFP_KERNEL);
> > + FUSE_ARGS(args);
> > + fuse_get_fmap_init(fm->fc, &args, nodeid, fmap_buf, FMAP_BUFSIZE);
> > +
> > + /* Send GET_FMAP command */
> > + err = fuse_simple_request(fm, &args);
>
> I'm assuming the fmap_buf gets freed in a later patch, but for this
> one we'll probably need a kfree(fmap_buf) here in the meantime?
Nice of you to give me the benefit of the doubt there ;)
At this commit, nothing is done with fmap_buf, and a subsequent
commit adds a call to famfs_file_init_dax(...fmap_buf...). But
the fmap_buf was leaked.
I'm adding a kfree(fmap_buf) to this commit, which will come after the
call to famfs_file_init_dax() when that's added in a subsequent
commit.
Thanks!
>
> > + if (err) {
> > + pr_err("%s: err=%d from fuse_simple_request()\n",
> > + __func__, err);
> > + return err;
> > + }
> > +
> > + fmap_size = args.out_args[0].size;
> > + pr_notice("%s: nodei=%lld fmap_size=%ld\n", __func__, nodeid, fmap_size);
> > +
> > + return 0;
> > +}
> > +#endif
> > +
> > int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
> > struct fuse_entry_out *outarg, struct inode **inode)
> > {
> > @@ -404,6 +454,25 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
> > fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
> > goto out;
> > }
> > +
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + if (fm->fc->famfs_iomap) {
> > + if (S_ISREG((*inode)->i_mode)) {
> > + /* Note Lookup returns the looked-up inode in the attr
> > + * struct, but not in outarg->nodeid !
> > + */
> > + pr_notice("%s: outarg: size=%d nodeid=%lld attr.ino=%lld\n",
> > + __func__, args.out_args[0].size, outarg->nodeid,
> > + outarg->attr.ino);
> > + /* Get the famfs fmap */
> > + fuse_get_fmap(fm, *inode, outarg->attr.ino);
>
> I agree with Darrick's comment about fetching the mappings only if the
> file gets opened. I wonder though if we could bundle the open with the
> get_fmap so that we don't have to do an additional request / incur 2
> extra context switches. This seems feasible to me. When we send the
> open request, we could check if fc->famfs_iomap is set and if so, set
> inarg.open_flags to include FUSE_OPEN_GET_FMAP and set outarg.value to
> an allocated buffer that holds both struct fuse_open_out and the
> fmap_buf and adjust outarg.size accordingly. Then the server could
> send both the open and corresponding fmap data in the reply.
I agree about moving GET_FMAP to open, but I want to be cautious about
moving it *into* open. Right now fitting an entire fmap into a single
message response looks like a totally acceptable requirement for famfs -
but it might not survive as a permanent requirement, and it seems likely
not to work out for Darrick's use cases - which I think would lead us back
to needing GET_FMAP.
Elswhere in this thread, and also 1:1, Darrick and I have discussed the
possibility of partial retrieval of fmaps (in part due to the possibility
that they might not always fit in a single message). If these responses
can get arbitrarily large, this would become a requirement. GET_FMAP could
specify an offset, and the reply could also specify its starting offset;
I think it has to be in both places because the current "elegantly simple"
fmap format doesn't always split easily at arbitrary offsets.
Also, with famfs I think fmaps can be retained in-kernel past close,
making the retrieval-on-open only needed if the fmap isn't already
present. Famfs doesn't currently allow fmaps to change, although there
are reasons we might relax that later.
This can be revisited down the road.
Unless I run into a blocker, the next rev of the series will call
GET_FMAP on open...
BTW I think moving GET_FMAP to open will remove the reasons why famfs
currently needs to avoid READDIRPLUS.
>
> > + } else
> > + pr_notice("%s: no get_fmap for non-regular file\n",
> > + __func__);
> > + } else
> > + pr_notice("%s: fc->dax_iomap is not set\n", __func__);
> > +#endif
> > +
> > err = 0;
> >
> > out_put_forget:
> > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > index 931613102d32..437177c2f092 100644
> > --- a/fs/fuse/fuse_i.h
> > +++ b/fs/fuse/fuse_i.h
> > @@ -193,6 +193,10 @@ struct fuse_inode {
> > /** Reference to backing file in passthrough mode */
> > struct fuse_backing *fb;
> > #endif
> > +
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + void *famfs_meta;
> > +#endif
> > };
> >
> > /** FUSE inode state bits */
> > @@ -942,6 +946,8 @@ struct fuse_conn {
> > #endif
> >
> > #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + struct rw_semaphore famfs_devlist_sem;
> > + struct famfs_dax_devlist *dax_devlist;
> > char *shadow;
> > #endif
> > };
> > @@ -1432,11 +1438,14 @@ void fuse_free_conn(struct fuse_conn *fc);
> >
> > /* dax.c */
> >
> > +static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */
> > +
> > /* This macro is used by virtio_fs, but now it also needs to filter for
> > * "not famfs"
> > */
> > #define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \
> > - && IS_DAX(&fuse_inode->inode))
> > + && IS_DAX(&fuse_inode->inode) \
> > + && !fuse_file_famfs(fuse_inode))
> >
> > ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
> > ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
> > @@ -1547,4 +1556,29 @@ extern void fuse_sysctl_unregister(void);
> > #define fuse_sysctl_unregister() do { } while (0)
> > #endif /* CONFIG_SYSCTL */
> >
> > +/* famfs.c */
> > +static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
> > + void *meta)
> > +{
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + return xchg(&fi->famfs_meta, meta);
> > +#else
> > + return NULL;
> > +#endif
> > +}
> > +
> > +static inline void famfs_meta_free(struct fuse_inode *fi)
> > +{
> > + /* Stub wil be connected in a subsequent commit */
> > +}
> > +
> > +static inline int fuse_file_famfs(struct fuse_inode *fi)
> > +{
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + return (fi->famfs_meta != NULL);
>
> Does this need to be "return READ_ONCE(fi->famfs_meta) != NULL"?
I'm not sure, but it can't hurt. Queued...
>
> > +#else
> > + return 0;
> > +#endif
> > +}
> > +
> > #endif /* _FS_FUSE_I_H */
> > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > index 7f4b73e739cb..848c8818e6f7 100644
> > --- a/fs/fuse/inode.c
> > +++ b/fs/fuse/inode.c
> > @@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
> > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> > fuse_inode_backing_set(fi, NULL);
> >
> > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
> > + famfs_meta_set(fi, NULL);
>
> "fi->famfs_meta = NULL;" looks simpler here
I toootally agree here, but I was following the passthrough pattern
just above. @miklos or @Amir, got a preference here?
Furthermore, initially I didn't init fi->famfs_meta at all because I
*assumed* fi (the fuse_inode) would be zeroed upon allocation - but it's
currently not. @miklos, would you object to zeroing fuse_inodes on
allocation? Clearly it's working without that, but it seems like a
"normal" thing to do, that might someday pre-empt a problem.
>
> > +
> > return &fi->inode;
> >
> > out_free_forget:
> > @@ -138,6 +141,13 @@ static void fuse_free_inode(struct inode *inode)
> > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> > fuse_backing_put(fuse_inode_backing(fi));
> >
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + if (S_ISREG(inode->i_mode) && fi->famfs_meta) {
> > + famfs_meta_free(fi);
> > + famfs_meta_set(fi, NULL);
> > + }
> > +#endif
> > +
> > kmem_cache_free(fuse_inode_cachep, fi);
> > }
> >
> > @@ -1002,6 +1012,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
> > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> > fuse_backing_files_init(fc);
> >
> > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) {
> > + pr_notice("%s: Kernel is FUSE_FAMFS_DAX capable\n", __func__);
> > + init_rwsem(&fc->famfs_devlist_sem);
> > + }
>
> Should we only init this if the server chooses to opt into famfs (eg
> if their init reply sets the FUSE_DAX_FMAP flag)? This imo seems to
> belong more in process_init_reply().
Another good catch. Queued - thanks!
>
>
> Thanks,
> Joanne
Thank you!
On Mon, May 12, 2025 at 6:28 PM John Groves <John@groves.net> wrote: > > On 25/05/01 10:48PM, Joanne Koong wrote: > > On Sun, Apr 20, 2025 at 6:34 PM John Groves <John@groves.net> wrote: > > > > > > Upon completion of a LOOKUP, if we're in famfs-mode we do a GET_FMAP to > > > retrieve and cache up the file-to-dax map in the kernel. If this > > > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > > > > > Signed-off-by: John Groves <john@groves.net> > > > --- ... > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > > > index 7f4b73e739cb..848c8818e6f7 100644 > > > --- a/fs/fuse/inode.c > > > +++ b/fs/fuse/inode.c > > > @@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) > > > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > > > fuse_inode_backing_set(fi, NULL); > > > > > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > > > + famfs_meta_set(fi, NULL); > > > > "fi->famfs_meta = NULL;" looks simpler here > > I toootally agree here, but I was following the passthrough pattern > just above. @miklos or @Amir, got a preference here? > It's not about preference, this fails build. Even though compiler (or pre-processor whatever) should be able to skip "fi->famfs_meta = NULL;" if CONFIG_FUSE_FAMFS_DAX is not defined IIRC it does not. Feel free to try. Maybe I do not remember correctly. Thanks, Amir.
On 25/05/22 05:45PM, Amir Goldstein wrote: > On Mon, May 12, 2025 at 6:28 PM John Groves <John@groves.net> wrote: > > > > On 25/05/01 10:48PM, Joanne Koong wrote: > > > On Sun, Apr 20, 2025 at 6:34 PM John Groves <John@groves.net> wrote: > > > > > > > > Upon completion of a LOOKUP, if we're in famfs-mode we do a GET_FMAP to > > > > retrieve and cache up the file-to-dax map in the kernel. If this > > > > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > > > > > > > Signed-off-by: John Groves <john@groves.net> > > > > --- > ... > > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > > > > index 7f4b73e739cb..848c8818e6f7 100644 > > > > --- a/fs/fuse/inode.c > > > > +++ b/fs/fuse/inode.c > > > > @@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) > > > > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > > > > fuse_inode_backing_set(fi, NULL); > > > > > > > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > > > > + famfs_meta_set(fi, NULL); > > > > > > "fi->famfs_meta = NULL;" looks simpler here > > > > I toootally agree here, but I was following the passthrough pattern > > just above. @miklos or @Amir, got a preference here? > > > > It's not about preference, this fails build. > Even though compiler (or pre-processor whatever) should be able to skip > "fi->famfs_meta = NULL;" if CONFIG_FUSE_FAMFS_DAX is not defined > IIRC it does not. Feel free to try. Maybe I do not remember correctly. > > Thanks, > Amir. Thanks Amir. Will fiddle with this when cleaning up V2. John
On Thu, May 01, 2025 at 10:48:15PM -0700, Joanne Koong wrote:
> On Sun, Apr 20, 2025 at 6:34 PM John Groves <John@groves.net> wrote:
> >
> > Upon completion of a LOOKUP, if we're in famfs-mode we do a GET_FMAP to
> > retrieve and cache up the file-to-dax map in the kernel. If this
> > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls.
> >
> > Signed-off-by: John Groves <john@groves.net>
> > ---
> > fs/fuse/dir.c | 69 +++++++++++++++++++++++++++++++++++++++
> > fs/fuse/fuse_i.h | 36 +++++++++++++++++++-
> > fs/fuse/inode.c | 15 +++++++++
> > include/uapi/linux/fuse.h | 4 +++
> > 4 files changed, 123 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> > index bc29db0117f4..ae135c55b9f6 100644
> > --- a/fs/fuse/dir.c
> > +++ b/fs/fuse/dir.c
> > @@ -359,6 +359,56 @@ bool fuse_invalid_attr(struct fuse_attr *attr)
> > return !fuse_valid_type(attr->mode) || !fuse_valid_size(attr->size);
> > }
> >
> > +#define FMAP_BUFSIZE 4096
> > +
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > +static void
> > +fuse_get_fmap_init(
> > + struct fuse_conn *fc,
> > + struct fuse_args *args,
> > + u64 nodeid,
> > + void *outbuf,
> > + size_t outbuf_size)
> > +{
> > + memset(outbuf, 0, outbuf_size);
>
> I think we can skip the memset here since kcalloc will zero out the
> memory automatically when the fmap_buf gets allocated
>
> > + args->opcode = FUSE_GET_FMAP;
> > + args->nodeid = nodeid;
> > +
> > + args->in_numargs = 0;
> > +
> > + args->out_numargs = 1;
> > + args->out_args[0].size = FMAP_BUFSIZE;
> > + args->out_args[0].value = outbuf;
> > +}
> > +
> > +static int
> > +fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid)
> > +{
> > + size_t fmap_size;
> > + void *fmap_buf;
> > + int err;
> > +
> > + pr_notice("%s: nodeid=%lld, inode=%llx\n", __func__,
> > + nodeid, (u64)inode);
> > + fmap_buf = kcalloc(1, FMAP_BUFSIZE, GFP_KERNEL);
> > + FUSE_ARGS(args);
> > + fuse_get_fmap_init(fm->fc, &args, nodeid, fmap_buf, FMAP_BUFSIZE);
> > +
> > + /* Send GET_FMAP command */
> > + err = fuse_simple_request(fm, &args);
>
> I'm assuming the fmap_buf gets freed in a later patch, but for this
> one we'll probably need a kfree(fmap_buf) here in the meantime?
>
> > + if (err) {
> > + pr_err("%s: err=%d from fuse_simple_request()\n",
> > + __func__, err);
> > + return err;
> > + }
> > +
> > + fmap_size = args.out_args[0].size;
> > + pr_notice("%s: nodei=%lld fmap_size=%ld\n", __func__, nodeid, fmap_size);
> > +
> > + return 0;
> > +}
> > +#endif
> > +
> > int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
> > struct fuse_entry_out *outarg, struct inode **inode)
> > {
> > @@ -404,6 +454,25 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
> > fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
> > goto out;
> > }
> > +
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + if (fm->fc->famfs_iomap) {
> > + if (S_ISREG((*inode)->i_mode)) {
> > + /* Note Lookup returns the looked-up inode in the attr
> > + * struct, but not in outarg->nodeid !
> > + */
> > + pr_notice("%s: outarg: size=%d nodeid=%lld attr.ino=%lld\n",
> > + __func__, args.out_args[0].size, outarg->nodeid,
> > + outarg->attr.ino);
> > + /* Get the famfs fmap */
> > + fuse_get_fmap(fm, *inode, outarg->attr.ino);
>
> I agree with Darrick's comment about fetching the mappings only if the
> file gets opened. I wonder though if we could bundle the open with the
> get_fmap so that we don't have to do an additional request / incur 2
> extra context switches.
What's the intended lifetime of these files? If we only have to do this
once per file lifetime then perhaps that amortizes towards zero? That
said, I don't know how aggressively fuse reclaims the inode structures,
so maybe the need to GET_FMAP is more frequent? AFAICT the fuse code
seems to use the regular lru so maybe that's not so bad. But maybe the
kernel shouldn't ask for mappings until someone tries a file IO
operation so that walking the directory tree doesn't bog down in a bunch
of GET_FMAP operations.
It also occurred to me just now that this is a LOOKUP operation, which
makes me wonder what happens for the other things like . But maybe the
order of operations for creat() is that you tell the famfs management
layer to create a file, it preallocates space and the directory tree,
and later you can just resolve the pathname to open it, at which point
fuse+famfs creates the in-kernel abstractions?
> extra context switches. This seems feasible to me. When we send the
> open request, we could check if fc->famfs_iomap is set and if so, set
> inarg.open_flags to include FUSE_OPEN_GET_FMAP and set outarg.value to
> an allocated buffer that holds both struct fuse_open_out and the
> fmap_buf and adjust outarg.size accordingly. Then the server could
> send both the open and corresponding fmap data in the reply.
Alternately, we could create a fuse "notification" that really is just a
means for the famfs open function to send mappings to the kernel. But I
don't know if it makes much difference for the kernel to demand-page in
mapping information as needed vs just using GET_FMAP here.
>
> > + } else
> > + pr_notice("%s: no get_fmap for non-regular file\n",
> > + __func__);
> > + } else
> > + pr_notice("%s: fc->dax_iomap is not set\n", __func__);
> > +#endif
> > +
> > err = 0;
> >
> > out_put_forget:
> > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > index 931613102d32..437177c2f092 100644
> > --- a/fs/fuse/fuse_i.h
> > +++ b/fs/fuse/fuse_i.h
> > @@ -193,6 +193,10 @@ struct fuse_inode {
> > /** Reference to backing file in passthrough mode */
> > struct fuse_backing *fb;
> > #endif
> > +
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + void *famfs_meta;
> > +#endif
> > };
> >
> > /** FUSE inode state bits */
> > @@ -942,6 +946,8 @@ struct fuse_conn {
> > #endif
> >
> > #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + struct rw_semaphore famfs_devlist_sem;
> > + struct famfs_dax_devlist *dax_devlist;
> > char *shadow;
> > #endif
> > };
> > @@ -1432,11 +1438,14 @@ void fuse_free_conn(struct fuse_conn *fc);
> >
> > /* dax.c */
> >
> > +static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */
> > +
> > /* This macro is used by virtio_fs, but now it also needs to filter for
> > * "not famfs"
> > */
> > #define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \
> > - && IS_DAX(&fuse_inode->inode))
> > + && IS_DAX(&fuse_inode->inode) \
> > + && !fuse_file_famfs(fuse_inode))
> >
> > ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
> > ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
> > @@ -1547,4 +1556,29 @@ extern void fuse_sysctl_unregister(void);
> > #define fuse_sysctl_unregister() do { } while (0)
> > #endif /* CONFIG_SYSCTL */
> >
> > +/* famfs.c */
> > +static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
> > + void *meta)
> > +{
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + return xchg(&fi->famfs_meta, meta);
> > +#else
> > + return NULL;
> > +#endif
> > +}
> > +
> > +static inline void famfs_meta_free(struct fuse_inode *fi)
> > +{
> > + /* Stub wil be connected in a subsequent commit */
> > +}
> > +
> > +static inline int fuse_file_famfs(struct fuse_inode *fi)
> > +{
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + return (fi->famfs_meta != NULL);
>
> Does this need to be "return READ_ONCE(fi->famfs_meta) != NULL"?
>
> > +#else
> > + return 0;
> > +#endif
> > +}
> > +
> > #endif /* _FS_FUSE_I_H */
> > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > index 7f4b73e739cb..848c8818e6f7 100644
> > --- a/fs/fuse/inode.c
> > +++ b/fs/fuse/inode.c
> > @@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
> > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> > fuse_inode_backing_set(fi, NULL);
> >
> > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
> > + famfs_meta_set(fi, NULL);
>
> "fi->famfs_meta = NULL;" looks simpler here
>
> > +
> > return &fi->inode;
> >
> > out_free_forget:
> > @@ -138,6 +141,13 @@ static void fuse_free_inode(struct inode *inode)
> > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> > fuse_backing_put(fuse_inode_backing(fi));
> >
> > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> > + if (S_ISREG(inode->i_mode) && fi->famfs_meta) {
> > + famfs_meta_free(fi);
> > + famfs_meta_set(fi, NULL);
> > + }
> > +#endif
> > +
> > kmem_cache_free(fuse_inode_cachep, fi);
> > }
> >
> > @@ -1002,6 +1012,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
> > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
> > fuse_backing_files_init(fc);
> >
> > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) {
> > + pr_notice("%s: Kernel is FUSE_FAMFS_DAX capable\n", __func__);
> > + init_rwsem(&fc->famfs_devlist_sem);
> > + }
>
> Should we only init this if the server chooses to opt into famfs (eg
> if their init reply sets the FUSE_DAX_FMAP flag)? This imo seems to
> belong more in process_init_reply().
/me has no idea what this means, but has a sneaking suspicion it'll
become important for his science projects. ;)
Though I guess for general purpose files maybe we want to allow opting
into iomapping on a per-inode basis like the existing iomap allows.
--D
>
> Thanks,
> Joanne
> > +
> > INIT_LIST_HEAD(&fc->mounts);
> > list_add(&fm->fc_entry, &fc->mounts);
> > fm->fc = fc;
> > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > index f9e14180367a..d85fb692cf3b 100644
> > --- a/include/uapi/linux/fuse.h
> > +++ b/include/uapi/linux/fuse.h
> > @@ -652,6 +652,10 @@ enum fuse_opcode {
> > FUSE_TMPFILE = 51,
> > FUSE_STATX = 52,
> >
> > + /* Famfs / devdax opcodes */
> > + FUSE_GET_FMAP = 53,
> > + FUSE_GET_DAXDEV = 54,
> > +
> > /* CUSE specific operations */
> > CUSE_INIT = 4096,
> >
> > --
> > 2.49.0
> >
>
© 2016 - 2025 Red Hat, Inc.