Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to
retrieve and cache up the file-to-dax map in the kernel. If this
succeeds, read/write/mmap are resolved direct-to-dax with no upcalls.
GET_FMAP has a variable-size response payload, and the allocated size
is sent in the in_args[0].size field. If the fmap would overflow the
message, the fuse server sends a reply of size 'sizeof(uint32_t)' which
specifies the size of the fmap message. Then the kernel can realloc a
large enough buffer and try again.
Signed-off-by: John Groves <john@groves.net>
---
fs/fuse/file.c | 84 +++++++++++++++++++++++++++++++++++++++
fs/fuse/fuse_i.h | 36 ++++++++++++++++-
fs/fuse/inode.c | 19 +++++++--
fs/fuse/iomode.c | 2 +-
include/uapi/linux/fuse.h | 18 +++++++++
5 files changed, 154 insertions(+), 5 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 93b82660f0c8..8616fb0a6d61 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -230,6 +230,77 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
}
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+
+#define FMAP_BUFSIZE 4096
+
+static int
+fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid)
+{
+ struct fuse_get_fmap_in inarg = { 0 };
+ size_t fmap_bufsize = FMAP_BUFSIZE;
+ ssize_t fmap_size;
+ int retries = 1;
+ void *fmap_buf;
+ int rc;
+
+ FUSE_ARGS(args);
+
+ fmap_buf = kcalloc(1, FMAP_BUFSIZE, GFP_KERNEL);
+ if (!fmap_buf)
+ return -EIO;
+
+ retry_once:
+ inarg.size = fmap_bufsize;
+
+ args.opcode = FUSE_GET_FMAP;
+ args.nodeid = nodeid;
+
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+
+ /* Variable-sized output buffer
+ * this causes fuse_simple_request() to return the size of the
+ * output payload
+ */
+ args.out_argvar = true;
+ args.out_numargs = 1;
+ args.out_args[0].size = fmap_bufsize;
+ args.out_args[0].value = fmap_buf;
+
+ /* Send GET_FMAP command */
+ rc = fuse_simple_request(fm, &args);
+ if (rc < 0) {
+ pr_err("%s: err=%d from fuse_simple_request()\n",
+ __func__, rc);
+ return rc;
+ }
+ fmap_size = rc;
+
+ if (retries && fmap_size == sizeof(uint32_t)) {
+ /* fmap size exceeded fmap_bufsize;
+ * actual fmap size returned in fmap_buf;
+ * realloc and retry once
+ */
+ fmap_bufsize = *((uint32_t *)fmap_buf);
+
+ --retries;
+ kfree(fmap_buf);
+ fmap_buf = kcalloc(1, fmap_bufsize, GFP_KERNEL);
+ if (!fmap_buf)
+ return -EIO;
+
+ goto retry_once;
+ }
+
+ /* Will call famfs_file_init_dax() when that gets added */
+
+ kfree(fmap_buf);
+ return 0;
+}
+#endif
+
static int fuse_open(struct inode *inode, struct file *file)
{
struct fuse_mount *fm = get_fuse_mount(inode);
@@ -263,6 +334,19 @@ static int fuse_open(struct inode *inode, struct file *file)
err = fuse_do_open(fm, get_node_id(inode), file, false);
if (!err) {
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ if (fm->fc->famfs_iomap) {
+ if (S_ISREG(inode->i_mode)) {
+ int rc;
+ /* Get the famfs fmap */
+ rc = fuse_get_fmap(fm, inode,
+ get_node_id(inode));
+ if (rc)
+ pr_err("%s: fuse_get_fmap err=%d\n",
+ __func__, rc);
+ }
+ }
+#endif
ff = file->private_data;
err = fuse_finish_open(inode, file);
if (err)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f4ee61046578..e01d6e5c6e93 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -193,6 +193,10 @@ struct fuse_inode {
/** Reference to backing file in passthrough mode */
struct fuse_backing *fb;
#endif
+
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ void *famfs_meta;
+#endif
};
/** FUSE inode state bits */
@@ -945,6 +949,8 @@ struct fuse_conn {
#endif
#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ struct rw_semaphore famfs_devlist_sem;
+ struct famfs_dax_devlist *dax_devlist;
char *shadow;
#endif
};
@@ -1435,11 +1441,14 @@ void fuse_free_conn(struct fuse_conn *fc);
/* dax.c */
+static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */
+
/* This macro is used by virtio_fs, but now it also needs to filter for
* "not famfs"
*/
#define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \
- && IS_DAX(&fuse_inode->inode))
+ && IS_DAX(&fuse_inode->inode) \
+ && !fuse_file_famfs(fuse_inode))
ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
@@ -1550,4 +1559,29 @@ extern void fuse_sysctl_unregister(void);
#define fuse_sysctl_unregister() do { } while (0)
#endif /* CONFIG_SYSCTL */
+/* famfs.c */
+static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
+ void *meta)
+{
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ return xchg(&fi->famfs_meta, meta);
+#else
+ return NULL;
+#endif
+}
+
+static inline void famfs_meta_free(struct fuse_inode *fi)
+{
+ /* Stub wil be connected in a subsequent commit */
+}
+
+static inline int fuse_file_famfs(struct fuse_inode *fi)
+{
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ return (READ_ONCE(fi->famfs_meta) != NULL);
+#else
+ return 0;
+#endif
+}
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a7e1cf8257b0..b071d16f7d04 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_inode_backing_set(fi, NULL);
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+ famfs_meta_set(fi, NULL);
+
return &fi->inode;
out_free_forget:
@@ -138,6 +141,13 @@ static void fuse_free_inode(struct inode *inode)
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_put(fuse_inode_backing(fi));
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+ if (S_ISREG(inode->i_mode) && fi->famfs_meta) {
+ famfs_meta_free(fi);
+ famfs_meta_set(fi, NULL);
+ }
+#endif
+
kmem_cache_free(fuse_inode_cachep, fi);
}
@@ -1002,6 +1012,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_files_init(fc);
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+ pr_notice("%s: Kernel is FUSE_FAMFS_DAX capable\n", __func__);
+
INIT_LIST_HEAD(&fc->mounts);
list_add(&fm->fc_entry, &fc->mounts);
fm->fc = fc;
@@ -1036,9 +1049,8 @@ void fuse_conn_put(struct fuse_conn *fc)
}
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_files_free(fc);
-#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
- kfree(fc->shadow);
-#endif
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+ kfree(fc->shadow);
call_rcu(&fc->rcu, delayed_release);
}
}
@@ -1425,6 +1437,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
* those capabilities, they are held here).
*/
fc->famfs_iomap = 1;
+ init_rwsem(&fc->famfs_devlist_sem);
}
} else {
ra_pages = fc->max_read / PAGE_SIZE;
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index aec4aecb5d79..443b337b0c05 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -204,7 +204,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode)
* io modes are not relevant with DAX and with server that does not
* implement open.
*/
- if (FUSE_IS_VIRTIO_DAX(fi) || !ff->args)
+ if (FUSE_IS_VIRTIO_DAX(fi) || fuse_file_famfs(fi) || !ff->args)
return 0;
/*
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 6c384640c79b..dff5aa62543e 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -654,6 +654,10 @@ enum fuse_opcode {
FUSE_TMPFILE = 51,
FUSE_STATX = 52,
+ /* Famfs / devdax opcodes */
+ FUSE_GET_FMAP = 53,
+ FUSE_GET_DAXDEV = 54,
+
/* CUSE specific operations */
CUSE_INIT = 4096,
@@ -888,6 +892,16 @@ struct fuse_access_in {
uint32_t padding;
};
+struct fuse_get_fmap_in {
+ uint32_t size;
+ uint32_t padding;
+};
+
+struct fuse_get_fmap_out {
+ uint32_t size;
+ uint32_t padding;
+};
+
struct fuse_init_in {
uint32_t major;
uint32_t minor;
@@ -1284,4 +1298,8 @@ struct fuse_uring_cmd_req {
uint8_t padding[6];
};
+/* Famfs fmap message components */
+
+#define FAMFS_FMAP_MAX 32768 /* Largest supported fmap message */
+
#endif /* _LINUX_FUSE_H */
--
2.49.0
On Thu, 3 Jul 2025 at 20:54, John Groves <John@groves.net> wrote: > > Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to > retrieve and cache up the file-to-dax map in the kernel. If this > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. Nothing to do at this time unless you want a side project: doing this with compound requests would save a roundtrip (OPEN + GET_FMAP in one go). > GET_FMAP has a variable-size response payload, and the allocated size > is sent in the in_args[0].size field. If the fmap would overflow the > message, the fuse server sends a reply of size 'sizeof(uint32_t)' which > specifies the size of the fmap message. Then the kernel can realloc a > large enough buffer and try again. There is a better way to do this: the allocation can happen when we get the response. Just need to add infrastructure to dev.c. > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index 6c384640c79b..dff5aa62543e 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -654,6 +654,10 @@ enum fuse_opcode { > FUSE_TMPFILE = 51, > FUSE_STATX = 52, > > + /* Famfs / devdax opcodes */ > + FUSE_GET_FMAP = 53, > + FUSE_GET_DAXDEV = 54, Introduced too early. > + > /* CUSE specific operations */ > CUSE_INIT = 4096, > > @@ -888,6 +892,16 @@ struct fuse_access_in { > uint32_t padding; > }; > > +struct fuse_get_fmap_in { > + uint32_t size; > + uint32_t padding; > +}; As noted, passing size to server really makes no sense. I'd just omit fuse_get_fmap_in completely. > + > +struct fuse_get_fmap_out { > + uint32_t size; > + uint32_t padding; > +}; > + > struct fuse_init_in { > uint32_t major; > uint32_t minor; > @@ -1284,4 +1298,8 @@ struct fuse_uring_cmd_req { > uint8_t padding[6]; > }; > > +/* Famfs fmap message components */ > + > +#define FAMFS_FMAP_MAX 32768 /* Largest supported fmap message */ > + Hmm, Darrick's interface gets one extents at a time. This one tries to get the whole map in one go. The single extent thing can be inefficient even for plain block fs, so it would be nice to get multiple extents. The whole map has an artificial limit that currently may seem sufficient but down the line could cause pain. I'm still hoping some common ground would benefit both interfaces. Just not sure what it should be. Thanks, Miklos
On 25/08/14 03:36PM, Miklos Szeredi wrote: > On Thu, 3 Jul 2025 at 20:54, John Groves <John@groves.net> wrote: > > > > Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to > > retrieve and cache up the file-to-dax map in the kernel. If this > > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > Nothing to do at this time unless you want a side project: doing this > with compound requests would save a roundtrip (OPEN + GET_FMAP in one > go). I'm thinking that's an opportunity for improvement after the basic mechanism is in ;) > > > GET_FMAP has a variable-size response payload, and the allocated size > > is sent in the in_args[0].size field. If the fmap would overflow the > > message, the fuse server sends a reply of size 'sizeof(uint32_t)' which > > specifies the size of the fmap message. Then the kernel can realloc a > > large enough buffer and try again. > > There is a better way to do this: the allocation can happen when we > get the response. Just need to add infrastructure to dev.c. OK, makes sense. Will take a run at this. Might drop back and go with a hard limit and relax it later. Famfs fmaps won't grow unbounded near term... > > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > > index 6c384640c79b..dff5aa62543e 100644 > > --- a/include/uapi/linux/fuse.h > > +++ b/include/uapi/linux/fuse.h > > @@ -654,6 +654,10 @@ enum fuse_opcode { > > FUSE_TMPFILE = 51, > > FUSE_STATX = 52, > > > > + /* Famfs / devdax opcodes */ > > + FUSE_GET_FMAP = 53, > > + FUSE_GET_DAXDEV = 54, > > Introduced too early. You mean FUSE_GET_DAXDEV I presume (which is not used until 2 patches later? Right, will fix. > > > + > > /* CUSE specific operations */ > > CUSE_INIT = 4096, > > > > @@ -888,6 +892,16 @@ struct fuse_access_in { > > uint32_t padding; > > }; > > > > +struct fuse_get_fmap_in { > > + uint32_t size; > > + uint32_t padding; > > +}; > > As noted, passing size to server really makes no sense. I'd just omit > fuse_get_fmap_in completely. OK, I think I understand; Will rework in v3. Same idea as "better way" above... > > > + > > +struct fuse_get_fmap_out { > > + uint32_t size; > > + uint32_t padding; > > +}; > > + > > struct fuse_init_in { > > uint32_t major; > > uint32_t minor; > > @@ -1284,4 +1298,8 @@ struct fuse_uring_cmd_req { > > uint8_t padding[6]; > > }; > > > > +/* Famfs fmap message components */ > > + > > +#define FAMFS_FMAP_MAX 32768 /* Largest supported fmap message */ > > + > > Hmm, Darrick's interface gets one extents at a time. This one tries > to get the whole map in one go. > > The single extent thing can be inefficient even for plain block fs, so > it would be nice to get multiple extents. The whole map has an > artificial limit that currently may seem sufficient but down the line > could cause pain. > > I'm still hoping some common ground would benefit both interfaces. > Just not sure what it should be. > > Thanks, > Miklos At one point Darrick and I discussed retrieving a [file: offset, length] range of extents (i.e. request describes what it wants, and reply describes what range of the file it covers). I'm not sure it will make sense for famfs to retrieve anything but the whole file's map, but I know it might in Darrick's case. I could imagine an update to GET_FMAP (possibly with a differnet name) that requests an offset range, and then receives a (possibly different) range that is intended to match or exceed the requested range. It seems like we might be able to share the same command to retrieve extents, provided the response starts with a header that allows us to have separate (and presumably extensible) payload formats. No doubt Darrick will have thoughts on this :D I don't think we can merge our "fmap" formats; famfs uses either short extent lists or a format that is efficient for repeating interleave patterns, and wants to cache the entire fmap. ...which is not likely to match Darrick's pattern, but we might be able to share the same retrieval message/response. Thanks! John
On Thu, Aug 14, 2025 at 03:36:26PM +0200, Miklos Szeredi wrote: > On Thu, 3 Jul 2025 at 20:54, John Groves <John@groves.net> wrote: > > > > Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to > > retrieve and cache up the file-to-dax map in the kernel. If this > > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > Nothing to do at this time unless you want a side project: doing this > with compound requests would save a roundtrip (OPEN + GET_FMAP in one > go). > > > GET_FMAP has a variable-size response payload, and the allocated size > > is sent in the in_args[0].size field. If the fmap would overflow the > > message, the fuse server sends a reply of size 'sizeof(uint32_t)' which > > specifies the size of the fmap message. Then the kernel can realloc a > > large enough buffer and try again. > > There is a better way to do this: the allocation can happen when we > get the response. Just need to add infrastructure to dev.c. > > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > > index 6c384640c79b..dff5aa62543e 100644 > > --- a/include/uapi/linux/fuse.h > > +++ b/include/uapi/linux/fuse.h > > @@ -654,6 +654,10 @@ enum fuse_opcode { > > FUSE_TMPFILE = 51, > > FUSE_STATX = 52, > > > > + /* Famfs / devdax opcodes */ > > + FUSE_GET_FMAP = 53, > > + FUSE_GET_DAXDEV = 54, > > Introduced too early. > > > + > > /* CUSE specific operations */ > > CUSE_INIT = 4096, > > > > @@ -888,6 +892,16 @@ struct fuse_access_in { > > uint32_t padding; > > }; > > > > +struct fuse_get_fmap_in { > > + uint32_t size; > > + uint32_t padding; > > +}; > > As noted, passing size to server really makes no sense. I'd just omit > fuse_get_fmap_in completely. > > > + > > +struct fuse_get_fmap_out { > > + uint32_t size; > > + uint32_t padding; > > +}; > > + > > struct fuse_init_in { > > uint32_t major; > > uint32_t minor; > > @@ -1284,4 +1298,8 @@ struct fuse_uring_cmd_req { > > uint8_t padding[6]; > > }; > > > > +/* Famfs fmap message components */ > > + > > +#define FAMFS_FMAP_MAX 32768 /* Largest supported fmap message */ > > + > > Hmm, Darrick's interface gets one extents at a time. This one tries > to get the whole map in one go. > > The single extent thing can be inefficient even for plain block fs, so > it would be nice to get multiple extents. The whole map has an The fuse/iomap series adds a mapping upsertion "notification" that the fuse server can use to prepopulate mappings in the kernel so that it doesn't have to call back to the server for reads and pure overwrites. Maybe it would be useful to be able to upsert mappings for more than a single file range, but so far it hasn't been necessary. > artificial limit that currently may seem sufficient but down the line > could cause pain. > > I'm still hoping some common ground would benefit both interfaces. > Just not sure what it should be. It's possible that famfs could use the mapping upsertion notification to upload mappings into the kernel. As far as I can tell, fuse servers can send notifications even when they're in the middle of handling a fuse request, so the famfs daemon's ->open function could upload mappings before completing the open operation. As for the other use of FMAP (uploading a description of striping data from which one can construct mappings) ... I don't know what to say about that. That one isn't so useful for block filesystems. :) (onto the next reply) --D
On 25/08/14 11:05AM, Darrick J. Wong wrote: <snip> > It's possible that famfs could use the mapping upsertion notification to > upload mappings into the kernel. As far as I can tell, fuse servers can > send notifications even when they're in the middle of handling a fuse > request, so the famfs daemon's ->open function could upload mappings > before completing the open operation. > Famfs dax mappings don't change (and might or might not ever change). Plus, famfs is exposing memory, so it must run at memory speed - which is why it needs to cache the entire fmap for any active file. That way mapping faults happen at lookup-in-fmap speed (which is order 1 for interleaved fmaps, and order-small-n for non-interleaved. I wouldn't rule out ever using upsert, but probably not before we integrate famfs with PNFS, or some other major generalizing event. Thanks, John <snip>
On Sat, Aug 16, 2025 at 10:00:23AM -0500, John Groves wrote: > On 25/08/14 11:05AM, Darrick J. Wong wrote: > <snip> > > It's possible that famfs could use the mapping upsertion notification to > > upload mappings into the kernel. As far as I can tell, fuse servers can > > send notifications even when they're in the middle of handling a fuse > > request, so the famfs daemon's ->open function could upload mappings > > before completing the open operation. > > > > Famfs dax mappings don't change (and might or might not ever change). > Plus, famfs is exposing memory, so it must run at memory speed - which > is why it needs to cache the entire fmap for any active file. That way > mapping faults happen at lookup-in-fmap speed (which is order 1 for > interleaved fmaps, and order-small-n for non-interleaved. > > I wouldn't rule out ever using upsert, but probably not before we > integrate famfs with PNFS, or some other major generalizing event. Hrm? No, you'd just make the famfs ->open function upsert all the relevant mappings. Since the mappings are all fully written and (presumably) within EOF, they'll stay in the cache forever and you never have to upload them ever again. Though it's probably smarter to wait for the first ->iomap_begin to do the upserting because you wouldn't want waste kernel memory until something actually wants to do IO to the file. --D > Thanks, > John > > <snip> > >
On Thu, 14 Aug 2025 at 15:36, Miklos Szeredi <miklos@szeredi.hu> wrote: > I'm still hoping some common ground would benefit both interfaces. > Just not sure what it should be. Something very high level: - allow several map formats: say a plain one with a list of extents and a famfs one - allow several types of backing files: say regular and dax dev - querying maps has a common protocol, format of maps is opaque to this - maps are cached by a common facility - each type of mapping has a decoder module - each type of backing file has a module for handling I/O Does this make sense? This doesn't have to be implemented in one go, but for example GET_FMAP could be renamed to GET_READ_MAP with an added offset and size parameter. For famfs the offset/size would be set to zero/inf. I'd be content with that for now. Thanks, Miklos > > Thanks, > Miklos
On 25/08/14 04:36PM, Miklos Szeredi wrote: > On Thu, 14 Aug 2025 at 15:36, Miklos Szeredi <miklos@szeredi.hu> wrote: > > > I'm still hoping some common ground would benefit both interfaces. > > Just not sure what it should be. > > Something very high level: > > - allow several map formats: say a plain one with a list of extents > and a famfs one > - allow several types of backing files: say regular and dax dev > - querying maps has a common protocol, format of maps is opaque to this > - maps are cached by a common facility > - each type of mapping has a decoder module > - each type of backing file has a module for handling I/O > > Does this make sense? > > This doesn't have to be implemented in one go, but for example > GET_FMAP could be renamed to GET_READ_MAP with an added offset and > size parameter. For famfs the offset/size would be set to zero/inf. > I'd be content with that for now. Maybe GET_FILE_MAP or GET_FILE_IOMAP if we want to keep overloading the term iomap. Maps are to backing-dev for regular file systems, and to device memory (devdax) for famfs - in all cases both read and write (when write is allowed). Thanks, John
On Fri, Aug 15, 2025 at 11:53:16AM -0500, John Groves wrote: > On 25/08/14 04:36PM, Miklos Szeredi wrote: > > On Thu, 14 Aug 2025 at 15:36, Miklos Szeredi <miklos@szeredi.hu> wrote: > > > > > I'm still hoping some common ground would benefit both interfaces. > > > Just not sure what it should be. > > > > Something very high level: > > > > - allow several map formats: say a plain one with a list of extents > > and a famfs one > > - allow several types of backing files: say regular and dax dev > > - querying maps has a common protocol, format of maps is opaque to this > > - maps are cached by a common facility > > - each type of mapping has a decoder module > > - each type of backing file has a module for handling I/O > > > > Does this make sense? > > > > This doesn't have to be implemented in one go, but for example > > GET_FMAP could be renamed to GET_READ_MAP with an added offset and > > size parameter. For famfs the offset/size would be set to zero/inf. > > I'd be content with that for now. > > Maybe GET_FILE_MAP or GET_FILE_IOMAP if we want to keep overloading > the term iomap. Maps are to backing-dev for regular file systems, > and to device memory (devdax) for famfs - in all cases both read > and write (when write is allowed). The calling model for fuse-iomap is the same as fs/iomap -- there's an IOMAP_BEGIN upcall to get a mapping from the filesystem, and an IOMAP_END upcall to tell the fuse server whatever it did with the mapping. Some filesystems will reserve delayed allocation reservations in iomap_begin for a pagecache write, and need to cancel those reservations if the write fails. For a pagecache write you need both a read and a write mapping because the caller's file range isn't guaranteed to be fsblock-aligned. famfs mappings are a subcase of iomappings -- the read & write mappings are the same, and they're always FUSE_IOMAP_TYPE_MAPPED. IOWs, I don't want "GET_FILE_IOMAP" because that's not how iomap works. (There's a separate FUSE_IOMAP_IOEND to pass along IO completions from storage) Given that famfs just calls dax_iomap_rw with an iomap_ops struct, I seriously wonder if I should just wire up fsdax for RFC v5 and then let's see how much code famfs actually needs on top of that. --D > Thanks, > John > >
On Thu, Aug 14, 2025 at 04:36:57PM +0200, Miklos Szeredi wrote: > On Thu, 14 Aug 2025 at 15:36, Miklos Szeredi <miklos@szeredi.hu> wrote: > > > I'm still hoping some common ground would benefit both interfaces. > > Just not sure what it should be. > > Something very high level: > > - allow several map formats: say a plain one with a list of extents > and a famfs one Yes, I think that's needed. > - allow several types of backing files: say regular and dax dev "block device", for iomap. > - querying maps has a common protocol, format of maps is opaque to this > - maps are cached by a common facility I've written such a cache already. :) > - each type of mapping has a decoder module I don't know that you need much "decoding" -- for famfs, the regular mappings correspond to FUSE_IOMAP_TYPE_MAPPED. The one goofy part is the device cookie in each IO mapping: fuse-iomap maps each block device you give it to a device cookie, so I guess famfs will have to do the same. OTOH you can then have a famfs backed by many persistent memory devices. > - each type of backing file has a module for handling I/O > > Does this make sense? More or less. > This doesn't have to be implemented in one go, but for example > GET_FMAP could be renamed to GET_READ_MAP with an added offset and > size parameter. For famfs the offset/size would be set to zero/inf. > I'd be content with that for now. I'll try to cough up a RFC v4 next week. --D > Thanks, > Miklos > > > > > Thanks, > > Miklos >
On 25/08/14 11:20AM, Darrick J. Wong wrote: > On Thu, Aug 14, 2025 at 04:36:57PM +0200, Miklos Szeredi wrote: > > On Thu, 14 Aug 2025 at 15:36, Miklos Szeredi <miklos@szeredi.hu> wrote: > > > > > I'm still hoping some common ground would benefit both interfaces. > > > Just not sure what it should be. > > > > Something very high level: > > > > - allow several map formats: say a plain one with a list of extents > > and a famfs one > > Yes, I think that's needed. Agreed > > > - allow several types of backing files: say regular and dax dev > > "block device", for iomap. > > > - querying maps has a common protocol, format of maps is opaque to this > > - maps are cached by a common facility > > I've written such a cache already. :) I guess I need to take a look at that. Can you point me to the right place? > > > - each type of mapping has a decoder module > > I don't know that you need much "decoding" -- for famfs, the regular > mappings correspond to FUSE_IOMAP_TYPE_MAPPED. The one goofy part is > the device cookie in each IO mapping: fuse-iomap maps each block device > you give it to a device cookie, so I guess famfs will have to do the > same. > > OTOH you can then have a famfs backed by many persistent memory > devices. That's handled in the famfs fmaps already. When an fmap is ingested, if it references any previously-unknown daxdevs, they get retrieved (FUSE_GET_DAXDEV). Oversimplifying a bit, I assume that famfs fmaps won't really change, they'll just be retrieved by a more flexible method and be preceded by a header that identifies the payload as a famfs fmap. > > > - each type of backing file has a module for handling I/O > > > > Does this make sense? > > More or less. I'm nervous about going for too much generalization too soon here, but otherwise yeah. > > > This doesn't have to be implemented in one go, but for example > > GET_FMAP could be renamed to GET_READ_MAP with an added offset and > > size parameter. For famfs the offset/size would be set to zero/inf. > > I'd be content with that for now. > > I'll try to cough up a RFC v4 next week. Darrick, let's try to chat next week to compare notes. Based on this thinking, I will keep my rework of GET_FMAP to a minimum since that will likely be a new shared message/response. I think that part can be merged later in the cycle... John
On Fri, Aug 15, 2025 at 10:06:01AM -0500, John Groves wrote: > On 25/08/14 11:20AM, Darrick J. Wong wrote: > > On Thu, Aug 14, 2025 at 04:36:57PM +0200, Miklos Szeredi wrote: > > > On Thu, 14 Aug 2025 at 15:36, Miklos Szeredi <miklos@szeredi.hu> wrote: > > > > > > > I'm still hoping some common ground would benefit both interfaces. > > > > Just not sure what it should be. > > > > > > Something very high level: > > > > > > - allow several map formats: say a plain one with a list of extents > > > and a famfs one > > > > Yes, I think that's needed. > > Agreed > > > > > > - allow several types of backing files: say regular and dax dev > > > > "block device", for iomap. > > > > > - querying maps has a common protocol, format of maps is opaque to this > > > - maps are cached by a common facility > > > > I've written such a cache already. :) > > I guess I need to take a look at that. Can you point me to the right place? > > > > > > - each type of mapping has a decoder module > > > > I don't know that you need much "decoding" -- for famfs, the regular > > mappings correspond to FUSE_IOMAP_TYPE_MAPPED. The one goofy part is > > the device cookie in each IO mapping: fuse-iomap maps each block device > > you give it to a device cookie, so I guess famfs will have to do the > > same. > > > > OTOH you can then have a famfs backed by many persistent memory > > devices. > > That's handled in the famfs fmaps already. When an fmap is ingested, > if it references any previously-unknown daxdevs, they get retrieved > (FUSE_GET_DAXDEV). > > Oversimplifying a bit, I assume that famfs fmaps won't really change, > they'll just be retrieved by a more flexible method and be preceded > by a header that identifies the payload as a famfs fmap. <nod> Well, I suppose fmaps aren't supposed to change much, but I get the strong sense that Miklos would rather we both use the FUSE_DEV_IOC_BACKING_OPEN interface... > > > > > - each type of backing file has a module for handling I/O > > > > > > Does this make sense? > > > > More or less. > > I'm nervous about going for too much generalization too soon here, > but otherwise yeah. ...and I've tried to make it simple for famfs to pick up the interface. From the new fuse_backing_open: /* * Each _backing_open function should either: * * 1. Take a ref to fb if it wants the file and return 0. * 2. Return 0 without taking a ref if the backing file isn't needed. * 3. Return an errno explaining why it couldn't attach. * * If at least one subsystem bumps the reference count to open it, * we'll install it into the index and return the index. If nobody * opens the file, the error code will be passed up. EPERM is the * default. */ passthrough_res = fuse_passthrough_backing_open(fc, fb); iomap_res = fuse_iomap_backing_open(fc, fb); if (refcount_read(&fb->count) < 2) /* drop the fuse_backing and return one of the res */ So all your famfs_backing_open function has to do is check that fb->file points to a pmem device. If so, it sets fb->famfs = 1 and bumps the fb->count refcount. Full code here: https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=fuse-iomap-attrs_2025-08-19 I'll let the build robots find any facepalm problems and post this whole series tomorrow. > > > This doesn't have to be implemented in one go, but for example > > > GET_FMAP could be renamed to GET_READ_MAP with an added offset and > > > size parameter. For famfs the offset/size would be set to zero/inf. > > > I'd be content with that for now. > > > > I'll try to cough up a RFC v4 next week. > > Darrick, let's try to chat next week to compare notes. > > Based on this thinking, I will keep my rework of GET_FMAP to a minimum > since that will likely be a new shared message/response. I think that > part can be merged later in the cycle... <nod> --D
On Thu, Jul 03, 2025 at 01:50:26PM -0500, John Groves wrote: > Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to > retrieve and cache up the file-to-dax map in the kernel. If this > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > GET_FMAP has a variable-size response payload, and the allocated size > is sent in the in_args[0].size field. If the fmap would overflow the > message, the fuse server sends a reply of size 'sizeof(uint32_t)' which > specifies the size of the fmap message. Then the kernel can realloc a > large enough buffer and try again. > > Signed-off-by: John Groves <john@groves.net> > --- > fs/fuse/file.c | 84 +++++++++++++++++++++++++++++++++++++++ > fs/fuse/fuse_i.h | 36 ++++++++++++++++- > fs/fuse/inode.c | 19 +++++++-- > fs/fuse/iomode.c | 2 +- > include/uapi/linux/fuse.h | 18 +++++++++ > 5 files changed, 154 insertions(+), 5 deletions(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index 93b82660f0c8..8616fb0a6d61 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -230,6 +230,77 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) > fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); > } > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + > +#define FMAP_BUFSIZE 4096 PAGE_SIZE ? > + > +static int > +fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid) > +{ > + struct fuse_get_fmap_in inarg = { 0 }; > + size_t fmap_bufsize = FMAP_BUFSIZE; > + ssize_t fmap_size; > + int retries = 1; > + void *fmap_buf; > + int rc; > + > + FUSE_ARGS(args); > + > + fmap_buf = kcalloc(1, FMAP_BUFSIZE, GFP_KERNEL); > + if (!fmap_buf) > + return -EIO; > + > + retry_once: > + inarg.size = fmap_bufsize; > + > + args.opcode = FUSE_GET_FMAP; > + args.nodeid = nodeid; > + > + args.in_numargs = 1; > + args.in_args[0].size = sizeof(inarg); > + args.in_args[0].value = &inarg; > + > + /* Variable-sized output buffer > + * this causes fuse_simple_request() to return the size of the > + * output payload > + */ > + args.out_argvar = true; > + args.out_numargs = 1; > + args.out_args[0].size = fmap_bufsize; > + args.out_args[0].value = fmap_buf; > + > + /* Send GET_FMAP command */ > + rc = fuse_simple_request(fm, &args); > + if (rc < 0) { > + pr_err("%s: err=%d from fuse_simple_request()\n", > + __func__, rc); > + return rc; > + } > + fmap_size = rc; > + > + if (retries && fmap_size == sizeof(uint32_t)) { > + /* fmap size exceeded fmap_bufsize; > + * actual fmap size returned in fmap_buf; > + * realloc and retry once > + */ > + fmap_bufsize = *((uint32_t *)fmap_buf); > + > + --retries; > + kfree(fmap_buf); > + fmap_buf = kcalloc(1, fmap_bufsize, GFP_KERNEL); > + if (!fmap_buf) > + return -EIO; > + > + goto retry_once; > + } > + > + /* Will call famfs_file_init_dax() when that gets added */ Hard to say what this does without looking further down in the patchset. :) > + kfree(fmap_buf); > + return 0; > +} > +#endif > + > static int fuse_open(struct inode *inode, struct file *file) > { > struct fuse_mount *fm = get_fuse_mount(inode); > @@ -263,6 +334,19 @@ static int fuse_open(struct inode *inode, struct file *file) > > err = fuse_do_open(fm, get_node_id(inode), file, false); > if (!err) { > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + if (fm->fc->famfs_iomap) { > + if (S_ISREG(inode->i_mode)) { /me wonders if you want to turn this into a dumb helper to reduce the indenting levels? #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) static inline bool fuse_is_famfs_file(struct inode *inode) { return fm->fc->famfs_iomap && S_ISREG(inode->i_mode); } #else # define fuse_is_famfs_file(...) (false) #endif if (!err) { if (fuse_is_famfs_file(inode)) { rc = fuse_get_fmap(fm, inode); ... } } > + int rc; > + /* Get the famfs fmap */ > + rc = fuse_get_fmap(fm, inode, > + get_node_id(inode)); Just get_node_id inside fuse_get_fmap to reduce the parameter count. > + if (rc) > + pr_err("%s: fuse_get_fmap err=%d\n", > + __func__, rc); > + } > + } > +#endif > ff = file->private_data; > err = fuse_finish_open(inode, file); > if (err) > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index f4ee61046578..e01d6e5c6e93 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -193,6 +193,10 @@ struct fuse_inode { > /** Reference to backing file in passthrough mode */ > struct fuse_backing *fb; > #endif > + > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + void *famfs_meta; > +#endif What gets stored in here? > }; > > /** FUSE inode state bits */ > @@ -945,6 +949,8 @@ struct fuse_conn { > #endif > > #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + struct rw_semaphore famfs_devlist_sem; > + struct famfs_dax_devlist *dax_devlist; > char *shadow; > #endif > }; > @@ -1435,11 +1441,14 @@ void fuse_free_conn(struct fuse_conn *fc); > > /* dax.c */ > > +static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */ > + > /* This macro is used by virtio_fs, but now it also needs to filter for > * "not famfs" > */ > #define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \ > - && IS_DAX(&fuse_inode->inode)) > + && IS_DAX(&fuse_inode->inode) \ > + && !fuse_file_famfs(fuse_inode)) > > ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); > ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); > @@ -1550,4 +1559,29 @@ extern void fuse_sysctl_unregister(void); > #define fuse_sysctl_unregister() do { } while (0) > #endif /* CONFIG_SYSCTL */ > > +/* famfs.c */ > +static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi, > + void *meta) > +{ > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + return xchg(&fi->famfs_meta, meta); > +#else > + return NULL; > +#endif > +} > + > +static inline void famfs_meta_free(struct fuse_inode *fi) > +{ > + /* Stub wil be connected in a subsequent commit */ > +} > + > +static inline int fuse_file_famfs(struct fuse_inode *fi) > +{ > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + return (READ_ONCE(fi->famfs_meta) != NULL); > +#else > + return 0; > +#endif > +} ...or maybe this is the predicate you want to see if you really need to fmapping related stuff? > + > #endif /* _FS_FUSE_I_H */ > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index a7e1cf8257b0..b071d16f7d04 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > fuse_inode_backing_set(fi, NULL); > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > + famfs_meta_set(fi, NULL); > + > return &fi->inode; > > out_free_forget: > @@ -138,6 +141,13 @@ static void fuse_free_inode(struct inode *inode) > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > fuse_backing_put(fuse_inode_backing(fi)); > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + if (S_ISREG(inode->i_mode) && fi->famfs_meta) { > + famfs_meta_free(fi); > + famfs_meta_set(fi, NULL); _free should null out the pointer, no? --D > + } > +#endif > + > kmem_cache_free(fuse_inode_cachep, fi); > } > > @@ -1002,6 +1012,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > fuse_backing_files_init(fc); > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > + pr_notice("%s: Kernel is FUSE_FAMFS_DAX capable\n", __func__); > + > INIT_LIST_HEAD(&fc->mounts); > list_add(&fm->fc_entry, &fc->mounts); > fm->fc = fc; > @@ -1036,9 +1049,8 @@ void fuse_conn_put(struct fuse_conn *fc) > } > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > fuse_backing_files_free(fc); > -#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > - kfree(fc->shadow); > -#endif > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > + kfree(fc->shadow); > call_rcu(&fc->rcu, delayed_release); > } > } > @@ -1425,6 +1437,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, > * those capabilities, they are held here). > */ > fc->famfs_iomap = 1; > + init_rwsem(&fc->famfs_devlist_sem); > } > } else { > ra_pages = fc->max_read / PAGE_SIZE; > diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c > index aec4aecb5d79..443b337b0c05 100644 > --- a/fs/fuse/iomode.c > +++ b/fs/fuse/iomode.c > @@ -204,7 +204,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode) > * io modes are not relevant with DAX and with server that does not > * implement open. > */ > - if (FUSE_IS_VIRTIO_DAX(fi) || !ff->args) > + if (FUSE_IS_VIRTIO_DAX(fi) || fuse_file_famfs(fi) || !ff->args) > return 0; > > /* > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index 6c384640c79b..dff5aa62543e 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -654,6 +654,10 @@ enum fuse_opcode { > FUSE_TMPFILE = 51, > FUSE_STATX = 52, > > + /* Famfs / devdax opcodes */ > + FUSE_GET_FMAP = 53, > + FUSE_GET_DAXDEV = 54, > + > /* CUSE specific operations */ > CUSE_INIT = 4096, > > @@ -888,6 +892,16 @@ struct fuse_access_in { > uint32_t padding; > }; > > +struct fuse_get_fmap_in { > + uint32_t size; > + uint32_t padding; > +}; > + > +struct fuse_get_fmap_out { > + uint32_t size; > + uint32_t padding; > +}; > + > struct fuse_init_in { > uint32_t major; > uint32_t minor; > @@ -1284,4 +1298,8 @@ struct fuse_uring_cmd_req { > uint8_t padding[6]; > }; > > +/* Famfs fmap message components */ > + > +#define FAMFS_FMAP_MAX 32768 /* Largest supported fmap message */ > + > #endif /* _LINUX_FUSE_H */ > -- > 2.49.0 > >
On 25/07/08 09:27PM, Darrick J. Wong wrote: > On Thu, Jul 03, 2025 at 01:50:26PM -0500, John Groves wrote: > > Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to > > retrieve and cache up the file-to-dax map in the kernel. If this > > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > > > GET_FMAP has a variable-size response payload, and the allocated size > > is sent in the in_args[0].size field. If the fmap would overflow the > > message, the fuse server sends a reply of size 'sizeof(uint32_t)' which > > specifies the size of the fmap message. Then the kernel can realloc a > > large enough buffer and try again. > > > > Signed-off-by: John Groves <john@groves.net> > > --- > > fs/fuse/file.c | 84 +++++++++++++++++++++++++++++++++++++++ > > fs/fuse/fuse_i.h | 36 ++++++++++++++++- > > fs/fuse/inode.c | 19 +++++++-- > > fs/fuse/iomode.c | 2 +- > > include/uapi/linux/fuse.h | 18 +++++++++ > > 5 files changed, 154 insertions(+), 5 deletions(-) > > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > index 93b82660f0c8..8616fb0a6d61 100644 > > --- a/fs/fuse/file.c > > +++ b/fs/fuse/file.c > > @@ -230,6 +230,77 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) > > fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); > > } > > > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + > > +#define FMAP_BUFSIZE 4096 > > PAGE_SIZE ? Like it. Queued to -next > > > + > > +static int > > +fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid) > > +{ > > + struct fuse_get_fmap_in inarg = { 0 }; > > + size_t fmap_bufsize = FMAP_BUFSIZE; > > + ssize_t fmap_size; > > + int retries = 1; > > + void *fmap_buf; > > + int rc; > > + > > + FUSE_ARGS(args); > > + > > + fmap_buf = kcalloc(1, FMAP_BUFSIZE, GFP_KERNEL); > > + if (!fmap_buf) > > + return -EIO; > > + > > + retry_once: > > + inarg.size = fmap_bufsize; > > + > > + args.opcode = FUSE_GET_FMAP; > > + args.nodeid = nodeid; > > + > > + args.in_numargs = 1; > > + args.in_args[0].size = sizeof(inarg); > > + args.in_args[0].value = &inarg; > > + > > + /* Variable-sized output buffer > > + * this causes fuse_simple_request() to return the size of the > > + * output payload > > + */ > > + args.out_argvar = true; > > + args.out_numargs = 1; > > + args.out_args[0].size = fmap_bufsize; > > + args.out_args[0].value = fmap_buf; > > + > > + /* Send GET_FMAP command */ > > + rc = fuse_simple_request(fm, &args); > > + if (rc < 0) { > > + pr_err("%s: err=%d from fuse_simple_request()\n", > > + __func__, rc); > > + return rc; > > + } > > + fmap_size = rc; > > + > > + if (retries && fmap_size == sizeof(uint32_t)) { > > + /* fmap size exceeded fmap_bufsize; > > + * actual fmap size returned in fmap_buf; > > + * realloc and retry once > > + */ > > + fmap_bufsize = *((uint32_t *)fmap_buf); > > + > > + --retries; > > + kfree(fmap_buf); > > + fmap_buf = kcalloc(1, fmap_bufsize, GFP_KERNEL); > > + if (!fmap_buf) > > + return -EIO; > > + > > + goto retry_once; > > + } > > + > > + /* Will call famfs_file_init_dax() when that gets added */ > > Hard to say what this does without looking further down in the patchset. > :) New comment: /* We retrieved the "fmap" (the file's map to memory), but * we haven't used it yet. A call to famfs_file_init_dax() will be added * here in a subsequent patch, when we add the ability to attach * fmaps to files. */ > > > + kfree(fmap_buf); > > + return 0; > > +} > > +#endif > > + > > static int fuse_open(struct inode *inode, struct file *file) > > { > > struct fuse_mount *fm = get_fuse_mount(inode); > > @@ -263,6 +334,19 @@ static int fuse_open(struct inode *inode, struct file *file) > > > > err = fuse_do_open(fm, get_node_id(inode), file, false); > > if (!err) { > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + if (fm->fc->famfs_iomap) { > > + if (S_ISREG(inode->i_mode)) { > > /me wonders if you want to turn this into a dumb helper to reduce the > indenting levels? > > #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > static inline bool fuse_is_famfs_file(struct inode *inode) > { > return fm->fc->famfs_iomap && S_ISREG(inode->i_mode); > } > #else > # define fuse_is_famfs_file(...) (false) > #endif > > if (!err) { > if (fuse_is_famfs_file(inode)) { > rc = fuse_get_fmap(fm, inode); > ... > } > } > I've already refactored helpers and simplified this logic in the -next branch, including losing the conditrional code here in file.c: if (!err) { if ((fm->fc->famfs_iomap) && (S_ISREG(inode->i_mode))) { int rc; /* Get the famfs fmap */ rc = fuse_get_fmap(fm, inode); ... } ... } So I think it's quite a bit cleaner... will send out an updated patch pretty soon (probably next week, without the poisoned page fixes yet). > > + int rc; > > + /* Get the famfs fmap */ > > + rc = fuse_get_fmap(fm, inode, > > + get_node_id(inode)); > > Just get_node_id inside fuse_get_fmap to reduce the parameter count. Done, thanks > > > + if (rc) > > + pr_err("%s: fuse_get_fmap err=%d\n", > > + __func__, rc); > > + } > > + } > > +#endif > > ff = file->private_data; > > err = fuse_finish_open(inode, file); > > if (err) > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > > index f4ee61046578..e01d6e5c6e93 100644 > > --- a/fs/fuse/fuse_i.h > > +++ b/fs/fuse/fuse_i.h > > @@ -193,6 +193,10 @@ struct fuse_inode { > > /** Reference to backing file in passthrough mode */ > > struct fuse_backing *fb; > > #endif > > + > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + void *famfs_meta; > > +#endif > > What gets stored in here? Explanatory comment added: /* Pointer to the file's famfs metadata. Primary content is the * in-memory version of the fmap - the map from file's offset range * to DAX memory */ > > > }; > > > > /** FUSE inode state bits */ > > @@ -945,6 +949,8 @@ struct fuse_conn { > > #endif > > > > #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + struct rw_semaphore famfs_devlist_sem; > > + struct famfs_dax_devlist *dax_devlist; > > char *shadow; > > #endif > > }; > > @@ -1435,11 +1441,14 @@ void fuse_free_conn(struct fuse_conn *fc); > > > > /* dax.c */ > > > > +static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */ > > + > > /* This macro is used by virtio_fs, but now it also needs to filter for > > * "not famfs" > > */ > > #define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \ > > - && IS_DAX(&fuse_inode->inode)) > > + && IS_DAX(&fuse_inode->inode) \ > > + && !fuse_file_famfs(fuse_inode)) > > > > ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); > > ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); > > @@ -1550,4 +1559,29 @@ extern void fuse_sysctl_unregister(void); > > #define fuse_sysctl_unregister() do { } while (0) > > #endif /* CONFIG_SYSCTL */ > > > > +/* famfs.c */ > > +static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi, > > + void *meta) > > +{ > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + return xchg(&fi->famfs_meta, meta); > > +#else > > + return NULL; > > +#endif > > +} > > + > > +static inline void famfs_meta_free(struct fuse_inode *fi) > > +{ > > + /* Stub wil be connected in a subsequent commit */ > > +} > > + > > +static inline int fuse_file_famfs(struct fuse_inode *fi) > > +{ > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + return (READ_ONCE(fi->famfs_meta) != NULL); > > +#else > > + return 0; > > +#endif > > +} > > ...or maybe this is the predicate you want to see if you really need to > fmapping related stuff? > > > + > > #endif /* _FS_FUSE_I_H */ > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > > index a7e1cf8257b0..b071d16f7d04 100644 > > --- a/fs/fuse/inode.c > > +++ b/fs/fuse/inode.c > > @@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) > > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > > fuse_inode_backing_set(fi, NULL); > > > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > > + famfs_meta_set(fi, NULL); > > + > > return &fi->inode; > > > > out_free_forget: > > @@ -138,6 +141,13 @@ static void fuse_free_inode(struct inode *inode) > > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > > fuse_backing_put(fuse_inode_backing(fi)); > > > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + if (S_ISREG(inode->i_mode) && fi->famfs_meta) { > > + famfs_meta_free(fi); > > + famfs_meta_set(fi, NULL); > > _free should null out the pointer, no? Good point - will do <snip> Thanks Darrick! John
On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote: > > Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to > retrieve and cache up the file-to-dax map in the kernel. If this > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > GET_FMAP has a variable-size response payload, and the allocated size > is sent in the in_args[0].size field. If the fmap would overflow the > message, the fuse server sends a reply of size 'sizeof(uint32_t)' which > specifies the size of the fmap message. Then the kernel can realloc a > large enough buffer and try again. > > Signed-off-by: John Groves <john@groves.net> > --- > fs/fuse/file.c | 84 +++++++++++++++++++++++++++++++++++++++ > fs/fuse/fuse_i.h | 36 ++++++++++++++++- > fs/fuse/inode.c | 19 +++++++-- > fs/fuse/iomode.c | 2 +- > include/uapi/linux/fuse.h | 18 +++++++++ > 5 files changed, 154 insertions(+), 5 deletions(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index 93b82660f0c8..8616fb0a6d61 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -230,6 +230,77 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) > fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); > } > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) We generally try to avoid #ifdef blocks in c files keep them mostly in h files and use in c files if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) also #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) it a bit strange for a bool Kconfig because it looks too much like the c code, so I prefer #ifdef CONFIG_FUSE_FAMFS_DAX when you have to use it If you need entire functions compiled out, why not put them in famfs.c? > + > +#define FMAP_BUFSIZE 4096 > + > +static int > +fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid) > +{ > + struct fuse_get_fmap_in inarg = { 0 }; > + size_t fmap_bufsize = FMAP_BUFSIZE; > + ssize_t fmap_size; > + int retries = 1; > + void *fmap_buf; > + int rc; > + > + FUSE_ARGS(args); > + > + fmap_buf = kcalloc(1, FMAP_BUFSIZE, GFP_KERNEL); > + if (!fmap_buf) > + return -EIO; > + > + retry_once: > + inarg.size = fmap_bufsize; > + > + args.opcode = FUSE_GET_FMAP; > + args.nodeid = nodeid; > + > + args.in_numargs = 1; > + args.in_args[0].size = sizeof(inarg); > + args.in_args[0].value = &inarg; > + > + /* Variable-sized output buffer > + * this causes fuse_simple_request() to return the size of the > + * output payload > + */ > + args.out_argvar = true; > + args.out_numargs = 1; > + args.out_args[0].size = fmap_bufsize; > + args.out_args[0].value = fmap_buf; > + > + /* Send GET_FMAP command */ > + rc = fuse_simple_request(fm, &args); > + if (rc < 0) { > + pr_err("%s: err=%d from fuse_simple_request()\n", > + __func__, rc); > + return rc; > + } > + fmap_size = rc; > + > + if (retries && fmap_size == sizeof(uint32_t)) { > + /* fmap size exceeded fmap_bufsize; > + * actual fmap size returned in fmap_buf; > + * realloc and retry once > + */ > + fmap_bufsize = *((uint32_t *)fmap_buf); > + > + --retries; > + kfree(fmap_buf); > + fmap_buf = kcalloc(1, fmap_bufsize, GFP_KERNEL); > + if (!fmap_buf) > + return -EIO; > + > + goto retry_once; > + } > + > + /* Will call famfs_file_init_dax() when that gets added */ > + > + kfree(fmap_buf); > + return 0; > +} > +#endif > + > static int fuse_open(struct inode *inode, struct file *file) > { > struct fuse_mount *fm = get_fuse_mount(inode); > @@ -263,6 +334,19 @@ static int fuse_open(struct inode *inode, struct file *file) > > err = fuse_do_open(fm, get_node_id(inode), file, false); > if (!err) { > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + if (fm->fc->famfs_iomap) { That should be > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) && > + fm->fc->famfs_iomap) { > + if (S_ISREG(inode->i_mode)) { > + int rc; > + /* Get the famfs fmap */ > + rc = fuse_get_fmap(fm, inode, > + get_node_id(inode)); > + if (rc) > + pr_err("%s: fuse_get_fmap err=%d\n", > + __func__, rc); > + } > + } > +#endif > ff = file->private_data; > err = fuse_finish_open(inode, file); > if (err) > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index f4ee61046578..e01d6e5c6e93 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -193,6 +193,10 @@ struct fuse_inode { > /** Reference to backing file in passthrough mode */ > struct fuse_backing *fb; > #endif > + > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + void *famfs_meta; > +#endif > }; > > /** FUSE inode state bits */ > @@ -945,6 +949,8 @@ struct fuse_conn { > #endif > > #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + struct rw_semaphore famfs_devlist_sem; > + struct famfs_dax_devlist *dax_devlist; > char *shadow; > #endif > }; > @@ -1435,11 +1441,14 @@ void fuse_free_conn(struct fuse_conn *fc); > > /* dax.c */ > > +static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */ > + > /* This macro is used by virtio_fs, but now it also needs to filter for > * "not famfs" > */ > #define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \ > - && IS_DAX(&fuse_inode->inode)) > + && IS_DAX(&fuse_inode->inode) \ > + && !fuse_file_famfs(fuse_inode)) > > ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); > ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); > @@ -1550,4 +1559,29 @@ extern void fuse_sysctl_unregister(void); > #define fuse_sysctl_unregister() do { } while (0) > #endif /* CONFIG_SYSCTL */ > > +/* famfs.c */ > +static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi, > + void *meta) > +{ > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + return xchg(&fi->famfs_meta, meta); > +#else > + return NULL; > +#endif > +} > + > +static inline void famfs_meta_free(struct fuse_inode *fi) > +{ > + /* Stub wil be connected in a subsequent commit */ > +} > + > +static inline int fuse_file_famfs(struct fuse_inode *fi) > +{ > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + return (READ_ONCE(fi->famfs_meta) != NULL); > +#else > + return 0; > +#endif > +} > + > #endif /* _FS_FUSE_I_H */ > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index a7e1cf8257b0..b071d16f7d04 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > fuse_inode_backing_set(fi, NULL); > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > + famfs_meta_set(fi, NULL); > + > return &fi->inode; > > out_free_forget: > @@ -138,6 +141,13 @@ static void fuse_free_inode(struct inode *inode) > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > fuse_backing_put(fuse_inode_backing(fi)); > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > + if (S_ISREG(inode->i_mode) && fi->famfs_meta) { > + famfs_meta_free(fi); > + famfs_meta_set(fi, NULL); > + } > +#endif > + > kmem_cache_free(fuse_inode_cachep, fi); > } > > @@ -1002,6 +1012,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > fuse_backing_files_init(fc); > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > + pr_notice("%s: Kernel is FUSE_FAMFS_DAX capable\n", __func__); > + > INIT_LIST_HEAD(&fc->mounts); > list_add(&fm->fc_entry, &fc->mounts); > fm->fc = fc; > @@ -1036,9 +1049,8 @@ void fuse_conn_put(struct fuse_conn *fc) > } > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > fuse_backing_files_free(fc); > -#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > - kfree(fc->shadow); > -#endif > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > + kfree(fc->shadow); > call_rcu(&fc->rcu, delayed_release); > } > } > @@ -1425,6 +1437,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, > * those capabilities, they are held here). > */ > fc->famfs_iomap = 1; > + init_rwsem(&fc->famfs_devlist_sem); > } > } else { > ra_pages = fc->max_read / PAGE_SIZE; > diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c > index aec4aecb5d79..443b337b0c05 100644 > --- a/fs/fuse/iomode.c > +++ b/fs/fuse/iomode.c > @@ -204,7 +204,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode) > * io modes are not relevant with DAX and with server that does not > * implement open. > */ > - if (FUSE_IS_VIRTIO_DAX(fi) || !ff->args) > + if (FUSE_IS_VIRTIO_DAX(fi) || fuse_file_famfs(fi) || !ff->args) > return 0; This is where fuse_is_dax() helper would be handy. Thanks, Amir.
On 25/07/04 10:54AM, Amir Goldstein wrote: > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote: > > > > Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to > > retrieve and cache up the file-to-dax map in the kernel. If this > > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > > > GET_FMAP has a variable-size response payload, and the allocated size > > is sent in the in_args[0].size field. If the fmap would overflow the > > message, the fuse server sends a reply of size 'sizeof(uint32_t)' which > > specifies the size of the fmap message. Then the kernel can realloc a > > large enough buffer and try again. > > > > Signed-off-by: John Groves <john@groves.net> > > --- > > fs/fuse/file.c | 84 +++++++++++++++++++++++++++++++++++++++ > > fs/fuse/fuse_i.h | 36 ++++++++++++++++- > > fs/fuse/inode.c | 19 +++++++-- > > fs/fuse/iomode.c | 2 +- > > include/uapi/linux/fuse.h | 18 +++++++++ > > 5 files changed, 154 insertions(+), 5 deletions(-) > > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > index 93b82660f0c8..8616fb0a6d61 100644 > > --- a/fs/fuse/file.c > > +++ b/fs/fuse/file.c > > @@ -230,6 +230,77 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) > > fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); > > } > > > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > We generally try to avoid #ifdef blocks in c files > keep them mostly in h files and use in c files > if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > > also #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > it a bit strange for a bool Kconfig because it looks too > much like the c code, so I prefer > #ifdef CONFIG_FUSE_FAMFS_DAX > when you have to use it > > If you need entire functions compiled out, why not put them in famfs.c? Perhaps moving fuse_get_fmap() to famfs.c is the best approach. Will try that first. Regarding '#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)', vs. '#ifdef CONFIG_FUSE_FAMFS_DAX' vs. '#if CONFIG_FUSE_FAMFS_DAX'... I've learned to be cautious there because the latter two are undefined if CONFIG_FUSE_FAMFS_DAX=m. I've been burned by this. My original thinking was that famfs made sense as a module, but I'm leaning the other way now - and in this series fs/fuse/Kconfig makes it a bool - meaning all three macro tests will work because a bool can't be set to 'm'. So to the extent that I need conditional compilation macros I can switch to '#ifdef...'. > > > + > > +#define FMAP_BUFSIZE 4096 > > + > > +static int > > +fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid) > > +{ > > + struct fuse_get_fmap_in inarg = { 0 }; > > + size_t fmap_bufsize = FMAP_BUFSIZE; > > + ssize_t fmap_size; > > + int retries = 1; > > + void *fmap_buf; > > + int rc; > > + > > + FUSE_ARGS(args); > > + > > + fmap_buf = kcalloc(1, FMAP_BUFSIZE, GFP_KERNEL); > > + if (!fmap_buf) > > + return -EIO; > > + > > + retry_once: > > + inarg.size = fmap_bufsize; > > + > > + args.opcode = FUSE_GET_FMAP; > > + args.nodeid = nodeid; > > + > > + args.in_numargs = 1; > > + args.in_args[0].size = sizeof(inarg); > > + args.in_args[0].value = &inarg; > > + > > + /* Variable-sized output buffer > > + * this causes fuse_simple_request() to return the size of the > > + * output payload > > + */ > > + args.out_argvar = true; > > + args.out_numargs = 1; > > + args.out_args[0].size = fmap_bufsize; > > + args.out_args[0].value = fmap_buf; > > + > > + /* Send GET_FMAP command */ > > + rc = fuse_simple_request(fm, &args); > > + if (rc < 0) { > > + pr_err("%s: err=%d from fuse_simple_request()\n", > > + __func__, rc); > > + return rc; > > + } > > + fmap_size = rc; > > + > > + if (retries && fmap_size == sizeof(uint32_t)) { > > + /* fmap size exceeded fmap_bufsize; > > + * actual fmap size returned in fmap_buf; > > + * realloc and retry once > > + */ > > + fmap_bufsize = *((uint32_t *)fmap_buf); > > + > > + --retries; > > + kfree(fmap_buf); > > + fmap_buf = kcalloc(1, fmap_bufsize, GFP_KERNEL); > > + if (!fmap_buf) > > + return -EIO; > > + > > + goto retry_once; > > + } > > + > > + /* Will call famfs_file_init_dax() when that gets added */ > > + > > + kfree(fmap_buf); > > + return 0; > > +} > > +#endif > > + > > static int fuse_open(struct inode *inode, struct file *file) > > { > > struct fuse_mount *fm = get_fuse_mount(inode); > > @@ -263,6 +334,19 @@ static int fuse_open(struct inode *inode, struct file *file) > > > > err = fuse_do_open(fm, get_node_id(inode), file, false); > > if (!err) { > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + if (fm->fc->famfs_iomap) { > > That should be > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) && > > + fm->fc->famfs_iomap) { > > > + if (S_ISREG(inode->i_mode)) { > > + int rc; > > + /* Get the famfs fmap */ > > + rc = fuse_get_fmap(fm, inode, > > + get_node_id(inode)); > > + if (rc) > > + pr_err("%s: fuse_get_fmap err=%d\n", > > + __func__, rc); > > + } > > + } > > +#endif > > ff = file->private_data; > > err = fuse_finish_open(inode, file); > > if (err) > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > > index f4ee61046578..e01d6e5c6e93 100644 > > --- a/fs/fuse/fuse_i.h > > +++ b/fs/fuse/fuse_i.h > > @@ -193,6 +193,10 @@ struct fuse_inode { > > /** Reference to backing file in passthrough mode */ > > struct fuse_backing *fb; > > #endif > > + > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + void *famfs_meta; > > +#endif > > }; > > > > /** FUSE inode state bits */ > > @@ -945,6 +949,8 @@ struct fuse_conn { > > #endif > > > > #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + struct rw_semaphore famfs_devlist_sem; > > + struct famfs_dax_devlist *dax_devlist; > > char *shadow; > > #endif > > }; > > @@ -1435,11 +1441,14 @@ void fuse_free_conn(struct fuse_conn *fc); > > > > /* dax.c */ > > > > +static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */ > > + > > /* This macro is used by virtio_fs, but now it also needs to filter for > > * "not famfs" > > */ > > #define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX) \ > > - && IS_DAX(&fuse_inode->inode)) > > + && IS_DAX(&fuse_inode->inode) \ > > + && !fuse_file_famfs(fuse_inode)) > > > > ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); > > ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); > > @@ -1550,4 +1559,29 @@ extern void fuse_sysctl_unregister(void); > > #define fuse_sysctl_unregister() do { } while (0) > > #endif /* CONFIG_SYSCTL */ > > > > +/* famfs.c */ > > +static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi, > > + void *meta) > > +{ > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + return xchg(&fi->famfs_meta, meta); > > +#else > > + return NULL; > > +#endif > > +} > > + > > +static inline void famfs_meta_free(struct fuse_inode *fi) > > +{ > > + /* Stub wil be connected in a subsequent commit */ > > +} > > + > > +static inline int fuse_file_famfs(struct fuse_inode *fi) > > +{ > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + return (READ_ONCE(fi->famfs_meta) != NULL); > > +#else > > + return 0; > > +#endif > > +} > > + > > #endif /* _FS_FUSE_I_H */ > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > > index a7e1cf8257b0..b071d16f7d04 100644 > > --- a/fs/fuse/inode.c > > +++ b/fs/fuse/inode.c > > @@ -117,6 +117,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) > > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > > fuse_inode_backing_set(fi, NULL); > > > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > > + famfs_meta_set(fi, NULL); > > + > > return &fi->inode; > > > > out_free_forget: > > @@ -138,6 +141,13 @@ static void fuse_free_inode(struct inode *inode) > > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > > fuse_backing_put(fuse_inode_backing(fi)); > > > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > + if (S_ISREG(inode->i_mode) && fi->famfs_meta) { > > + famfs_meta_free(fi); > > + famfs_meta_set(fi, NULL); > > + } > > +#endif > > + > > kmem_cache_free(fuse_inode_cachep, fi); > > } > > > > @@ -1002,6 +1012,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, > > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > > fuse_backing_files_init(fc); > > > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > > + pr_notice("%s: Kernel is FUSE_FAMFS_DAX capable\n", __func__); > > + > > INIT_LIST_HEAD(&fc->mounts); > > list_add(&fm->fc_entry, &fc->mounts); > > fm->fc = fc; > > @@ -1036,9 +1049,8 @@ void fuse_conn_put(struct fuse_conn *fc) > > } > > if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) > > fuse_backing_files_free(fc); > > -#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > - kfree(fc->shadow); > > -#endif > > + if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > > + kfree(fc->shadow); > > call_rcu(&fc->rcu, delayed_release); > > } > > } > > @@ -1425,6 +1437,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, > > * those capabilities, they are held here). > > */ > > fc->famfs_iomap = 1; > > + init_rwsem(&fc->famfs_devlist_sem); > > } > > } else { > > ra_pages = fc->max_read / PAGE_SIZE; > > diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c > > index aec4aecb5d79..443b337b0c05 100644 > > --- a/fs/fuse/iomode.c > > +++ b/fs/fuse/iomode.c > > @@ -204,7 +204,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode) > > * io modes are not relevant with DAX and with server that does not > > * implement open. > > */ > > - if (FUSE_IS_VIRTIO_DAX(fi) || !ff->args) > > + if (FUSE_IS_VIRTIO_DAX(fi) || fuse_file_famfs(fi) || !ff->args) > > return 0; > > This is where fuse_is_dax() helper would be handy. Roger that. Thinking through it... > > Thanks, > Amir. Thank you, John
On 25/07/04 03:30PM, John Groves wrote: > On 25/07/04 10:54AM, Amir Goldstein wrote: > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote: > > > > > > Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to > > > retrieve and cache up the file-to-dax map in the kernel. If this > > > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > > > > > GET_FMAP has a variable-size response payload, and the allocated size > > > is sent in the in_args[0].size field. If the fmap would overflow the > > > message, the fuse server sends a reply of size 'sizeof(uint32_t)' which > > > specifies the size of the fmap message. Then the kernel can realloc a > > > large enough buffer and try again. > > > > > > Signed-off-by: John Groves <john@groves.net> > > > --- > > > fs/fuse/file.c | 84 +++++++++++++++++++++++++++++++++++++++ > > > fs/fuse/fuse_i.h | 36 ++++++++++++++++- > > > fs/fuse/inode.c | 19 +++++++-- > > > fs/fuse/iomode.c | 2 +- > > > include/uapi/linux/fuse.h | 18 +++++++++ > > > 5 files changed, 154 insertions(+), 5 deletions(-) > > > > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > > index 93b82660f0c8..8616fb0a6d61 100644 > > > --- a/fs/fuse/file.c > > > +++ b/fs/fuse/file.c > > > @@ -230,6 +230,77 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) > > > fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); > > > } > > > > > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > > > We generally try to avoid #ifdef blocks in c files > > keep them mostly in h files and use in c files > > if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > > > > also #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > it a bit strange for a bool Kconfig because it looks too > > much like the c code, so I prefer > > #ifdef CONFIG_FUSE_FAMFS_DAX > > when you have to use it > > > > If you need entire functions compiled out, why not put them in famfs.c? > > Perhaps moving fuse_get_fmap() to famfs.c is the best approach. Will try that > first. > > Regarding '#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)', vs. > '#ifdef CONFIG_FUSE_FAMFS_DAX' vs. '#if CONFIG_FUSE_FAMFS_DAX'... > > I've learned to be cautious there because the latter two are undefined if > CONFIG_FUSE_FAMFS_DAX=m. I've been burned by this. > > My original thinking was that famfs made sense as a module, but I'm leaning > the other way now - and in this series fs/fuse/Kconfig makes it a bool - > meaning all three macro tests will work because a bool can't be set to 'm'. > > So to the extent that I need conditional compilation macros I can switch > to '#ifdef...'. Doh. Spirit of full disclosure: this commit doesn't build if CONFIG_FUSE_FAMFS_DAX is not set (!=y). So the conditionals are at risk if getting worse, not better. Working on it... <snip> Thanks, John
On Sat, Jul 5, 2025 at 2:06 AM John Groves <John@groves.net> wrote: > > On 25/07/04 03:30PM, John Groves wrote: > > On 25/07/04 10:54AM, Amir Goldstein wrote: > > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote: > > > > > > > > Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to > > > > retrieve and cache up the file-to-dax map in the kernel. If this > > > > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > > > > > > > GET_FMAP has a variable-size response payload, and the allocated size > > > > is sent in the in_args[0].size field. If the fmap would overflow the > > > > message, the fuse server sends a reply of size 'sizeof(uint32_t)' which > > > > specifies the size of the fmap message. Then the kernel can realloc a > > > > large enough buffer and try again. > > > > > > > > Signed-off-by: John Groves <john@groves.net> > > > > --- > > > > fs/fuse/file.c | 84 +++++++++++++++++++++++++++++++++++++++ > > > > fs/fuse/fuse_i.h | 36 ++++++++++++++++- > > > > fs/fuse/inode.c | 19 +++++++-- > > > > fs/fuse/iomode.c | 2 +- > > > > include/uapi/linux/fuse.h | 18 +++++++++ > > > > 5 files changed, 154 insertions(+), 5 deletions(-) > > > > > > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > > > index 93b82660f0c8..8616fb0a6d61 100644 > > > > --- a/fs/fuse/file.c > > > > +++ b/fs/fuse/file.c > > > > @@ -230,6 +230,77 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) > > > > fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); > > > > } > > > > > > > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > > > > > We generally try to avoid #ifdef blocks in c files > > > keep them mostly in h files and use in c files > > > if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > > > > > > also #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > > it a bit strange for a bool Kconfig because it looks too > > > much like the c code, so I prefer > > > #ifdef CONFIG_FUSE_FAMFS_DAX > > > when you have to use it > > > > > > If you need entire functions compiled out, why not put them in famfs.c? > > > > Perhaps moving fuse_get_fmap() to famfs.c is the best approach. Will try that > > first. > > > > Regarding '#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)', vs. > > '#ifdef CONFIG_FUSE_FAMFS_DAX' vs. '#if CONFIG_FUSE_FAMFS_DAX'... > > > > I've learned to be cautious there because the latter two are undefined if > > CONFIG_FUSE_FAMFS_DAX=m. I've been burned by this. Yes, that's a risk, but as the code is shaping up right now, I do not foresee FAMFS becoming a module(?) > > > > My original thinking was that famfs made sense as a module, but I'm leaning > > the other way now - and in this series fs/fuse/Kconfig makes it a bool - > > meaning all three macro tests will work because a bool can't be set to 'm'. > > > > So to the extent that I need conditional compilation macros I can switch > > to '#ifdef...'. > > Doh. Spirit of full disclosure: this commit doesn't build if > CONFIG_FUSE_FAMFS_DAX is not set (!=y). So the conditionals are at > risk if getting worse, not better. Working on it... > You're probably going to need to add stub inline functions for all the functions from famfs.c and a few more wrappers I guess. The right amount of ifdefs in C files is really a matter of judgement, but the fewer the better for code flow clarity. Thanks, Amir.
On 25/07/05 09:58AM, Amir Goldstein wrote: > On Sat, Jul 5, 2025 at 2:06 AM John Groves <John@groves.net> wrote: > > > > On 25/07/04 03:30PM, John Groves wrote: > > > On 25/07/04 10:54AM, Amir Goldstein wrote: > > > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote: > > > > > > > > > > Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to > > > > > retrieve and cache up the file-to-dax map in the kernel. If this > > > > > succeeds, read/write/mmap are resolved direct-to-dax with no upcalls. > > > > > > > > > > GET_FMAP has a variable-size response payload, and the allocated size > > > > > is sent in the in_args[0].size field. If the fmap would overflow the > > > > > message, the fuse server sends a reply of size 'sizeof(uint32_t)' which > > > > > specifies the size of the fmap message. Then the kernel can realloc a > > > > > large enough buffer and try again. > > > > > > > > > > Signed-off-by: John Groves <john@groves.net> > > > > > --- > > > > > fs/fuse/file.c | 84 +++++++++++++++++++++++++++++++++++++++ > > > > > fs/fuse/fuse_i.h | 36 ++++++++++++++++- > > > > > fs/fuse/inode.c | 19 +++++++-- > > > > > fs/fuse/iomode.c | 2 +- > > > > > include/uapi/linux/fuse.h | 18 +++++++++ > > > > > 5 files changed, 154 insertions(+), 5 deletions(-) > > > > > > > > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > > > > > index 93b82660f0c8..8616fb0a6d61 100644 > > > > > --- a/fs/fuse/file.c > > > > > +++ b/fs/fuse/file.c > > > > > @@ -230,6 +230,77 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) > > > > > fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); > > > > > } > > > > > > > > > > +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > > > > > > > We generally try to avoid #ifdef blocks in c files > > > > keep them mostly in h files and use in c files > > > > if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) > > > > > > > > also #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) > > > > it a bit strange for a bool Kconfig because it looks too > > > > much like the c code, so I prefer > > > > #ifdef CONFIG_FUSE_FAMFS_DAX > > > > when you have to use it > > > > > > > > If you need entire functions compiled out, why not put them in famfs.c? > > > > > > Perhaps moving fuse_get_fmap() to famfs.c is the best approach. Will try that > > > first. > > > > > > Regarding '#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)', vs. > > > '#ifdef CONFIG_FUSE_FAMFS_DAX' vs. '#if CONFIG_FUSE_FAMFS_DAX'... > > > > > > I've learned to be cautious there because the latter two are undefined if > > > CONFIG_FUSE_FAMFS_DAX=m. I've been burned by this. > > Yes, that's a risk, but as the code is shaping up right now, > I do not foresee FAMFS becoming a module(?) Yeah, I can't think of a good reason to go that way at this point. > > > > > > > My original thinking was that famfs made sense as a module, but I'm leaning > > > the other way now - and in this series fs/fuse/Kconfig makes it a bool - > > > meaning all three macro tests will work because a bool can't be set to 'm'. > > > > > > So to the extent that I need conditional compilation macros I can switch > > > to '#ifdef...'. > > > > Doh. Spirit of full disclosure: this commit doesn't build if > > CONFIG_FUSE_FAMFS_DAX is not set (!=y). So the conditionals are at > > risk if getting worse, not better. Working on it... > > > > You're probably going to need to add stub inline functions > for all the functions from famfs.c and a few more wrappers > I guess. > > The right amount of ifdefs in C files is really a matter of judgement, > but the fewer the better for code flow clarity. > > Thanks, > Amir. Right - I've done that now, and it actually looks pretty clean to me. Thanks! John
© 2016 - 2025 Red Hat, Inc.