* The new GET_DAXDEV message/response is enabled
* The command it triggered by the update_daxdev_table() call, if there
are any daxdevs in the subject fmap that are not represented in the
daxdev_dable yet.
Signed-off-by: John Groves <john@groves.net>
---
fs/fuse/famfs.c | 227 ++++++++++++++++++++++++++++++++++++++
fs/fuse/famfs_kfmap.h | 26 +++++
fs/fuse/fuse_i.h | 1 +
fs/fuse/inode.c | 4 +-
fs/namei.c | 1 +
include/uapi/linux/fuse.h | 18 +++
6 files changed, 276 insertions(+), 1 deletion(-)
diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
index 41c4d92f1451..f5e01032b825 100644
--- a/fs/fuse/famfs.c
+++ b/fs/fuse/famfs.c
@@ -20,6 +20,230 @@
#include "famfs_kfmap.h"
#include "fuse_i.h"
+/*
+ * famfs_teardown()
+ *
+ * Deallocate famfs metadata for a fuse_conn
+ */
+void /* XXX valid xfs or fuse format? */
+famfs_teardown(struct fuse_conn *fc)
+{
+ struct famfs_dax_devlist *devlist = fc->dax_devlist;
+ int i;
+
+ fc->dax_devlist = NULL;
+
+ if (!devlist)
+ return;
+
+ if (!devlist->devlist)
+ goto out;
+
+ /* Close & release all the daxdevs in our table */
+ for (i = 0; i < devlist->nslots; i++) {
+ if (devlist->devlist[i].valid && devlist->devlist[i].devp)
+ fs_put_dax(devlist->devlist[i].devp, fc);
+ }
+ kfree(devlist->devlist);
+
+out:
+ kfree(devlist);
+}
+
+static int
+famfs_verify_daxdev(const char *pathname, dev_t *devno)
+{
+ struct inode *inode;
+ struct path path;
+ int err;
+
+ if (!pathname || !*pathname)
+ return -EINVAL;
+
+ err = kern_path(pathname, LOOKUP_FOLLOW, &path);
+ if (err)
+ return err;
+
+ inode = d_backing_inode(path.dentry);
+ if (!S_ISCHR(inode->i_mode)) {
+ err = -EINVAL;
+ goto out_path_put;
+ }
+
+ if (!may_open_dev(&path)) { /* had to export this */
+ err = -EACCES;
+ goto out_path_put;
+ }
+
+ *devno = inode->i_rdev;
+
+out_path_put:
+ path_put(&path);
+ return err;
+}
+
+/**
+ * famfs_fuse_get_daxdev() - Retrieve info for a DAX device from fuse server
+ *
+ * Send a GET_DAXDEV message to the fuse server to retrieve info on a
+ * dax device.
+ *
+ * @fm: fuse_mount
+ * @index: the index of the dax device; daxdevs are referred to by index
+ * in fmaps, and the server resolves the index to a particular daxdev
+ *
+ * Returns: 0=success
+ * -errno=failure
+ */
+static int
+famfs_fuse_get_daxdev(struct fuse_mount *fm, const u64 index)
+{
+ struct fuse_daxdev_out daxdev_out = { 0 };
+ struct fuse_conn *fc = fm->fc;
+ struct famfs_daxdev *daxdev;
+ int err = 0;
+
+ FUSE_ARGS(args);
+
+ /* Store the daxdev in our table */
+ if (index >= fc->dax_devlist->nslots) {
+ pr_err("%s: index(%lld) > nslots(%d)\n",
+ __func__, index, fc->dax_devlist->nslots);
+ err = -EINVAL;
+ goto out;
+ }
+
+ args.opcode = FUSE_GET_DAXDEV;
+ args.nodeid = index;
+
+ args.in_numargs = 0;
+
+ args.out_numargs = 1;
+ args.out_args[0].size = sizeof(daxdev_out);
+ args.out_args[0].value = &daxdev_out;
+
+ /* Send GET_DAXDEV command */
+ err = fuse_simple_request(fm, &args);
+ if (err) {
+ pr_err("%s: err=%d from fuse_simple_request()\n",
+ __func__, err);
+ /*
+ * Error will be that the payload is smaller than FMAP_BUFSIZE,
+ * which is the max we can handle. Empty payload handled below.
+ */
+ goto out;
+ }
+
+ down_write(&fc->famfs_devlist_sem);
+
+ daxdev = &fc->dax_devlist->devlist[index];
+
+ /* Abort if daxdev is now valid */
+ if (daxdev->valid) {
+ up_write(&fc->famfs_devlist_sem);
+ /* We already have a valid entry at this index */
+ err = -EALREADY;
+ goto out;
+ }
+
+ /* Verify that the dev is valid and can be opened and gets the devno */
+ err = famfs_verify_daxdev(daxdev_out.name, &daxdev->devno);
+ if (err) {
+ up_write(&fc->famfs_devlist_sem);
+ pr_err("%s: err=%d from famfs_verify_daxdev()\n", __func__, err);
+ goto out;
+ }
+
+ /* This will fail if it's not a dax device */
+ daxdev->devp = dax_dev_get(daxdev->devno);
+ if (!daxdev->devp) {
+ up_write(&fc->famfs_devlist_sem);
+ pr_warn("%s: device %s not found or not dax\n",
+ __func__, daxdev_out.name);
+ err = -ENODEV;
+ goto out;
+ }
+
+ daxdev->name = kstrdup(daxdev_out.name, GFP_KERNEL);
+ wmb(); /* all daxdev fields must be visible before marking it valid */
+ daxdev->valid = 1;
+
+ up_write(&fc->famfs_devlist_sem);
+
+out:
+ return err;
+}
+
+/**
+ * famfs_update_daxdev_table() - Update the daxdev table
+ * @fm - fuse_mount
+ * @meta - famfs_file_meta, in-memory format, built from a GET_FMAP response
+ *
+ * This function is called for each new file fmap, to verify whether all
+ * referenced daxdevs are already known (i.e. in the table). Any daxdev
+ * indices that referenced in @meta but not in the table will be retrieved via
+ * famfs_fuse_get_daxdev() and added to the table
+ *
+ * Return: 0=success
+ * -errno=failure
+ */
+static int
+famfs_update_daxdev_table(
+ struct fuse_mount *fm,
+ const struct famfs_file_meta *meta)
+{
+ struct famfs_dax_devlist *local_devlist;
+ struct fuse_conn *fc = fm->fc;
+ int err;
+ int i;
+
+ /* First time through we will need to allocate the dax_devlist */
+ if (!fc->dax_devlist) {
+ local_devlist = kcalloc(1, sizeof(*fc->dax_devlist), GFP_KERNEL);
+ if (!local_devlist)
+ return -ENOMEM;
+
+ local_devlist->nslots = MAX_DAXDEVS;
+
+ local_devlist->devlist = kcalloc(MAX_DAXDEVS,
+ sizeof(struct famfs_daxdev),
+ GFP_KERNEL);
+ if (!local_devlist->devlist) {
+ kfree(local_devlist);
+ return -ENOMEM;
+ }
+
+ /* We don't need the famfs_devlist_sem here because we use cmpxchg... */
+ if (cmpxchg(&fc->dax_devlist, NULL, local_devlist) != NULL) {
+ kfree(local_devlist->devlist);
+ kfree(local_devlist); /* another thread beat us to it */
+ }
+ }
+
+ down_read(&fc->famfs_devlist_sem);
+ for (i = 0; i < fc->dax_devlist->nslots; i++) {
+ if (meta->dev_bitmap & (1ULL << i)) {
+ /* This file meta struct references devindex i
+ * if devindex i isn't in the table; get it...
+ */
+ if (!(fc->dax_devlist->devlist[i].valid)) {
+ up_read(&fc->famfs_devlist_sem);
+
+ err = famfs_fuse_get_daxdev(fm, i);
+ if (err)
+ pr_err("%s: failed to get daxdev=%d\n",
+ __func__, i);
+
+ down_read(&fc->famfs_devlist_sem);
+ }
+ }
+ }
+ up_read(&fc->famfs_devlist_sem);
+
+ return 0;
+}
+
+/***************************************************************************/
void
__famfs_meta_free(void *famfs_meta)
@@ -336,6 +560,9 @@ famfs_file_init_dax(
if (rc)
goto errout;
+ /* Make sure this fmap doesn't reference any unknown daxdevs */
+ famfs_update_daxdev_table(fm, meta);
+
/* Publish the famfs metadata on fi->famfs_meta */
inode_lock(inode);
if (fi->famfs_meta) {
diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h
index ce785d76719c..f79707b9f761 100644
--- a/fs/fuse/famfs_kfmap.h
+++ b/fs/fuse/famfs_kfmap.h
@@ -60,4 +60,30 @@ struct famfs_file_meta {
};
};
+/**
+ * famfs_daxdev - tracking struct for a daxdev within a famfs file system
+ *
+ * This is the in-memory daxdev metadata that is populated by
+ * the responses to GET_FMAP messages
+ */
+struct famfs_daxdev {
+ /* Include dev uuid? */
+ bool valid;
+ bool error;
+ dev_t devno;
+ struct dax_device *devp;
+ char *name;
+};
+
+#define MAX_DAXDEVS 24
+
+/**
+ * famfs_dax_devlist - list of famfs_daxdev's
+ */
+struct famfs_dax_devlist {
+ int nslots;
+ int ndevs;
+ struct famfs_daxdev *devlist; /* XXX: make this an xarray! */
+};
+
#endif /* FAMFS_KFMAP_H */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fb6095655403..37298551539c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1565,6 +1565,7 @@ int famfs_file_init_dax(struct fuse_mount *fm,
struct inode *inode, void *fmap_buf,
size_t fmap_size);
void __famfs_meta_free(void *map);
+void famfs_teardown(struct fuse_conn *fc);
#endif
static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1682755abf30..c29e9d96ea92 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1049,8 +1049,10 @@ void fuse_conn_put(struct fuse_conn *fc)
}
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_files_free(fc);
- if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+ if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)) {
kfree(fc->shadow);
+ famfs_teardown(fc);
+ }
call_rcu(&fc->rcu, delayed_release);
}
}
diff --git a/fs/namei.c b/fs/namei.c
index ecb7b95c2ca3..75a1e1d46593 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3380,6 +3380,7 @@ bool may_open_dev(const struct path *path)
return !(path->mnt->mnt_flags & MNT_NODEV) &&
!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}
+EXPORT_SYMBOL(may_open_dev);
static int may_open(struct mnt_idmap *idmap, const struct path *path,
int acc_mode, int flag)
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index ecaaa62910f0..8a81b6c334fe 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -235,6 +235,9 @@
* - struct fuse_famfs_simple_ext
* - struct fuse_famfs_iext
* - struct fuse_famfs_fmap_header
+ * - Add the following structs for the GET_DAXDEV message and reply
+ * - struct fuse_get_daxdev_in
+ * - struct fuse_get_daxdev_out
* - Add the following enumerated types
* - enum fuse_famfs_file_type
* - enum famfs_ext_type
@@ -1351,6 +1354,20 @@ struct fuse_famfs_fmap_header {
uint64_t reserved1;
};
+struct fuse_get_daxdev_in {
+ uint32_t daxdev_num;
+};
+
+#define DAXDEV_NAME_MAX 256
+struct fuse_daxdev_out {
+ uint16_t index;
+ uint16_t reserved;
+ uint32_t reserved2;
+ uint64_t reserved3; /* enough space for a uuid if we need it */
+ uint64_t reserved4;
+ char name[DAXDEV_NAME_MAX];
+};
+
static inline int32_t fmap_msg_min_size(void)
{
/* Smallest fmap message is a header plus one simple extent */
@@ -1358,4 +1375,5 @@ static inline int32_t fmap_msg_min_size(void)
+ sizeof(struct fuse_famfs_simple_ext));
}
+
#endif /* _LINUX_FUSE_H */
--
2.49.0
On Thu, 3 Jul 2025 at 20:54, John Groves <John@groves.net> wrote: > > * The new GET_DAXDEV message/response is enabled > * The command it triggered by the update_daxdev_table() call, if there > are any daxdevs in the subject fmap that are not represented in the > daxdev_dable yet. This is rather convoluted, the server *should know* which dax devices it has registered, hence it shouldn't need to be explicitly asked. And there's already an API for registering file descriptors: FUSE_DEV_IOC_BACKING_OPEN. Is there a reason that interface couldn't be used by famfs? Thanks, Miklos
On 25/08/14 03:58PM, Miklos Szeredi wrote: > On Thu, 3 Jul 2025 at 20:54, John Groves <John@groves.net> wrote: > > > > * The new GET_DAXDEV message/response is enabled > > * The command it triggered by the update_daxdev_table() call, if there > > are any daxdevs in the subject fmap that are not represented in the > > daxdev_dable yet. > > This is rather convoluted, the server *should know* which dax devices > it has registered, hence it shouldn't need to be explicitly asked. That's not impossible, but it's also a bit harder than the current approach for the famfs user space - which I think would need to become stateful as to which daxdevs had been pushed into the kernel. The famfs user space is as unstateful as possible ;) > > And there's already an API for registering file descriptors: > FUSE_DEV_IOC_BACKING_OPEN. Is there a reason that interface couldn't > be used by famfs? FUSE_DEV_IOC_BACKING_OPEN looks pretty specific to passthrough. The procedure for opening a daxdev is stolen from the way xfs does it in fs-dax mode. It calls fs_dax_get() rather then open(), and passes in 'famfs_fuse_dax_holder_ops' to 1) effect exclusivity, and 2) receive callbacks in the event of memory errors. See famfs_fuse_get_daxdev()... A *similar* ioctl could be added to push in a daxdev, but that would still add statefulness that isn't required in the current implementation. I didn't go there because there are so few IOCTLs currently (the overall model is more 'pull' than 'push'). Keep in mind that the baseline case with famfs will be files that are interleaved across strips from many daxdevs. We commonly create files that are striped across 16 daxdevs, selected at random from a significantly larger pool. Because interleaving is essential to memory performance... There is no "device mapper" analog for memory, so famfs really does have to span daxdevs. As Darrick commented somewhere, famfs fmaps do repeating patterns well (i.e. striping)... I think there is a certain elegance to the current approach, but if you feel strongly I will change it. Thanks! John
On Fri, Aug 15, 2025 at 11:38:02AM -0500, John Groves wrote: > On 25/08/14 03:58PM, Miklos Szeredi wrote: > > On Thu, 3 Jul 2025 at 20:54, John Groves <John@groves.net> wrote: > > > > > > * The new GET_DAXDEV message/response is enabled > > > * The command it triggered by the update_daxdev_table() call, if there > > > are any daxdevs in the subject fmap that are not represented in the > > > daxdev_dable yet. > > > > This is rather convoluted, the server *should know* which dax devices > > it has registered, hence it shouldn't need to be explicitly asked. > > That's not impossible, but it's also a bit harder than the current > approach for the famfs user space - which I think would need to become > stateful as to which daxdevs had been pushed into the kernel. The > famfs user space is as unstateful as possible ;) > > > > > And there's already an API for registering file descriptors: > > FUSE_DEV_IOC_BACKING_OPEN. Is there a reason that interface couldn't > > be used by famfs? > > FUSE_DEV_IOC_BACKING_OPEN looks pretty specific to passthrough. The > procedure for opening a daxdev is stolen from the way xfs does it in > fs-dax mode. It calls fs_dax_get() rather then open(), and passes in > 'famfs_fuse_dax_holder_ops' to 1) effect exclusivity, and 2) receive > callbacks in the event of memory errors. See famfs_fuse_get_daxdev()... Yeah, that's about what I would do to wire up fsdax in fuse-iomap. > A *similar* ioctl could be added to push in a daxdev, but that would > still add statefulness that isn't required in the current implementation. > I didn't go there because there are so few IOCTLs currently (the overall > model is more 'pull' than 'push'). > > Keep in mind that the baseline case with famfs will be files that are > interleaved across strips from many daxdevs. We commonly create files > that are striped across 16 daxdevs, selected at random from a > significantly larger pool. Because interleaving is essential to memory > performance... > > There is no "device mapper" analog for memory, so famfs really does > have to span daxdevs. As Darrick commented somewhere, famfs fmaps do > repeating patterns well (i.e. striping)... > > I think there is a certain elegance to the current approach, but > if you feel strongly I will change it. I still kinda wonder if you actually want BPF for this sort of thing (programmatically computed file IO mappings) since they'd give you more flexibility than hardcoded C in the kernel. --D > Thanks! > John > >
On Thu, Aug 14, 2025 at 03:58:58PM +0200, Miklos Szeredi wrote: > On Thu, 3 Jul 2025 at 20:54, John Groves <John@groves.net> wrote: > > > > * The new GET_DAXDEV message/response is enabled > > * The command it triggered by the update_daxdev_table() call, if there > > are any daxdevs in the subject fmap that are not represented in the > > daxdev_dable yet. > > This is rather convoluted, the server *should know* which dax devices > it has registered, hence it shouldn't need to be explicitly asked. > > And there's already an API for registering file descriptors: > FUSE_DEV_IOC_BACKING_OPEN. Is there a reason that interface couldn't > be used by famfs? What happens if you want to have a fuse server that hosts both famfs files /and/ backing files? That'd be pretty crazy to mix both paths in one filesystem, but it's in theory possible, particularly if the famfs server wanted to export a pseudofile where everyone could find that shadow file? --D > Thanks, > Miklos >
On Thu, 14 Aug 2025 at 19:19, Darrick J. Wong <djwong@kernel.org> wrote: > What happens if you want to have a fuse server that hosts both famfs > files /and/ backing files? That'd be pretty crazy to mix both paths in > one filesystem, but it's in theory possible, particularly if the famfs > server wanted to export a pseudofile where everyone could find that > shadow file? Either FUSE_DEV_IOC_BACKING_OPEN detects what kind of object it has been handed, or we add a flag that explicitly says this is a dax dev or a block dev or a regular file. I'd prefer the latter. Thanks, Miklos
On 25/08/14 08:25PM, Miklos Szeredi wrote: > On Thu, 14 Aug 2025 at 19:19, Darrick J. Wong <djwong@kernel.org> wrote: > > What happens if you want to have a fuse server that hosts both famfs > > files /and/ backing files? That'd be pretty crazy to mix both paths in > > one filesystem, but it's in theory possible, particularly if the famfs > > server wanted to export a pseudofile where everyone could find that > > shadow file? > > Either FUSE_DEV_IOC_BACKING_OPEN detects what kind of object it has > been handed, or we add a flag that explicitly says this is a dax dev > or a block dev or a regular file. I'd prefer the latter. > > Thanks, > Miklos I have future ideas of famfs supporting non-dax-memory files in a mixed namespace with normal famfs dax files. This seems like the simplest way to relax the "files are strictly pre-allocated" rule. But I think this is orthogonal to how fmaps and backing devs are passed into the kernel. The way I'm thinking about it, the difference would be handled in read/write/mmap. Taking fuse_file_read_iter as the example, the code currently looks like this: if (FUSE_IS_VIRTIO_DAX(fi)) return fuse_dax_read_iter(iocb, to); if (fuse_file_famfs(fi)) return famfs_fuse_read_iter(iocb, to); /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_read_iter(iocb, to); else if (fuse_file_passthrough(ff)) return fuse_passthrough_read_iter(iocb, to); else return fuse_cache_read_iter(iocb, to); If the famfs fuse servert wants a particular file handled via another mechanism -- e.g. READ message to server or passthrough -- the famfs fuse server can just provide an fmap that indicates such. Then fuse_file_famfs(fi) would return false for that file, and it would be handled through other existing mechanisms (which the famfs fuse server would have to handle correctly). Famfs could, for example, allow files to be created as generic or passthrough, and then have a "commit" step that allocated dax memory, moved the data from a non-dax into dax, and appended the file to the famfs metadata log - flipping the file to full-monty-famfs (tm). Prior to the "commit", performance is less but all manner of mutations could be allowed. So I don't think this looks very be hard, and it's independent of the mechanism by which fmaps get into the kernel. Regards, John
On Sat, Aug 16, 2025 at 11:22:49AM -0500, John Groves wrote: > On 25/08/14 08:25PM, Miklos Szeredi wrote: > > On Thu, 14 Aug 2025 at 19:19, Darrick J. Wong <djwong@kernel.org> wrote: > > > What happens if you want to have a fuse server that hosts both famfs > > > files /and/ backing files? That'd be pretty crazy to mix both paths in > > > one filesystem, but it's in theory possible, particularly if the famfs > > > server wanted to export a pseudofile where everyone could find that > > > shadow file? > > > > Either FUSE_DEV_IOC_BACKING_OPEN detects what kind of object it has > > been handed, or we add a flag that explicitly says this is a dax dev > > or a block dev or a regular file. I'd prefer the latter. > > > > Thanks, > > Miklos > > I have future ideas of famfs supporting non-dax-memory files in a mixed > namespace with normal famfs dax files. This seems like the simplest way > to relax the "files are strictly pre-allocated" rule. But I think this > is orthogonal to how fmaps and backing devs are passed into the kernel. > > The way I'm thinking about it, the difference would be handled in > read/write/mmap. Taking fuse_file_read_iter as the example, the code > currently looks like this: > > if (FUSE_IS_VIRTIO_DAX(fi)) > return fuse_dax_read_iter(iocb, to); > if (fuse_file_famfs(fi)) > return famfs_fuse_read_iter(iocb, to); > > /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ > if (ff->open_flags & FOPEN_DIRECT_IO) > return fuse_direct_read_iter(iocb, to); > else if (fuse_file_passthrough(ff)) > return fuse_passthrough_read_iter(iocb, to); > else > return fuse_cache_read_iter(iocb, to); > > If the famfs fuse servert wants a particular file handled via another > mechanism -- e.g. READ message to server or passthrough -- the famfs > fuse server can just provide an fmap that indicates such. Then > fuse_file_famfs(fi) would return false for that file, and it would be > handled through other existing mechanisms (which the famfs fuse > server would have to handle correctly). > > Famfs could, for example, allow files to be created as generic or > passthrough, and then have a "commit" step that allocated dax memory, > moved the data from a non-dax into dax, and appended the file to the > famfs metadata log - flipping the file to full-monty-famfs (tm). > Prior to the "commit", performance is less but all manner of mutations > could be allowed. > > So I don't think this looks very be hard, and it's independent of the > mechanism by which fmaps get into the kernel. This is one thing I wasn't planning -- iomap files are always that, and there's no fallback to any of the other IO strategies. The pagecache handling parts of iomap require things such as i_rwsem controlling access to a file no matter how many places it's hardlinked, and timestamp/mode/acl handling working more or less the same way they do in xfs and ext4. iomap isn't all that congruent with the way that the other IO paths (passthrough, writeback_cache, and "directio" files) work. Though to undercut my own point partially, sending an "inline data" mapping to the kernel causes it to call FUSE_READ/FUSE_WRITE and then you can inject whatever IO path you want. OTOH the iomap inlinedata paths are ... not well tested for pos > 0. --D > Regards, > John > > >
On Thu, Aug 14, 2025 at 08:25:23PM +0200, Miklos Szeredi wrote: > On Thu, 14 Aug 2025 at 19:19, Darrick J. Wong <djwong@kernel.org> wrote: > > What happens if you want to have a fuse server that hosts both famfs > > files /and/ backing files? That'd be pretty crazy to mix both paths in > > one filesystem, but it's in theory possible, particularly if the famfs > > server wanted to export a pseudofile where everyone could find that > > shadow file? > > Either FUSE_DEV_IOC_BACKING_OPEN detects what kind of object it has > been handed, or we add a flag that explicitly says this is a dax dev > or a block dev or a regular file. I'd prefer the latter. I don't think it's difficult to do something like: if (!fud) return -EPERM; if (copy_from_user(&map, argp, sizeof(map))) return -EFAULT; if (IS_ENABLED(CONFIG_FUSE_IOMAP)) { ret = fuse_iomap_dev_add(fud->fc, &map); if (ret) return ret; } if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return fuse_backing_open(fud->fc, &map); return 0; I guess the hard part is -- how do we return /two/ device cookies? Or do we move the backing_files_map out of CONFIG_FUSE_PASSTHROUGH and then let fuse-iomap/famfs extract the block/dax device from that? Then the backing_id/device cookie would be the same across a fuse mount. iomap would have to check that it's being given block devices, but that's easy. --D > Thanks, > Miklos >
On Thu, 14 Aug 2025 at 20:55, Darrick J. Wong <djwong@kernel.org> wrote: > Or do we move the backing_files_map out of CONFIG_FUSE_PASSTHROUGH and > then let fuse-iomap/famfs extract the block/dax device from that? > Then the backing_id/device cookie would be the same across a fuse mount. Yes. Thanks, Miklos
On Thu, 3 Jul 2025 13:50:28 -0500 John Groves <John@Groves.net> wrote: > * The new GET_DAXDEV message/response is enabled > * The command it triggered by the update_daxdev_table() call, if there > are any daxdevs in the subject fmap that are not represented in the > daxdev_dable yet. > > Signed-off-by: John Groves <john@groves.net> More drive by stuff you can ignore for now if you like. > --- > fs/fuse/famfs.c | 227 ++++++++++++++++++++++++++++++++++++++ > fs/fuse/famfs_kfmap.h | 26 +++++ > fs/fuse/fuse_i.h | 1 + > fs/fuse/inode.c | 4 +- > fs/namei.c | 1 + > include/uapi/linux/fuse.h | 18 +++ > 6 files changed, 276 insertions(+), 1 deletion(-) > > diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c > index 41c4d92f1451..f5e01032b825 100644 > --- a/fs/fuse/famfs.c > +++ b/fs/fuse/famfs.c > +/** > + * famfs_fuse_get_daxdev() - Retrieve info for a DAX device from fuse server > + * > + * Send a GET_DAXDEV message to the fuse server to retrieve info on a > + * dax device. > + * > + * @fm: fuse_mount > + * @index: the index of the dax device; daxdevs are referred to by index > + * in fmaps, and the server resolves the index to a particular daxdev > + * > + * Returns: 0=success > + * -errno=failure > + */ > +static int > +famfs_fuse_get_daxdev(struct fuse_mount *fm, const u64 index) > +{ > + struct fuse_daxdev_out daxdev_out = { 0 }; > + struct fuse_conn *fc = fm->fc; > + struct famfs_daxdev *daxdev; > + int err = 0; > + > + FUSE_ARGS(args); > + > + /* Store the daxdev in our table */ > + if (index >= fc->dax_devlist->nslots) { > + pr_err("%s: index(%lld) > nslots(%d)\n", > + __func__, index, fc->dax_devlist->nslots); > + err = -EINVAL; > + goto out; > + } > + > + args.opcode = FUSE_GET_DAXDEV; > + args.nodeid = index; > + > + args.in_numargs = 0; > + > + args.out_numargs = 1; > + args.out_args[0].size = sizeof(daxdev_out); > + args.out_args[0].value = &daxdev_out; > + > + /* Send GET_DAXDEV command */ > + err = fuse_simple_request(fm, &args); > + if (err) { > + pr_err("%s: err=%d from fuse_simple_request()\n", > + __func__, err); > + /* > + * Error will be that the payload is smaller than FMAP_BUFSIZE, > + * which is the max we can handle. Empty payload handled below. > + */ > + goto out; > + } > + > + down_write(&fc->famfs_devlist_sem); Worth thinking about guard() in this code in general. Simplify some of the error paths at least. > + > + daxdev = &fc->dax_devlist->devlist[index]; > + > + /* Abort if daxdev is now valid */ > + if (daxdev->valid) { > + up_write(&fc->famfs_devlist_sem); > + /* We already have a valid entry at this index */ > + err = -EALREADY; > + goto out; > + } > + > + /* Verify that the dev is valid and can be opened and gets the devno */ > + err = famfs_verify_daxdev(daxdev_out.name, &daxdev->devno); > + if (err) { > + up_write(&fc->famfs_devlist_sem); > + pr_err("%s: err=%d from famfs_verify_daxdev()\n", __func__, err); > + goto out; > + } > + > + /* This will fail if it's not a dax device */ > + daxdev->devp = dax_dev_get(daxdev->devno); > + if (!daxdev->devp) { > + up_write(&fc->famfs_devlist_sem); > + pr_warn("%s: device %s not found or not dax\n", > + __func__, daxdev_out.name); > + err = -ENODEV; > + goto out; > + } > + > + daxdev->name = kstrdup(daxdev_out.name, GFP_KERNEL); > + wmb(); /* all daxdev fields must be visible before marking it valid */ > + daxdev->valid = 1; > + > + up_write(&fc->famfs_devlist_sem); > + > +out: > + return err; > +} > + > +/** > + * famfs_update_daxdev_table() - Update the daxdev table > + * @fm - fuse_mount > + * @meta - famfs_file_meta, in-memory format, built from a GET_FMAP response > + * > + * This function is called for each new file fmap, to verify whether all > + * referenced daxdevs are already known (i.e. in the table). Any daxdev > + * indices that referenced in @meta but not in the table will be retrieved via > + * famfs_fuse_get_daxdev() and added to the table > + * > + * Return: 0=success > + * -errno=failure > + */ > +static int > +famfs_update_daxdev_table( > + struct fuse_mount *fm, > + const struct famfs_file_meta *meta) > +{ > + struct famfs_dax_devlist *local_devlist; > + struct fuse_conn *fc = fm->fc; > + int err; > + int i; > + > + /* First time through we will need to allocate the dax_devlist */ > + if (!fc->dax_devlist) { > + local_devlist = kcalloc(1, sizeof(*fc->dax_devlist), GFP_KERNEL); > + if (!local_devlist) > + return -ENOMEM; > + > + local_devlist->nslots = MAX_DAXDEVS; > + > + local_devlist->devlist = kcalloc(MAX_DAXDEVS, > + sizeof(struct famfs_daxdev), > + GFP_KERNEL); > + if (!local_devlist->devlist) { > + kfree(local_devlist); > + return -ENOMEM; > + } > + > + /* We don't need the famfs_devlist_sem here because we use cmpxchg... */ > + if (cmpxchg(&fc->dax_devlist, NULL, local_devlist) != NULL) { > + kfree(local_devlist->devlist); > + kfree(local_devlist); /* another thread beat us to it */ > + } > + } > + > + down_read(&fc->famfs_devlist_sem); > + for (i = 0; i < fc->dax_devlist->nslots; i++) { > + if (meta->dev_bitmap & (1ULL << i)) { Flip for readability. if (!(meta->dev_bitmap & (1ULL << i)) continue; Or can we use bitmap_from_arr64() and for_each_set_bit() to optimize this a little. > + /* This file meta struct references devindex i > + * if devindex i isn't in the table; get it... > + */ > + if (!(fc->dax_devlist->devlist[i].valid)) { > + up_read(&fc->famfs_devlist_sem); > + > + err = famfs_fuse_get_daxdev(fm, i); > + if (err) > + pr_err("%s: failed to get daxdev=%d\n", > + __func__, i); Don't want to surface that error? > + > + down_read(&fc->famfs_devlist_sem); > + } > + } > + } > + up_read(&fc->famfs_devlist_sem); > + > + return 0; > +} > + > +/***************************************************************************/ ? > diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h > index ce785d76719c..f79707b9f761 100644 > --- a/fs/fuse/famfs_kfmap.h > +++ b/fs/fuse/famfs_kfmap.h > @@ -60,4 +60,30 @@ struct famfs_file_meta { > }; > }; > > +/** > + * famfs_daxdev - tracking struct for a daxdev within a famfs file system > + * > + * This is the in-memory daxdev metadata that is populated by > + * the responses to GET_FMAP messages > + */ > +struct famfs_daxdev { > + /* Include dev uuid? */ > + bool valid; > + bool error; > + dev_t devno; > + struct dax_device *devp; > + char *name; > +}; > + > +#define MAX_DAXDEVS 24 > + > +/** > + * famfs_dax_devlist - list of famfs_daxdev's Run kernel-doc script over these. It gets grumpy about partial documentation. > + */ > +struct famfs_dax_devlist { > + int nslots; > + int ndevs; > + struct famfs_daxdev *devlist; /* XXX: make this an xarray! */ > +}; > + > #endif /* FAMFS_KFMAP_H */ > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index ecaaa62910f0..8a81b6c334fe 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -235,6 +235,9 @@ > * - struct fuse_famfs_simple_ext > * - struct fuse_famfs_iext > * - struct fuse_famfs_fmap_header > + * - Add the following structs for the GET_DAXDEV message and reply > + * - struct fuse_get_daxdev_in > + * - struct fuse_get_daxdev_out > * - Add the following enumerated types > * - enum fuse_famfs_file_type > * - enum famfs_ext_type > @@ -1351,6 +1354,20 @@ struct fuse_famfs_fmap_header { > uint64_t reserved1; > }; > > +struct fuse_get_daxdev_in { > + uint32_t daxdev_num; > +}; > + > +#define DAXDEV_NAME_MAX 256 > +struct fuse_daxdev_out { > + uint16_t index; > + uint16_t reserved; > + uint32_t reserved2; > + uint64_t reserved3; /* enough space for a uuid if we need it */ Odd place for the comment. If it just refers to reserved3 then nope not enough space. If you mean that and reserved4 then fiar enough but that's not obvious as it stands. > + uint64_t reserved4; > + char name[DAXDEV_NAME_MAX]; > +}; > + > static inline int32_t fmap_msg_min_size(void) > { > /* Smallest fmap message is a header plus one simple extent */ > @@ -1358,4 +1375,5 @@ static inline int32_t fmap_msg_min_size(void) > + sizeof(struct fuse_famfs_simple_ext)); > } > > + Stray change. Worth a quick scrub to clean these out (even in an RFC) as they just add noise to the bits you want people to look at! > #endif /* _LINUX_FUSE_H */
On 25/07/04 02:20PM, Jonathan Cameron wrote: > On Thu, 3 Jul 2025 13:50:28 -0500 > John Groves <John@Groves.net> wrote: > > > * The new GET_DAXDEV message/response is enabled > > * The command it triggered by the update_daxdev_table() call, if there > > are any daxdevs in the subject fmap that are not represented in the > > daxdev_dable yet. > > > > Signed-off-by: John Groves <john@groves.net> > > More drive by stuff you can ignore for now if you like. Always appreciated... > > > --- > > fs/fuse/famfs.c | 227 ++++++++++++++++++++++++++++++++++++++ > > fs/fuse/famfs_kfmap.h | 26 +++++ > > fs/fuse/fuse_i.h | 1 + > > fs/fuse/inode.c | 4 +- > > fs/namei.c | 1 + > > include/uapi/linux/fuse.h | 18 +++ > > 6 files changed, 276 insertions(+), 1 deletion(-) > > > > diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c > > index 41c4d92f1451..f5e01032b825 100644 > > --- a/fs/fuse/famfs.c > > +++ b/fs/fuse/famfs.c > > > +/** > > + * famfs_fuse_get_daxdev() - Retrieve info for a DAX device from fuse server > > + * > > + * Send a GET_DAXDEV message to the fuse server to retrieve info on a > > + * dax device. > > + * > > + * @fm: fuse_mount > > + * @index: the index of the dax device; daxdevs are referred to by index > > + * in fmaps, and the server resolves the index to a particular daxdev > > + * > > + * Returns: 0=success > > + * -errno=failure > > + */ > > +static int > > +famfs_fuse_get_daxdev(struct fuse_mount *fm, const u64 index) > > +{ > > + struct fuse_daxdev_out daxdev_out = { 0 }; > > + struct fuse_conn *fc = fm->fc; > > + struct famfs_daxdev *daxdev; > > + int err = 0; > > + > > + FUSE_ARGS(args); > > + > > + /* Store the daxdev in our table */ > > + if (index >= fc->dax_devlist->nslots) { > > + pr_err("%s: index(%lld) > nslots(%d)\n", > > + __func__, index, fc->dax_devlist->nslots); > > + err = -EINVAL; > > + goto out; > > + } > > + > > + args.opcode = FUSE_GET_DAXDEV; > > + args.nodeid = index; > > + > > + args.in_numargs = 0; > > + > > + args.out_numargs = 1; > > + args.out_args[0].size = sizeof(daxdev_out); > > + args.out_args[0].value = &daxdev_out; > > + > > + /* Send GET_DAXDEV command */ > > + err = fuse_simple_request(fm, &args); > > + if (err) { > > + pr_err("%s: err=%d from fuse_simple_request()\n", > > + __func__, err); > > + /* > > + * Error will be that the payload is smaller than FMAP_BUFSIZE, > > + * which is the max we can handle. Empty payload handled below. > > + */ > > + goto out; > > + } > > + > > + down_write(&fc->famfs_devlist_sem); > > Worth thinking about guard() in this code in general. > Simplify some of the error paths at least. Thinking about it. Not sure I'll go there yet; I find the guard macros a bit confusing... > > > + > > + daxdev = &fc->dax_devlist->devlist[index]; > > + > > + /* Abort if daxdev is now valid */ > > + if (daxdev->valid) { > > + up_write(&fc->famfs_devlist_sem); > > + /* We already have a valid entry at this index */ > > + err = -EALREADY; > > + goto out; > > + } > > + > > + /* Verify that the dev is valid and can be opened and gets the devno */ > > + err = famfs_verify_daxdev(daxdev_out.name, &daxdev->devno); > > + if (err) { > > + up_write(&fc->famfs_devlist_sem); > > + pr_err("%s: err=%d from famfs_verify_daxdev()\n", __func__, err); > > + goto out; > > + } > > + > > + /* This will fail if it's not a dax device */ > > + daxdev->devp = dax_dev_get(daxdev->devno); > > + if (!daxdev->devp) { > > + up_write(&fc->famfs_devlist_sem); > > + pr_warn("%s: device %s not found or not dax\n", > > + __func__, daxdev_out.name); > > + err = -ENODEV; > > + goto out; > > + } > > + > > + daxdev->name = kstrdup(daxdev_out.name, GFP_KERNEL); > > + wmb(); /* all daxdev fields must be visible before marking it valid */ > > + daxdev->valid = 1; > > + > > + up_write(&fc->famfs_devlist_sem); > > + > > +out: > > + return err; > > +} > > + > > +/** > > + * famfs_update_daxdev_table() - Update the daxdev table > > + * @fm - fuse_mount > > + * @meta - famfs_file_meta, in-memory format, built from a GET_FMAP response > > + * > > + * This function is called for each new file fmap, to verify whether all > > + * referenced daxdevs are already known (i.e. in the table). Any daxdev > > + * indices that referenced in @meta but not in the table will be retrieved via > > + * famfs_fuse_get_daxdev() and added to the table > > + * > > + * Return: 0=success > > + * -errno=failure > > + */ > > +static int > > +famfs_update_daxdev_table( > > + struct fuse_mount *fm, > > + const struct famfs_file_meta *meta) > > +{ > > + struct famfs_dax_devlist *local_devlist; > > + struct fuse_conn *fc = fm->fc; > > + int err; > > + int i; > > + > > + /* First time through we will need to allocate the dax_devlist */ > > + if (!fc->dax_devlist) { > > + local_devlist = kcalloc(1, sizeof(*fc->dax_devlist), GFP_KERNEL); > > + if (!local_devlist) > > + return -ENOMEM; > > + > > + local_devlist->nslots = MAX_DAXDEVS; > > + > > + local_devlist->devlist = kcalloc(MAX_DAXDEVS, > > + sizeof(struct famfs_daxdev), > > + GFP_KERNEL); > > + if (!local_devlist->devlist) { > > + kfree(local_devlist); > > + return -ENOMEM; > > + } > > + > > + /* We don't need the famfs_devlist_sem here because we use cmpxchg... */ > > + if (cmpxchg(&fc->dax_devlist, NULL, local_devlist) != NULL) { > > + kfree(local_devlist->devlist); > > + kfree(local_devlist); /* another thread beat us to it */ > > + } > > + } > > + > > + down_read(&fc->famfs_devlist_sem); > > + for (i = 0; i < fc->dax_devlist->nslots; i++) { > > + if (meta->dev_bitmap & (1ULL << i)) { > Flip for readability. > if (!(meta->dev_bitmap & (1ULL << i)) > continue; I like it - done.. > > Or can we use bitmap_from_arr64() and > for_each_set_bit() to optimize this a little. Could do, but I feel like that's a bit harder [for me] to read. > > > + /* This file meta struct references devindex i > > + * if devindex i isn't in the table; get it... > > + */ > > + if (!(fc->dax_devlist->devlist[i].valid)) { > > + up_read(&fc->famfs_devlist_sem); > > + > > + err = famfs_fuse_get_daxdev(fm, i); > > + if (err) > > + pr_err("%s: failed to get daxdev=%d\n", > > + __func__, i); > Don't want to surface that error? I'm thinking on that. Failure to retrieve a dax device is currently game over for the whole mount (because there is just one of them currently, and it's retrieved to get access to the superblock and metadata log). Once additional daxdevs are enabled there will be more nuance, but any file that references a 'missing' dax device will be non-operative, so putting something in the log makes sense to me. I may surface it a bit differently, but I think it needs to surface. > > + > > + down_read(&fc->famfs_devlist_sem); > > + } > > + } > > + } > > + up_read(&fc->famfs_devlist_sem); > > + > > + return 0; > > +} > > + > > +/***************************************************************************/ > > ? One of my tics is divider comments. Will probably drop it though ;) > > > diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h > > index ce785d76719c..f79707b9f761 100644 > > --- a/fs/fuse/famfs_kfmap.h > > +++ b/fs/fuse/famfs_kfmap.h > > @@ -60,4 +60,30 @@ struct famfs_file_meta { > > }; > > }; > > > > +/** > > + * famfs_daxdev - tracking struct for a daxdev within a famfs file system > > + * > > + * This is the in-memory daxdev metadata that is populated by > > + * the responses to GET_FMAP messages > > + */ > > +struct famfs_daxdev { > > + /* Include dev uuid? */ > > + bool valid; > > + bool error; > > + dev_t devno; > > + struct dax_device *devp; > > + char *name; > > +}; > > + > > +#define MAX_DAXDEVS 24 > > + > > +/** > > + * famfs_dax_devlist - list of famfs_daxdev's > > Run kernel-doc script over these. It gets grumpy about partial > documentation. Thank you... I just did, and fixed a couple of issues it complained about. > > > + */ > > +struct famfs_dax_devlist { > > + int nslots; > > + int ndevs; > > + struct famfs_daxdev *devlist; /* XXX: make this an xarray! */ > > +}; > > + > > #endif /* FAMFS_KFMAP_H */ > > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > > index ecaaa62910f0..8a81b6c334fe 100644 > > --- a/include/uapi/linux/fuse.h > > +++ b/include/uapi/linux/fuse.h > > @@ -235,6 +235,9 @@ > > * - struct fuse_famfs_simple_ext > > * - struct fuse_famfs_iext > > * - struct fuse_famfs_fmap_header > > + * - Add the following structs for the GET_DAXDEV message and reply > > + * - struct fuse_get_daxdev_in > > + * - struct fuse_get_daxdev_out > > * - Add the following enumerated types > > * - enum fuse_famfs_file_type > > * - enum famfs_ext_type > > @@ -1351,6 +1354,20 @@ struct fuse_famfs_fmap_header { > > uint64_t reserved1; > > }; > > > > +struct fuse_get_daxdev_in { > > + uint32_t daxdev_num; > > +}; > > + > > +#define DAXDEV_NAME_MAX 256 > > +struct fuse_daxdev_out { > > + uint16_t index; > > + uint16_t reserved; > > + uint32_t reserved2; > > + uint64_t reserved3; /* enough space for a uuid if we need it */ > > Odd place for the comment. If it just refers to reserved3 then nope > not enough space. If you mean that and reserved4 then fiar enough > but that's not obvious as it stands. Good point. Moved it above in -next > > > + uint64_t reserved4; > > + char name[DAXDEV_NAME_MAX]; > > +}; > > + > > static inline int32_t fmap_msg_min_size(void) > > { > > /* Smallest fmap message is a header plus one simple extent */ > > @@ -1358,4 +1375,5 @@ static inline int32_t fmap_msg_min_size(void) > > + sizeof(struct fuse_famfs_simple_ext)); > > } > > > > + > Stray change. Worth a quick scrub to clean these out (even in an RFC) as they just add > noise to the bits you want people to look at! Yup, will fix. BTW, public service announcement: I've discovered the awesomeness of jj (aka ju jutsu, aka jj-vcs) as a wrapper for git that is great at the kind of rebase problems that come with factoring and re-factoring patch set branches. Without jj, more stuff like this would have slipped through ;) <snip> Thanks! John
© 2016 - 2025 Red Hat, Inc.