[v1] famfs: port into fuse

[RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by John Groves 7 months, 3 weeks ago

On completion of GET_FMAP message/response, set up the full famfs
metadata such that it's possible to handle read/write/mmap directly to
dax. Note that the devdax_iomap plumbing is not in yet...

Update MAINTAINERS for the new files.

Signed-off-by: John Groves <john@groves.net>
---
 MAINTAINERS               |   9 +
 fs/fuse/Makefile          |   2 +-
 fs/fuse/dir.c             |   3 +
 fs/fuse/famfs.c           | 344 ++++++++++++++++++++++++++++++++++++++
 fs/fuse/famfs_kfmap.h     |  63 +++++++
 fs/fuse/fuse_i.h          |  16 +-
 fs/fuse/inode.c           |   2 +-
 include/uapi/linux/fuse.h |  42 +++++
 8 files changed, 477 insertions(+), 4 deletions(-)
 create mode 100644 fs/fuse/famfs.c
 create mode 100644 fs/fuse/famfs_kfmap.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 00e94bec401e..2a5a7e0e8b28 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8808,6 +8808,15 @@ F:	Documentation/networking/failover.rst
 F:	include/net/failover.h
 F:	net/core/failover.c
 
+FAMFS
+M:	John Groves <jgroves@micron.com>
+M:	John Groves <John@Groves.net>
+L:	linux-cxl@vger.kernel.org
+L:	linux-fsdevel@vger.kernel.org
+S:	Supported
+F:	fs/fuse/famfs.c
+F:	fs/fuse/famfs_kfmap.h
+
 FANOTIFY
 M:	Jan Kara <jack@suse.cz>
 R:	Amir Goldstein <amir73il@gmail.com>
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 3f0f312a31c1..65a12975d734 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -16,5 +16,5 @@ fuse-$(CONFIG_FUSE_DAX) += dax.o
 fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
 fuse-$(CONFIG_SYSCTL) += sysctl.o
 fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
-
+fuse-$(CONFIG_FUSE_FAMFS_DAX) += famfs.o
 virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index ae135c55b9f6..b28a1e912d6b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -405,6 +405,9 @@ fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid)
 	fmap_size = args.out_args[0].size;
 	pr_notice("%s: nodei=%lld fmap_size=%ld\n", __func__, nodeid, fmap_size);
 
+	/* Convert fmap into in-memory format and hang from inode */
+	famfs_file_init_dax(fm, inode, fmap_buf, fmap_size);
+
 	return 0;
 }
 #endif
diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
new file mode 100644
index 000000000000..e62c047d0950
--- /dev/null
+++ b/fs/fuse/famfs.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * famfs - dax file system for shared fabric-attached memory
+ *
+ * Copyright 2023-2025 Micron Technology, Inc.
+ *
+ * This file system, originally based on ramfs and the dax support from xfs,
+ * is intended to allow multiple host systems to mount a common file system
+ * view of dax files that map to shared memory.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/dax.h>
+#include <linux/iomap.h>
+#include <linux/path.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+
+#include "famfs_kfmap.h"
+#include "fuse_i.h"
+
+
+void
+__famfs_meta_free(void *famfs_meta)
+{
+	struct famfs_file_meta *fmap = famfs_meta;
+
+	if (!fmap)
+		return;
+
+	if (fmap) {
+		switch (fmap->fm_extent_type) {
+		case SIMPLE_DAX_EXTENT:
+			kfree(fmap->se);
+			break;
+		case INTERLEAVED_EXTENT:
+			if (fmap->ie)
+				kfree(fmap->ie->ie_strips);
+
+			kfree(fmap->ie);
+			break;
+		default:
+			pr_err("%s: invalid fmap type\n", __func__);
+			break;
+		}
+	}
+	kfree(fmap);
+}
+
+static int
+famfs_check_ext_alignment(struct famfs_meta_simple_ext *se)
+{
+	int errs = 0;
+
+	if (se->dev_index != 0)
+		errs++;
+
+	/* TODO: pass in alignment so we can support the other page sizes */
+	if (!IS_ALIGNED(se->ext_offset, PMD_SIZE))
+		errs++;
+
+	if (!IS_ALIGNED(se->ext_len, PMD_SIZE))
+		errs++;
+
+	return errs;
+}
+
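+/**
+ * famfs_meta_alloc_v3() - Allocate famfs file metadata from an fmap buffer
+ * @fmap_buf:      GET_FMAP payload: fmap header followed by its extents
+ * @fmap_buf_size: Size of @fmap_buf in bytes
+ * @metap:         Pointer to a famfs_file_meta pointer; set on success
+ */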
+static int
+famfs_meta_alloc_v3(
+	void *fmap_buf,
+	size_t fmap_buf_size,
+	struct famfs_file_meta **metap)
+{
+	struct famfs_file_meta *meta = NULL;
+	struct fuse_famfs_fmap_header *fmh;
+	size_t extent_total = 0;
+	size_t next_offset = 0;
+	int errs = 0;
+	int i, j;
+	int rc;
+
+	fmh = (struct fuse_famfs_fmap_header *)fmap_buf;
+
+	/* Move past fmh in fmap_buf */
+	next_offset += sizeof(*fmh);
+	if (next_offset > fmap_buf_size) {
+		pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+		       __func__, __LINE__, next_offset, fmap_buf_size);
+		rc = -EINVAL;
+		goto errout;
+	}
+
+	if (fmh->nextents < 1) {
+		pr_err("%s: nextents %d < 1\n", __func__, fmh->nextents);
+		rc = -EINVAL;
+		goto errout;
+	}
+
+	if (fmh->nextents > FUSE_FAMFS_MAX_EXTENTS) {
+		pr_err("%s: nextents %d > max (%d) 1\n",
+		       __func__, fmh->nextents, FUSE_FAMFS_MAX_EXTENTS);
+		rc = -E2BIG;
+		goto errout;
+	}
+
+	meta = kzalloc(sizeof(*meta), GFP_KERNEL);
+	if (!meta)
+		return -ENOMEM;
+	meta->error = false;
+
+	meta->file_type = fmh->file_type;
+	meta->file_size = fmh->file_size;
+	meta->fm_extent_type = fmh->ext_type;
+
+	switch (fmh->ext_type) {
+	case FUSE_FAMFS_EXT_SIMPLE: {
+		struct fuse_famfs_simple_ext *se_in;
+
+		se_in = (struct fuse_famfs_simple_ext *)(fmap_buf + next_offset);
+
+		/* Move past simple extents */
+		next_offset += fmh->nextents * sizeof(*se_in);
+		if (next_offset > fmap_buf_size) {
+			pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+			       __func__, __LINE__, next_offset, fmap_buf_size);
+			rc = -EINVAL;
+			goto errout;
+		}
+
+		meta->fm_nextents = fmh->nextents;
+
+		meta->se = kcalloc(meta->fm_nextents, sizeof(*(meta->se)),
+				   GFP_KERNEL);
+		if (!meta->se) {
+			rc = -ENOMEM;
+			goto errout;
+		}
+
+		if ((meta->fm_nextents > FUSE_FAMFS_MAX_EXTENTS) ||
+		    (meta->fm_nextents < 1)) {
+			rc = -EINVAL;
+			goto errout;
+		}
+
+		for (i = 0; i < fmh->nextents; i++) {
+			meta->se[i].dev_index  = se_in[i].se_devindex;
+			meta->se[i].ext_offset = se_in[i].se_offset;
+			meta->se[i].ext_len    = se_in[i].se_len;
+
+			/* Record bitmap of referenced daxdev indices */
+			meta->dev_bitmap |= (1 << meta->se[i].dev_index);
+
+			errs += famfs_check_ext_alignment(&meta->se[i]);
+
+			extent_total += meta->se[i].ext_len;
+		}
+		break;
+	}
+
+	case FUSE_FAMFS_EXT_INTERLEAVE: {
+		s64 size_remainder = meta->file_size;
+		struct fuse_famfs_iext *ie_in;
+		int niext = fmh->nextents;
+
+		meta->fm_niext = niext;
+
+		/* Allocate interleaved extent */
+		meta->ie = kcalloc(niext, sizeof(*(meta->ie)), GFP_KERNEL);
+		if (!meta->ie) {
+			rc = -ENOMEM;
+			goto errout;
+		}
+
+		/*
+		 * Each interleaved extent has a simple extent list of strips.
+		 * Outer loop is over separate interleaved extents
+		 */
+		for (i = 0; i < niext; i++) {
+			u64 nstrips;
+			struct fuse_famfs_simple_ext *sie_in;
+
+			/* ie_in = one interleaved extent in fmap_buf */
+			ie_in = (struct fuse_famfs_iext *)
+				(fmap_buf + next_offset);
+
+			/* Move past one interleaved extent header in fmap_buf */
+			next_offset += sizeof(*ie_in);
+			if (next_offset > fmap_buf_size) {
+				pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+				       __func__, __LINE__, next_offset, fmap_buf_size);
+				rc = -EINVAL;
+				goto errout;
+			}
+
+			nstrips = ie_in->ie_nstrips;
+			meta->ie[i].fie_chunk_size = ie_in->ie_chunk_size;
+			meta->ie[i].fie_nstrips    = ie_in->ie_nstrips;
+			meta->ie[i].fie_nbytes     = ie_in->ie_nbytes;
+
+			if (!meta->ie[i].fie_nbytes) {
+				pr_err("%s: zero-length interleave!\n",
+				       __func__);
+				rc = -EINVAL;
+				goto errout;
+			}
+
+			/* sie_in = the strip extents in fmap_buf */
+			sie_in = (struct fuse_famfs_simple_ext *)
+				(fmap_buf + next_offset);
+
+			/* Move past strip extents in fmap_buf */
+			next_offset += nstrips * sizeof(*sie_in);
+			if (next_offset > fmap_buf_size) {
+				pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+				       __func__, __LINE__, next_offset, fmap_buf_size);
+				rc = -EINVAL;
+				goto errout;
+			}
+
+			if ((nstrips > FUSE_FAMFS_MAX_STRIPS) || (nstrips < 1)) {
+				pr_err("%s: invalid nstrips=%lld (max=%d)\n",
+				       __func__, nstrips,
+				       FUSE_FAMFS_MAX_STRIPS);
+				errs++;
+			}
+
+			/* Allocate strip extent array */
+			meta->ie[i].ie_strips = kcalloc(ie_in->ie_nstrips,
+					sizeof(meta->ie[i].ie_strips[0]),
+							GFP_KERNEL);
+			if (!meta->ie[i].ie_strips) {
+				rc = -ENOMEM;
+				goto errout;
+			}
+
+			/* Inner loop is over strips */
+			for (j = 0; j < nstrips; j++) {
+				struct famfs_meta_simple_ext *strips_out;
+				u64 devindex = sie_in[j].se_devindex;
+				u64 offset   = sie_in[j].se_offset;
+				u64 len      = sie_in[j].se_len;
+
+				strips_out = meta->ie[i].ie_strips;
+				strips_out[j].dev_index  = devindex;
+				strips_out[j].ext_offset = offset;
+				strips_out[j].ext_len    = len;
+
+				/* Record bitmap of referenced daxdev indices */
+				meta->dev_bitmap |= (1 << devindex);
+
+				extent_total += len;
+				errs += famfs_check_ext_alignment(&strips_out[j]);
+				size_remainder -= len;
+			}
+		}
+
+		if (size_remainder > 0) {
+			/* Sum of interleaved extent sizes is less than file size! */
+			pr_err("%s: size_remainder %lld (0x%llx)\n",
+			       __func__, size_remainder, size_remainder);
+			rc = -EINVAL;
+			goto errout;
+		}
+		break;
+	}
+
+	default:
+		pr_err("%s: invalid ext_type %d\n", __func__, fmh->ext_type);
+		rc = -EINVAL;
+		goto errout;
+	}
+
+	if (errs > 0) {
+		pr_err("%s: %d alignment errors found\n", __func__, errs);
+		rc = -EINVAL;
+		goto errout;
+	}
+
+	/* More sanity checks */
+	if (extent_total < meta->file_size) {
+		pr_err("%s: file size %ld larger than map size %ld\n",
+		       __func__, meta->file_size, extent_total);
+		rc = -EINVAL;
+		goto errout;
+	}
+
+	*metap = meta;
+
+	return 0;
+errout:
+	__famfs_meta_free(meta);
+	return rc;
+}
+
+int
+famfs_file_init_dax(
+	struct fuse_mount *fm,
+	struct inode *inode,
+	void *fmap_buf,
+	size_t fmap_size)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct famfs_file_meta *meta = NULL;
+	int rc;
+
+	if (fi->famfs_meta) {
+		pr_notice("%s: i_no=%ld fmap_size=%ld ALREADY INITIALIZED\n",
+			  __func__,
+			  inode->i_ino, fmap_size);
+		return -EEXIST;
+	}
+
+	rc = famfs_meta_alloc_v3(fmap_buf, fmap_size, &meta);
+	if (rc)
+		goto errout;
+
+	/* Publish the famfs metadata on fi->famfs_meta */
+	inode_lock(inode);
+	if (fi->famfs_meta) {
+		rc = -EEXIST; /* file already has famfs metadata */
+	} else {
+		if (famfs_meta_set(fi, meta) != NULL) {
+			pr_err("%s: file already had metadata\n", __func__);
+			rc = -EALREADY;
+			goto errout;
+		}
+		i_size_write(inode, meta->file_size);
+		inode->i_flags |= S_DAX;
+	}
+	inode_unlock(inode);
+
+ errout:
+	if (rc)
+		__famfs_meta_free(meta);
+
+	return rc;
+}
+
diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h
new file mode 100644
index 000000000000..ce785d76719c
--- /dev/null
+++ b/fs/fuse/famfs_kfmap.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * famfs - dax file system for shared fabric-attached memory
+ *
+ * Copyright 2023-2025 Micron Technology, Inc.
+ */
+#ifndef FAMFS_KFMAP_H
+#define FAMFS_KFMAP_H
+
+/*
+ * These structures are the in-memory metadata format for famfs files. Metadata
+ * retrieved via the GET_FMAP response is converted to this format for use in
+ * resolving file mapping faults.
+ */
+
+enum famfs_file_type {
+	FAMFS_REG,
+	FAMFS_SUPERBLOCK,
+	FAMFS_LOG,
+};
+
+/* We anticipate the possibility of supporting additional types of extents */
+enum famfs_extent_type {
+	SIMPLE_DAX_EXTENT,
+	INTERLEAVED_EXTENT,
+	INVALID_EXTENT_TYPE,
+};
+
+struct famfs_meta_simple_ext {
+	u64 dev_index;
+	u64 ext_offset;
+	u64 ext_len;
+};
+
+struct famfs_meta_interleaved_ext {
+	u64 fie_nstrips;
+	u64 fie_chunk_size;
+	u64 fie_nbytes;
+	struct famfs_meta_simple_ext *ie_strips;
+};
+
+/*
+ * Each famfs dax file has this hanging from its fuse_inode->famfs_meta
+ */
+struct famfs_file_meta {
+	bool                   error;
+	enum famfs_file_type   file_type;
+	size_t                 file_size;
+	enum famfs_extent_type fm_extent_type;
+	u64 dev_bitmap; /* bitmap of referenced daxdevs by index */
+	union { /* This will make code a bit more readable */
+		struct {
+			size_t         fm_nextents;
+			struct famfs_meta_simple_ext  *se;
+		};
+		struct {
+			size_t         fm_niext;
+			struct famfs_meta_interleaved_ext *ie;
+		};
+	};
+};
+
+#endif /* FAMFS_KFMAP_H */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 437177c2f092..d8e0ac784224 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1557,11 +1557,18 @@ extern void fuse_sysctl_unregister(void);
 #endif /* CONFIG_SYSCTL */
 
 /* famfs.c */
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+int famfs_file_init_dax(struct fuse_mount *fm,
+			     struct inode *inode, void *fmap_buf,
+			     size_t fmap_size);
+void __famfs_meta_free(void *map);
+#endif
+
 static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
 						       void *meta)
 {
 #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
-	return xchg(&fi->famfs_meta, meta);
+	return cmpxchg(&fi->famfs_meta, NULL, meta);
 #else
 	return NULL;
 #endif
@@ -1569,7 +1576,12 @@ static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
 
 static inline void famfs_meta_free(struct fuse_inode *fi)
 {
-	/* Stub wil be connected in a subsequent commit */
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+	if (fi->famfs_meta != NULL) {
+		__famfs_meta_free(fi->famfs_meta);
+		famfs_meta_set(fi, NULL);
+	}
+#endif
 }
 
 static inline int fuse_file_famfs(struct fuse_inode *fi)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 848c8818e6f7..e86bf330117f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -118,7 +118,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 		fuse_inode_backing_set(fi, NULL);
 
 	if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
-		famfs_meta_set(fi, NULL);
+		fi->famfs_meta = NULL; /* XXX new inodes currently not zeroed; why not? */
 
 	return &fi->inode;
 
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index d85fb692cf3b..0f6ff1ffb23d 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -1286,4 +1286,46 @@ struct fuse_uring_cmd_req {
 	uint8_t padding[6];
 };
 
+/* Famfs fmap message components */
+
+#define FAMFS_FMAP_VERSION 1
+
+#define FUSE_FAMFS_MAX_EXTENTS 2
+#define FUSE_FAMFS_MAX_STRIPS 16
+
+enum fuse_famfs_file_type {
+	FUSE_FAMFS_FILE_REG,
+	FUSE_FAMFS_FILE_SUPERBLOCK,
+	FUSE_FAMFS_FILE_LOG,
+};
+
+enum famfs_ext_type {
+	FUSE_FAMFS_EXT_SIMPLE = 0,
+	FUSE_FAMFS_EXT_INTERLEAVE = 1,
+};
+
+struct fuse_famfs_simple_ext {
+	uint32_t se_devindex;
+	uint32_t reserved;
+	uint64_t se_offset;
+	uint64_t se_len;
+};
+
+struct fuse_famfs_iext { /* Interleaved extent */
+	uint32_t ie_nstrips;
+	uint32_t ie_chunk_size;
+	uint64_t ie_nbytes; /* Total bytes for this interleaved_ext; sum of strips may be more */
+	uint64_t reserved;
+};
+
+struct fuse_famfs_fmap_header {
+	uint8_t file_type; /* enum fuse_famfs_file_type */
+	uint8_t reserved;
+	uint16_t fmap_version;
+	uint32_t ext_type; /* enum famfs_ext_type */
+	uint32_t nextents;
+	uint32_t reserved0;
+	uint64_t file_size;
+	uint64_t reserved1;
+};
 #endif /* _LINUX_FUSE_H */
-- 
2.49.0

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by John Groves 7 months, 3 weeks ago

On 25/04/20 08:33PM, John Groves wrote:
> On completion of GET_FMAP message/response, setup the full famfs
> metadata such that it's possible to handle read/write/mmap directly to
> dax. Note that the devdax_iomap plumbing is not in yet...
> 
> Update MAINTAINERS for the new files.
> 
> Signed-off-by: John Groves <john@groves.net>
> ---
>  MAINTAINERS               |   9 +
>  fs/fuse/Makefile          |   2 +-
>  fs/fuse/dir.c             |   3 +
>  fs/fuse/famfs.c           | 344 ++++++++++++++++++++++++++++++++++++++
>  fs/fuse/famfs_kfmap.h     |  63 +++++++
>  fs/fuse/fuse_i.h          |  16 +-
>  fs/fuse/inode.c           |   2 +-
>  include/uapi/linux/fuse.h |  42 +++++
>  8 files changed, 477 insertions(+), 4 deletions(-)
>  create mode 100644 fs/fuse/famfs.c
>  create mode 100644 fs/fuse/famfs_kfmap.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 00e94bec401e..2a5a7e0e8b28 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8808,6 +8808,15 @@ F:	Documentation/networking/failover.rst
>  F:	include/net/failover.h
>  F:	net/core/failover.c
>  
> +FAMFS
> +M:	John Groves <jgroves@micron.com>
> +M:	John Groves <John@Groves.net>
> +L:	linux-cxl@vger.kernel.org
> +L:	linux-fsdevel@vger.kernel.org
> +S:	Supported
> +F:	fs/fuse/famfs.c
> +F:	fs/fuse/famfs_kfmap.h
> +
>  FANOTIFY
>  M:	Jan Kara <jack@suse.cz>
>  R:	Amir Goldstein <amir73il@gmail.com>
> diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
> index 3f0f312a31c1..65a12975d734 100644
> --- a/fs/fuse/Makefile
> +++ b/fs/fuse/Makefile
> @@ -16,5 +16,5 @@ fuse-$(CONFIG_FUSE_DAX) += dax.o
>  fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
>  fuse-$(CONFIG_SYSCTL) += sysctl.o
>  fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
> -
> +fuse-$(CONFIG_FUSE_FAMFS_DAX) += famfs.o
>  virtiofs-y := virtio_fs.o
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index ae135c55b9f6..b28a1e912d6b 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -405,6 +405,9 @@ fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid)
>  	fmap_size = args.out_args[0].size;
>  	pr_notice("%s: nodei=%lld fmap_size=%ld\n", __func__, nodeid, fmap_size);
>  
> +	/* Convert fmap into in-memory format and hang from inode */
> +	famfs_file_init_dax(fm, inode, fmap_buf, fmap_size);
> +
>  	return 0;
>  }
>  #endif
> diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
> new file mode 100644
> index 000000000000..e62c047d0950
> --- /dev/null
> +++ b/fs/fuse/famfs.c
> @@ -0,0 +1,344 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * famfs - dax file system for shared fabric-attached memory
> + *
> + * Copyright 2023-2025 Micron Technology, Inc.
> + *
> + * This file system, originally based on ramfs the dax support from xfs,
> + * is intended to allow multiple host systems to mount a common file system
> + * view of dax files that map to shared memory.
> + */
> +
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include <linux/dax.h>
> +#include <linux/iomap.h>
> +#include <linux/path.h>
> +#include <linux/namei.h>
> +#include <linux/string.h>
> +
> +#include "famfs_kfmap.h"
> +#include "fuse_i.h"
> +
> +
> +void
> +__famfs_meta_free(void *famfs_meta)
> +{
> +	struct famfs_file_meta *fmap = famfs_meta;
> +
> +	if (!fmap)
> +		return;
> +
> +	if (fmap) {
> +		switch (fmap->fm_extent_type) {
> +		case SIMPLE_DAX_EXTENT:
> +			kfree(fmap->se);
> +			break;
> +		case INTERLEAVED_EXTENT:
> +			if (fmap->ie)
> +				kfree(fmap->ie->ie_strips);
> +
> +			kfree(fmap->ie);
> +			break;
> +		default:
> +			pr_err("%s: invalid fmap type\n", __func__);
> +			break;
> +		}
> +	}
> +	kfree(fmap);
> +}
> +
> +static int
> +famfs_check_ext_alignment(struct famfs_meta_simple_ext *se)
> +{
> +	int errs = 0;
> +
> +	if (se->dev_index != 0)
> +		errs++;
> +
> +	/* TODO: pass in alignment so we can support the other page sizes */
> +	if (!IS_ALIGNED(se->ext_offset, PMD_SIZE))
> +		errs++;
> +
> +	if (!IS_ALIGNED(se->ext_len, PMD_SIZE))
> +		errs++;
> +
> +	return errs;
> +}
> +
> +/**
> + * famfs_meta_alloc() - Allocate famfs file metadata
> + * @metap:       Pointer to an mcache_map_meta pointer
> + * @ext_count:  The number of extents needed
> + */
> +static int
> +famfs_meta_alloc_v3(
> +	void *fmap_buf,
> +	size_t fmap_buf_size,
> +	struct famfs_file_meta **metap)
> +{
> +	struct famfs_file_meta *meta = NULL;
> +	struct fuse_famfs_fmap_header *fmh;
> +	size_t extent_total = 0;
> +	size_t next_offset = 0;
> +	int errs = 0;
> +	int i, j;
> +	int rc;
> +
> +	fmh = (struct fuse_famfs_fmap_header *)fmap_buf;
> +
> +	/* Move past fmh in fmap_buf */
> +	next_offset += sizeof(*fmh);
> +	if (next_offset > fmap_buf_size) {
> +		pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
> +		       __func__, __LINE__, next_offset, fmap_buf_size);
> +		rc = -EINVAL;
> +		goto errout;
> +	}
> +
> +	if (fmh->nextents < 1) {
> +		pr_err("%s: nextents %d < 1\n", __func__, fmh->nextents);
> +		rc = -EINVAL;
> +		goto errout;
> +	}
> +
> +	if (fmh->nextents > FUSE_FAMFS_MAX_EXTENTS) {
> +		pr_err("%s: nextents %d > max (%d) 1\n",
> +		       __func__, fmh->nextents, FUSE_FAMFS_MAX_EXTENTS);
> +		rc = -E2BIG;
> +		goto errout;
> +	}
> +
> +	meta = kzalloc(sizeof(*meta), GFP_KERNEL);
> +	if (!meta)
> +		return -ENOMEM;
> +	meta->error = false;
> +
> +	meta->file_type = fmh->file_type;
> +	meta->file_size = fmh->file_size;
> +	meta->fm_extent_type = fmh->ext_type;
> +
> +	switch (fmh->ext_type) {
> +	case FUSE_FAMFS_EXT_SIMPLE: {
> +		struct fuse_famfs_simple_ext *se_in;
> +
> +		se_in = (struct fuse_famfs_simple_ext *)(fmap_buf + next_offset);
> +
> +		/* Move past simple extents */
> +		next_offset += fmh->nextents * sizeof(*se_in);
> +		if (next_offset > fmap_buf_size) {
> +			pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
> +			       __func__, __LINE__, next_offset, fmap_buf_size);
> +			rc = -EINVAL;
> +			goto errout;
> +		}
> +
> +		meta->fm_nextents = fmh->nextents;
> +
> +		meta->se = kcalloc(meta->fm_nextents, sizeof(*(meta->se)),
> +				   GFP_KERNEL);
> +		if (!meta->se) {
> +			rc = -ENOMEM;
> +			goto errout;
> +		}
> +
> +		if ((meta->fm_nextents > FUSE_FAMFS_MAX_EXTENTS) ||
> +		    (meta->fm_nextents < 1)) {
> +			rc = -EINVAL;
> +			goto errout;
> +		}
> +
> +		for (i = 0; i < fmh->nextents; i++) {
> +			meta->se[i].dev_index  = se_in[i].se_devindex;
> +			meta->se[i].ext_offset = se_in[i].se_offset;
> +			meta->se[i].ext_len    = se_in[i].se_len;
> +
> +			/* Record bitmap of referenced daxdev indices */
> +			meta->dev_bitmap |= (1 << meta->se[i].dev_index);
> +
> +			errs += famfs_check_ext_alignment(&meta->se[i]);
> +
> +			extent_total += meta->se[i].ext_len;
> +		}
> +		break;
> +	}
> +
> +	case FUSE_FAMFS_EXT_INTERLEAVE: {
> +		s64 size_remainder = meta->file_size;
> +		struct fuse_famfs_iext *ie_in;
> +		int niext = fmh->nextents;
> +
> +		meta->fm_niext = niext;
> +
> +		/* Allocate interleaved extent */
> +		meta->ie = kcalloc(niext, sizeof(*(meta->ie)), GFP_KERNEL);
> +		if (!meta->ie) {
> +			rc = -ENOMEM;
> +			goto errout;
> +		}
> +
> +		/*
> +		 * Each interleaved extent has a simple extent list of strips.
> +		 * Outer loop is over separate interleaved extents
> +		 */
> +		for (i = 0; i < niext; i++) {
> +			u64 nstrips;
> +			struct fuse_famfs_simple_ext *sie_in;
> +
> +			/* ie_in = one interleaved extent in fmap_buf */
> +			ie_in = (struct fuse_famfs_iext *)
> +				(fmap_buf + next_offset);
> +
> +			/* Move past one interleaved extent header in fmap_buf */
> +			next_offset += sizeof(*ie_in);
> +			if (next_offset > fmap_buf_size) {
> +				pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
> +				       __func__, __LINE__, next_offset, fmap_buf_size);
> +				rc = -EINVAL;
> +				goto errout;
> +			}
> +
> +			nstrips = ie_in->ie_nstrips;
> +			meta->ie[i].fie_chunk_size = ie_in->ie_chunk_size;
> +			meta->ie[i].fie_nstrips    = ie_in->ie_nstrips;
> +			meta->ie[i].fie_nbytes     = ie_in->ie_nbytes;
> +
> +			if (!meta->ie[i].fie_nbytes) {
> +				pr_err("%s: zero-length interleave!\n",
> +				       __func__);
> +				rc = -EINVAL;
> +				goto errout;
> +			}
> +
> +			/* sie_in = the strip extents in fmap_buf */
> +			sie_in = (struct fuse_famfs_simple_ext *)
> +				(fmap_buf + next_offset);
> +
> +			/* Move past strip extents in fmap_buf */
> +			next_offset += nstrips * sizeof(*sie_in);
> +			if (next_offset > fmap_buf_size) {
> +				pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
> +				       __func__, __LINE__, next_offset, fmap_buf_size);
> +				rc = -EINVAL;
> +				goto errout;
> +			}
> +
> +			if ((nstrips > FUSE_FAMFS_MAX_STRIPS) || (nstrips < 1)) {
> +				pr_err("%s: invalid nstrips=%lld (max=%d)\n",
> +				       __func__, nstrips,
> +				       FUSE_FAMFS_MAX_STRIPS);
> +				errs++;
> +			}
> +
> +			/* Allocate strip extent array */
> +			meta->ie[i].ie_strips = kcalloc(ie_in->ie_nstrips,
> +					sizeof(meta->ie[i].ie_strips[0]),
> +							GFP_KERNEL);
> +			if (!meta->ie[i].ie_strips) {
> +				rc = -ENOMEM;
> +				goto errout;
> +			}
> +
> +			/* Inner loop is over strips */
> +			for (j = 0; j < nstrips; j++) {
> +				struct famfs_meta_simple_ext *strips_out;
> +				u64 devindex = sie_in[j].se_devindex;
> +				u64 offset   = sie_in[j].se_offset;
> +				u64 len      = sie_in[j].se_len;
> +
> +				strips_out = meta->ie[i].ie_strips;
> +				strips_out[j].dev_index  = devindex;
> +				strips_out[j].ext_offset = offset;
> +				strips_out[j].ext_len    = len;
> +
> +				/* Record bitmap of referenced daxdev indices */
> +				meta->dev_bitmap |= (1 << devindex);
> +
> +				extent_total += len;
> +				errs += famfs_check_ext_alignment(&strips_out[j]);
> +				size_remainder -= len;
> +			}
> +		}
> +
> +		if (size_remainder > 0) {
> +			/* Sum of interleaved extent sizes is less than file size! */
> +			pr_err("%s: size_remainder %lld (0x%llx)\n",
> +			       __func__, size_remainder, size_remainder);
> +			rc = -EINVAL;
> +			goto errout;
> +		}
> +		break;
> +	}
> +
> +	default:
> +		pr_err("%s: invalid ext_type %d\n", __func__, fmh->ext_type);
> +		rc = -EINVAL;
> +		goto errout;
> +	}
> +
> +	if (errs > 0) {
> +		pr_err("%s: %d alignment errors found\n", __func__, errs);
> +		rc = -EINVAL;
> +		goto errout;
> +	}
> +
> +	/* More sanity checks */
> +	if (extent_total < meta->file_size) {
> +		pr_err("%s: file size %ld larger than map size %ld\n",
> +		       __func__, meta->file_size, extent_total);
> +		rc = -EINVAL;
> +		goto errout;
> +	}
> +
> +	*metap = meta;
> +
> +	return 0;
> +errout:
> +	__famfs_meta_free(meta);
> +	return rc;
> +}
> +
> +int
> +famfs_file_init_dax(
> +	struct fuse_mount *fm,
> +	struct inode *inode,
> +	void *fmap_buf,
> +	size_t fmap_size)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct famfs_file_meta *meta = NULL;
> +	int rc;
> +
> +	if (fi->famfs_meta) {
> +		pr_notice("%s: i_no=%ld fmap_size=%ld ALREADY INITIALIZED\n",
> +			  __func__,
> +			  inode->i_ino, fmap_size);
> +		return -EEXIST;
> +	}
> +
> +	rc = famfs_meta_alloc_v3(fmap_buf, fmap_size, &meta);
> +	if (rc)
> +		goto errout;
> +
> +	/* Publish the famfs metadata on fi->famfs_meta */
> +	inode_lock(inode);
> +	if (fi->famfs_meta) {
> +		rc = -EEXIST; /* file already has famfs metadata */
> +	} else {
> +		if (famfs_meta_set(fi, meta) != NULL) {
> +			pr_err("%s: file already had metadata\n", __func__);
> +			rc = -EALREADY;
> +			goto errout;
> +		}
> +		i_size_write(inode, meta->file_size);
> +		inode->i_flags |= S_DAX;
> +	}
> +	inode_unlock(inode);
> +
> + errout:
> +	if (rc)
> +		__famfs_meta_free(meta);
> +
> +	return rc;
> +}
> +
> diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h
> new file mode 100644
> index 000000000000..ce785d76719c
> --- /dev/null
> +++ b/fs/fuse/famfs_kfmap.h
> @@ -0,0 +1,63 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * famfs - dax file system for shared fabric-attached memory
> + *
> + * Copyright 2023-2025 Micron Technology, Inc.
> + */
> +#ifndef FAMFS_KFMAP_H
> +#define FAMFS_KFMAP_H
> +
> +/*
> + * These structures are the in-memory metadata format for famfs files. Metadata
> + * retrieved via the GET_FMAP response is converted to this format for use in
> + * resolving file mapping faults.
> + */
> +
> +enum famfs_file_type {
> +	FAMFS_REG,
> +	FAMFS_SUPERBLOCK,
> +	FAMFS_LOG,
> +};
> +
> +/* We anticipate the possiblity of supporting additional types of extents */
> +enum famfs_extent_type {
> +	SIMPLE_DAX_EXTENT,
> +	INTERLEAVED_EXTENT,
> +	INVALID_EXTENT_TYPE,
> +};
> +
> +struct famfs_meta_simple_ext {
> +	u64 dev_index;
> +	u64 ext_offset;
> +	u64 ext_len;
> +};
> +
> +struct famfs_meta_interleaved_ext {
> +	u64 fie_nstrips;
> +	u64 fie_chunk_size;
> +	u64 fie_nbytes;
> +	struct famfs_meta_simple_ext *ie_strips;
> +};
> +
> +/*
> + * Each famfs dax file has this hanging from its fuse_inode->famfs_meta
> + */
> +struct famfs_file_meta {
> +	bool                   error;
> +	enum famfs_file_type   file_type;
> +	size_t                 file_size;
> +	enum famfs_extent_type fm_extent_type;
> +	u64 dev_bitmap; /* bitmap of referenced daxdevs by index */
> +	union { /* This will make code a bit more readable */
> +		struct {
> +			size_t         fm_nextents;
> +			struct famfs_meta_simple_ext  *se;
> +		};
> +		struct {
> +			size_t         fm_niext;
> +			struct famfs_meta_interleaved_ext *ie;
> +		};
> +	};
> +};
> +
> +#endif /* FAMFS_KFMAP_H */
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 437177c2f092..d8e0ac784224 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -1557,11 +1557,18 @@ extern void fuse_sysctl_unregister(void);
>  #endif /* CONFIG_SYSCTL */
>  
>  /* famfs.c */
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> +int famfs_file_init_dax(struct fuse_mount *fm,
> +			     struct inode *inode, void *fmap_buf,
> +			     size_t fmap_size);
> +void __famfs_meta_free(void *map);
> +#endif
> +
>  static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
>  						       void *meta)
>  {
>  #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> -	return xchg(&fi->famfs_meta, meta);
> +	return cmpxchg(&fi->famfs_meta, NULL, meta);
>  #else
>  	return NULL;
>  #endif
> @@ -1569,7 +1576,12 @@ static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
>  
>  static inline void famfs_meta_free(struct fuse_inode *fi)
>  {
> -	/* Stub wil be connected in a subsequent commit */
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> +	if (fi->famfs_meta != NULL) {
> +		__famfs_meta_free(fi->famfs_meta);
> +		famfs_meta_set(fi, NULL);
> +	}
> +#endif
>  }
>  
>  static inline int fuse_file_famfs(struct fuse_inode *fi)
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 848c8818e6f7..e86bf330117f 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -118,7 +118,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>  		fuse_inode_backing_set(fi, NULL);
>  
>  	if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
> -		famfs_meta_set(fi, NULL);
> +		fi->famfs_meta = NULL; /* XXX new inodes currently not zeroed; why not? */
>  
>  	return &fi->inode;
>  
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index d85fb692cf3b..0f6ff1ffb23d 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -1286,4 +1286,46 @@ struct fuse_uring_cmd_req {
>  	uint8_t padding[6];
>  };
>  
> +/* Famfs fmap message components */
> +
> +#define FAMFS_FMAP_VERSION 1
> +
> +#define FUSE_FAMFS_MAX_EXTENTS 2
> +#define FUSE_FAMFS_MAX_STRIPS 16

FYI, after thinking through the conversation with Darrick,  I'm planning 
to drop FUSE_FAMFS_MAX_(EXTENTS|STRIPS) in the next version.  In the 
response to GET_FMAP, it's the structures below serialized into a message 
buffer. If it fits, it's good - and if not it's invalid. When the
in-memory metadata (defined in famfs_kfmap.h) gets assembled, if there is
a reason to apply limits it can be done - but I don't currently see a reason
to do that (so if I'm currently enforcing limits there, I'll probably drop
that).


> +
> +enum fuse_famfs_file_type {
> +	FUSE_FAMFS_FILE_REG,
> +	FUSE_FAMFS_FILE_SUPERBLOCK,
> +	FUSE_FAMFS_FILE_LOG,
> +};
> +
> +enum famfs_ext_type {
> +	FUSE_FAMFS_EXT_SIMPLE = 0,
> +	FUSE_FAMFS_EXT_INTERLEAVE = 1,
> +};
> +
> +struct fuse_famfs_simple_ext {
> +	uint32_t se_devindex;
> +	uint32_t reserved;
> +	uint64_t se_offset;
> +	uint64_t se_len;
> +};
> +
> +struct fuse_famfs_iext { /* Interleaved extent */
> +	uint32_t ie_nstrips;
> +	uint32_t ie_chunk_size;
> +	uint64_t ie_nbytes; /* Total bytes for this interleaved_ext; sum of strips may be more */
> +	uint64_t reserved;
> +};
> +
> +struct fuse_famfs_fmap_header {
> +	uint8_t file_type; /* enum famfs_file_type */
> +	uint8_t reserved;
> +	uint16_t fmap_version;
> +	uint32_t ext_type; /* enum famfs_log_ext_type */
> +	uint32_t nextents;
> +	uint32_t reserved0;
> +	uint64_t file_size;
> +	uint64_t reserved1;
> +};
>  #endif /* _LINUX_FUSE_H */
> -- 
> 2.49.0
>

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by Darrick J. Wong 7 months, 3 weeks ago

On Thu, Apr 24, 2025 at 08:43:33AM -0500, John Groves wrote:
> On 25/04/20 08:33PM, John Groves wrote:
> > On completion of GET_FMAP message/response, setup the full famfs
> > metadata such that it's possible to handle read/write/mmap directly to
> > dax. Note that the devdax_iomap plumbing is not in yet...
> > 
> > Update MAINTAINERS for the new files.
> > 
> > Signed-off-by: John Groves <john@groves.net>
> > ---
> >  MAINTAINERS               |   9 +
> >  fs/fuse/Makefile          |   2 +-
> >  fs/fuse/dir.c             |   3 +
> >  fs/fuse/famfs.c           | 344 ++++++++++++++++++++++++++++++++++++++
> >  fs/fuse/famfs_kfmap.h     |  63 +++++++
> >  fs/fuse/fuse_i.h          |  16 +-
> >  fs/fuse/inode.c           |   2 +-
> >  include/uapi/linux/fuse.h |  42 +++++
> >  8 files changed, 477 insertions(+), 4 deletions(-)
> >  create mode 100644 fs/fuse/famfs.c
> >  create mode 100644 fs/fuse/famfs_kfmap.h
> > 

<snip>

> > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > index d85fb692cf3b..0f6ff1ffb23d 100644
> > --- a/include/uapi/linux/fuse.h
> > +++ b/include/uapi/linux/fuse.h
> > @@ -1286,4 +1286,46 @@ struct fuse_uring_cmd_req {
> >  	uint8_t padding[6];
> >  };
> >  
> > +/* Famfs fmap message components */
> > +
> > +#define FAMFS_FMAP_VERSION 1
> > +
> > +#define FUSE_FAMFS_MAX_EXTENTS 2
> > +#define FUSE_FAMFS_MAX_STRIPS 16
> 
> FYI, after thinking through the conversation with Darrick,  I'm planning 
> to drop FUSE_FAMFS_MAX_(EXTENTS|STRIPS) in the next version.  In the 
> response to GET_FMAP, it's the structures below serialized into a message 
> buffer. If it fits, it's good - and if not it's invalid. When the
> in-memory metadata (defined in famfs_kfmap.h) gets assembled, if there is
> a reason to apply limits it can be done - but I don't currently see a reason
> do to that (so if I'm currently enforcing limits there, I'll probably drop
> that.

You could also define GET_FMAP to have an offset in the request buffer,
and have the famfs daemon send back the next offset at the end of its
reply (or -1ULL to stop).  Then the kernel can call GET_FMAP again with
that new offset to get more mappings.

Though at this point maybe it should go the /other/ way, where the fuse
server can send a "notification" to the kernel to populate its mapping
data?  fuse already defines a handful of notifications for invalidating
pagecache and directory links.

(Ugly wart: notifications aren't yet implemented for the iouring channel)

--D

> 
> > +
> > +enum fuse_famfs_file_type {
> > +	FUSE_FAMFS_FILE_REG,
> > +	FUSE_FAMFS_FILE_SUPERBLOCK,
> > +	FUSE_FAMFS_FILE_LOG,
> > +};
> > +
> > +enum famfs_ext_type {
> > +	FUSE_FAMFS_EXT_SIMPLE = 0,
> > +	FUSE_FAMFS_EXT_INTERLEAVE = 1,
> > +};
> > +
> > +struct fuse_famfs_simple_ext {
> > +	uint32_t se_devindex;
> > +	uint32_t reserved;
> > +	uint64_t se_offset;
> > +	uint64_t se_len;
> > +};
> > +
> > +struct fuse_famfs_iext { /* Interleaved extent */
> > +	uint32_t ie_nstrips;
> > +	uint32_t ie_chunk_size;
> > +	uint64_t ie_nbytes; /* Total bytes for this interleaved_ext; sum of strips may be more */
> > +	uint64_t reserved;
> > +};
> > +
> > +struct fuse_famfs_fmap_header {
> > +	uint8_t file_type; /* enum famfs_file_type */
> > +	uint8_t reserved;
> > +	uint16_t fmap_version;
> > +	uint32_t ext_type; /* enum famfs_log_ext_type */
> > +	uint32_t nextents;
> > +	uint32_t reserved0;
> > +	uint64_t file_size;
> > +	uint64_t reserved1;
> > +};
> >  #endif /* _LINUX_FUSE_H */
> > -- 
> > 2.49.0
> >

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by John Groves 7 months, 2 weeks ago

On 25/04/24 07:38AM, Darrick J. Wong wrote:
> On Thu, Apr 24, 2025 at 08:43:33AM -0500, John Groves wrote:
> > On 25/04/20 08:33PM, John Groves wrote:
> > > On completion of GET_FMAP message/response, setup the full famfs
> > > metadata such that it's possible to handle read/write/mmap directly to
> > > dax. Note that the devdax_iomap plumbing is not in yet...
> > > 
> > > Update MAINTAINERS for the new files.
> > > 
> > > Signed-off-by: John Groves <john@groves.net>
> > > ---
> > >  MAINTAINERS               |   9 +
> > >  fs/fuse/Makefile          |   2 +-
> > >  fs/fuse/dir.c             |   3 +
> > >  fs/fuse/famfs.c           | 344 ++++++++++++++++++++++++++++++++++++++
> > >  fs/fuse/famfs_kfmap.h     |  63 +++++++
> > >  fs/fuse/fuse_i.h          |  16 +-
> > >  fs/fuse/inode.c           |   2 +-
> > >  include/uapi/linux/fuse.h |  42 +++++
> > >  8 files changed, 477 insertions(+), 4 deletions(-)
> > >  create mode 100644 fs/fuse/famfs.c
> > >  create mode 100644 fs/fuse/famfs_kfmap.h
> > > 
> 
> <snip>
> 
> > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > > index d85fb692cf3b..0f6ff1ffb23d 100644
> > > --- a/include/uapi/linux/fuse.h
> > > +++ b/include/uapi/linux/fuse.h
> > > @@ -1286,4 +1286,46 @@ struct fuse_uring_cmd_req {
> > >  	uint8_t padding[6];
> > >  };
> > >  
> > > +/* Famfs fmap message components */
> > > +
> > > +#define FAMFS_FMAP_VERSION 1
> > > +
> > > +#define FUSE_FAMFS_MAX_EXTENTS 2
> > > +#define FUSE_FAMFS_MAX_STRIPS 16
> > 
> > FYI, after thinking through the conversation with Darrick,  I'm planning 
> > to drop FUSE_FAMFS_MAX_(EXTENTS|STRIPS) in the next version.  In the 
> > response to GET_FMAP, it's the structures below serialized into a message 
> > buffer. If it fits, it's good - and if not it's invalid. When the
> > in-memory metadata (defined in famfs_kfmap.h) gets assembled, if there is
> > a reason to apply limits it can be done - but I don't currently see a reason
> > do to that (so if I'm currently enforcing limits there, I'll probably drop
> > that.
> 
> You could also define GET_FMAP to have an offset in the request buffer,
> and have the famfs daemon send back the next offset at the end of its
> reply (or -1ULL to stop).  Then the kernel can call GET_FMAP again with
> that new offset to get more mappings.
> 
> Though at this point maybe it should go the /other/ way, where the fuse
> server can sends a "notification" to the kernel to populate its mapping
> data?  fuse already defines a handful of notifications for invalidating
> pagecache and directory links.
> 
> (Ugly wart: notifications aren't yet implemented for the iouring channel)

I don't have fully-formed thoughts about notifications yet; thinking...

If the fmap stuff may be shared by more than one use case (as has always
seemed possible), it's a good idea to think through a couple of things: 
1) is there anything important missing from this general approach, and 
2) do you need to *partially* cache fmaps? (or is the "offset" idea above 
just to deal with an fmap that might otherwise overflow a response size?)

The current approach lets the kernel retrieve and cache simple and 
interleaved fmaps (and BTW interleaved can be multi-dev or single-dev - 
there are current weird cases where that's useful). Also, FWIW, everything
that can be done with simple ext list fmaps can be done with a collection
of interleaved extents, each with strip count = 1. But I think there is a
worthwhile clarity to having both.

But the current implementation does not contemplate partially cached fmaps.

Adding notification could address revoking them post-haste (is that why
you're thinking about notifications? And if not can you elaborate on what
you're after there?).

> 
> --D

Cheers,
John

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by Darrick J. Wong 7 months, 2 weeks ago

On Sun, Apr 27, 2025 at 08:48:30PM -0500, John Groves wrote:
> On 25/04/24 07:38AM, Darrick J. Wong wrote:
> > On Thu, Apr 24, 2025 at 08:43:33AM -0500, John Groves wrote:
> > > On 25/04/20 08:33PM, John Groves wrote:
> > > > On completion of GET_FMAP message/response, setup the full famfs
> > > > metadata such that it's possible to handle read/write/mmap directly to
> > > > dax. Note that the devdax_iomap plumbing is not in yet...
> > > > 
> > > > Update MAINTAINERS for the new files.
> > > > 
> > > > Signed-off-by: John Groves <john@groves.net>
> > > > ---
> > > >  MAINTAINERS               |   9 +
> > > >  fs/fuse/Makefile          |   2 +-
> > > >  fs/fuse/dir.c             |   3 +
> > > >  fs/fuse/famfs.c           | 344 ++++++++++++++++++++++++++++++++++++++
> > > >  fs/fuse/famfs_kfmap.h     |  63 +++++++
> > > >  fs/fuse/fuse_i.h          |  16 +-
> > > >  fs/fuse/inode.c           |   2 +-
> > > >  include/uapi/linux/fuse.h |  42 +++++
> > > >  8 files changed, 477 insertions(+), 4 deletions(-)
> > > >  create mode 100644 fs/fuse/famfs.c
> > > >  create mode 100644 fs/fuse/famfs_kfmap.h
> > > > 
> > 
> > <snip>
> > 
> > > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> > > > index d85fb692cf3b..0f6ff1ffb23d 100644
> > > > --- a/include/uapi/linux/fuse.h
> > > > +++ b/include/uapi/linux/fuse.h
> > > > @@ -1286,4 +1286,46 @@ struct fuse_uring_cmd_req {
> > > >  	uint8_t padding[6];
> > > >  };
> > > >  
> > > > +/* Famfs fmap message components */
> > > > +
> > > > +#define FAMFS_FMAP_VERSION 1
> > > > +
> > > > +#define FUSE_FAMFS_MAX_EXTENTS 2
> > > > +#define FUSE_FAMFS_MAX_STRIPS 16
> > > 
> > > FYI, after thinking through the conversation with Darrick,  I'm planning 
> > > to drop FUSE_FAMFS_MAX_(EXTENTS|STRIPS) in the next version.  In the 
> > > response to GET_FMAP, it's the structures below serialized into a message 
> > > buffer. If it fits, it's good - and if not it's invalid. When the
> > > in-memory metadata (defined in famfs_kfmap.h) gets assembled, if there is
> > > a reason to apply limits it can be done - but I don't currently see a reason
> > > do to that (so if I'm currently enforcing limits there, I'll probably drop
> > > that.
> > 
> > You could also define GET_FMAP to have an offset in the request buffer,
> > and have the famfs daemon send back the next offset at the end of its
> > reply (or -1ULL to stop).  Then the kernel can call GET_FMAP again with
> > that new offset to get more mappings.
> > 
> > Though at this point maybe it should go the /other/ way, where the fuse
> > server can sends a "notification" to the kernel to populate its mapping
> > data?  fuse already defines a handful of notifications for invalidating
> > pagecache and directory links.
> > 
> > (Ugly wart: notifications aren't yet implemented for the iouring channel)
> 
> I don't have fully-formed thoughts about notifications yet; thinking...

Me neither.  The existing ones seem like they /could/ be useful for 

> If the fmap stuff may be shared by more than one use case (as has always
> seemed possible), it's a good idea to think through a couple of things: 
> 1) is there anything important missing from this general approach, and 

Well for general iomap caching, I think we'd need to pull in a lot more
of the iomap fields:

struct fuse_iomap {
	u64		addr;	/* disk offset of mapping, bytes */
	loff_t		offset;	/* file offset of mapping, bytes */
	u64		length;	/* length of mapping, bytes */
	u16		type;	/* type of mapping */
	u16		flags;	/* flags for mapping */
	u32		devindex;
	u64		validity_cookie; /* used with .iomap_valid() */
};

fuse would use devindex to find the block_device/dax_device, but
otherwise the fields are exactly the same as struct iomap.  Given that
this is exposed to userspace we'd probably want to add some padding.

The validity cookie I'm not 100% sure about -- buffered IO uses it to
detect stale iomappings after we've locked a folio for write, having
dropped whatever locks protect the iomappings.  The ->iomap_valid
function compares the iomap::validity_cookie against some internal magic
value (this would have to be the iomap cache) to decide if revalidation
is needed.

One way to make this work is to implement the cookie entirely within the
fuse-iomap cache itself -- every time a new mapping comes in (or a range
gets invalidated) the cache bumps its cookie.  The fuse server doesn't
have to implement the cookie itself, but it will have to push a new
mapping or invalidate something every time the mappings change.

Another way would be to have the fuse server implement the cookie
itself, but now we have to find a way to have the kernel and userspace
share a piece of memory where the cookie lives.  I don't like this
option, but it does give the fuse server direct control over when the
cookie value changes.

> 2) do you need to *partially* cache fmaps? (or is the "offset" idea above 
> just to deal with an fmap that might otherwise overflow a response size?)

It's mostly to cap the amount of mapping data being copied into the
kernel in a specific GET_FMAP call.  For famfs I don't think you have
that many mappings, but for (say) an XFS filesystem there could be
billions of them.

Though at that point it might make more sense to populate the cache
piecemeal as file IO actually happens.

I wouldn't split an existing mapping, FWIW.  Think "I have 1,000,000
mappings and I'm only going to upload them 1,000 at a time", not "I'm
going to upload mappings for 100MB worth of file range at a time".

> The current approach lets the kernel retrieve and cache simple and 
> interleaved fmaps (and BTW interleaved can be multi-dev or single-dev - 
> there are current weird cases where that's useful). Also too, FWIW everything
> that can be done with simple ext list fmaps can be done with a collection
> of interleaved extents, each with strip count = 1. But I think there is a
> worthwhile clarity to having both.

<nod> I don't know what Miklos' opinion is about having multiple
fusecmds that do similar things -- on the one hand keeping yours and my
efforts separate explodes the amount of userspace abi that everyone must
maintain, but on the other hand it then doesn't couple our projects
together, which might be a good thing if it turns out that our domain
models are /really/ actually quite different.

(Especially because I suspect that interleaving is the norm for memory,
whereas we try to avoid that for disk filesystems.)

> But the current implementation does not contemplate partially cached fmaps.
> 
> Adding notification could address revoking them post-haste (is that why
> you're thinking about notifications? And if not can you elaborate on what
> you're after there?).

Yeah, invalidating the mapping cache at random places.  If, say, you
implement a clustered filesystem with iomap, the metadata server could
inform the fuse server on the local node that a certain range of inode X
has been written to, at which point you need to revoke any local leases,
invalidate the pagecache, and invalidate the iomapping cache to force
the client to requery the server.

Or if your fuse server wants to implement its own weird operations (e.g.
XFS EXCHANGE-RANGE) this would make that possible without needing to
add a bunch of code to fs/fuse/ for the benefit of a single fuse driver.

--D

> 
> > 
> > --D
> 
> Cheers,
> John
> 
>

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by Miklos Szeredi 7 months, 1 week ago

On Mon, 28 Apr 2025 at 21:00, Darrick J. Wong <djwong@kernel.org> wrote:

> <nod> I don't know what Miklos' opinion is about having multiple
> fusecmds that do similar things -- on the one hand keeping yours and my
> efforts separate explodes the amount of userspace abi that everyone must
> maintain, but on the other hand it then doesn't couple our projects
> together, which might be a good thing if it turns out that our domain
> models are /really/ actually quite different.

Sharing the interface at least would definitely be worthwhile, as
there does not seem to be a great deal of difference between the
generic one and the famfs specific one.  Only implementing part of the
functionality that the generic one provides would be fine.

> (Especially because I suspect that interleaving is the norm for memory,
> whereas we try to avoid that for disk filesystems.)

So interleaved extents are just like normal ones except they repeat,
right?  What about adding a special "repeat last N extent
descriptions" type of extent?

> > But the current implementation does not contemplate partially cached fmaps.
> >
> > Adding notification could address revoking them post-haste (is that why
> > you're thinking about notifications? And if not can you elaborate on what
> > you're after there?).
>
> Yeah, invalidating the mapping cache at random places.  If, say, you
> implement a clustered filesystem with iomap, the metadata server could
> inform the fuse server on the local node that a certain range of inode X
> has been written to, at which point you need to revoke any local leases,
> invalidate the pagecache, and invalidate the iomapping cache to force
> the client to requery the server.
>
> Or if your fuse server wants to implement its own weird operations (e.g.
> XFS EXCHANGE-RANGE) this would make that possible without needing to
> add a bunch of code to fs/fuse/ for the benefit of a single fuse driver.

Wouldn't existing invalidation framework be sufficient?

Thanks,
Miklos

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by John Groves 7 months ago

On 25/05/06 06:56PM, Miklos Szeredi wrote:
> On Mon, 28 Apr 2025 at 21:00, Darrick J. Wong <djwong@kernel.org> wrote:
> 
> > <nod> I don't know what Miklos' opinion is about having multiple
> > fusecmds that do similar things -- on the one hand keeping yours and my
> > efforts separate explodes the amount of userspace abi that everyone must
> > maintain, but on the other hand it then doesn't couple our projects
> > together, which might be a good thing if it turns out that our domain
> > models are /really/ actually quite different.
> 
> Sharing the interface at least would definitely be worthwhile, as
> there does not seem to be a great deal of difference between the
> generic one and the famfs specific one.  Only implementing part of the
> functionality that the generic one provides would be fine.

Agreed. I'm coming around to thinking the most practical approach would be
to share the GET_FMAP message/response, but to add a separate response
format for Darrick's use case - when the time comes. In this patch set, 
that starts with 'struct fuse_famfs_fmap_header' and is followed by the 
appropriate extent structures, serialized in the message. Collectively
that's an fmap in message format.

Side note: the current patch set sends back the logically-variable-sized 
fmap in a fixed-size message, but V2 of the series will address that; 
I got some help from Bernd there, but haven't finished it yet.

So the next version of the patch set would, say, add a more generic first
'struct fmap_header' that would indicate whether the next item would be
'struct fuse_famfs_fmap_header' (i.e. my/famfs metadata) or some other
to be codified metadata format. I'm going here because I'm dubious that
we even *can* do grand-unified-fmap-metadata (or that we should try).

This will require versioning the affected structures, unless we think
the fmap-in-message structure can be opaque to the rest of fuse. @miklos,
is there an example to follow regarding struct versioning in 
already-existing fuse structures?

> 
> > (Especially because I suspect that interleaving is the norm for memory,
> > whereas we try to avoid that for disk filesystems.)
> 
> So interleaved extents are just like normal ones except they repeat,
> right?  What about adding a special "repeat last N extent
> descriptions" type of extent?

It's a bit more than that. The comment at [1] makes it possible to understand
the scheme, but I'd be happy to talk through it with you on a call if that
seems helpful.

An interleaved extent stripes data spread across N memory devices in raid 0
format; the space from each device is described by a single simple extent 
(so it's contiguous), but it's not consumed contiguously - it's consumed in
fixed-sized chunks that precess across the devices. Notwithstanding that I 
couldn't explain it very well when we talked about it at LPC, I think I 
could make it pretty clear in a pretty brief call now.

In any case, you have my word that it's actually quite elegant :D
(seriously, but also with a smile...)

> 
> > > But the current implementation does not contemplate partially cached fmaps.
> > >
> > > Adding notification could address revoking them post-haste (is that why
> > > you're thinking about notifications? And if not can you elaborate on what
> > > you're after there?).
> >
> > Yeah, invalidating the mapping cache at random places.  If, say, you
> > implement a clustered filesystem with iomap, the metadata server could
> > inform the fuse server on the local node that a certain range of inode X
> > has been written to, at which point you need to revoke any local leases,
> > invalidate the pagecache, and invalidate the iomapping cache to force
> > the client to requery the server.
> >
> > Or if your fuse server wants to implement its own weird operations (e.g.
> > XFS EXCHANGE-RANGE) this would make that possible without needing to
> > add a bunch of code to fs/fuse/ for the benefit of a single fuse driver.
> 
> Wouldn't existing invalidation framework be sufficient?
> 
> Thanks,
> Miklos

My current thinking is that Darrick's use case doesn't need GET_DAXDEV, but
famfs does. I think Darrick's use case has one backing device, and that should
be passed in at mount time. Correct me if you think that might be wrong.

Famfs doesn't necessarily have just one backing dev, which means that famfs
could pass in the *primary* backing dev at mount time, but it would still
need GET_DAXDEV to get the rest. But if I just use GET_FMAP every time, I
only need one way to do this.

I'll add a few more responses to Darrick's reply...

Thanks,
John

[1] https://github.com/cxl-micron-reskit/famfs-linux/blob/c57553c4ca91f0634f137285840ab25be8a87c30/fs/fuse/famfs_kfmap.h#L13

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by Darrick J. Wong 7 months ago

On Mon, May 12, 2025 at 02:51:45PM -0500, John Groves wrote:
> On 25/05/06 06:56PM, Miklos Szeredi wrote:
> > On Mon, 28 Apr 2025 at 21:00, Darrick J. Wong <djwong@kernel.org> wrote:
> > 
> > > <nod> I don't know what Miklos' opinion is about having multiple
> > > fusecmds that do similar things -- on the one hand keeping yours and my
> > > efforts separate explodes the amount of userspace abi that everyone must
> > > maintain, but on the other hand it then doesn't couple our projects
> > > together, which might be a good thing if it turns out that our domain
> > > models are /really/ actually quite different.
> > 
> > Sharing the interface at least would definitely be worthwhile, as
> > there does not seem to be a great deal of difference between the
> > generic one and the famfs specific one.  Only implementing part of the
> > functionality that the generic one provides would be fine.
> 
> Agreed. I'm coming around to thinking the most practical approach would be
> to share the GET_FMAP message/response, but to add a separate response
> format for Darrick's use case - when the time comes. In this patch set, 
> that starts with 'struct fuse_famfs_fmap_header' and is followed by the 
> approriate extent structures, serialized in the message. Collectively 
> that's an fmap in message format.

Well in that case I might as well just plumb in the pieces I need as
separate fuse commands.  fuse_args::opcode is u32, there's plenty of
space left.

> Side note: the current patch set sends back the logically-variable-sized 
> fmap in a fixed-size message, but V2 of the series will address that; 
> I got some help from Bernd there, but haven't finished it yet.
> 
> So the next version of the patch set would, say, add a more generic first
> 'struct fmap_header' that would indicate whether the next item would be
> 'struct fuse_famfs_fmap_header' (i.e. my/famfs metadata) or some other
> to be codified metadata format. I'm going here because I'm dubious that
> we even *can* do grand-unified-fmap-metadata (or that we should try).
> 
> This will require versioning the affected structures, unless we think
> the fmap-in-message structure can be opaque to the rest of fuse. @miklos,
> is there an example to follow regarding struct versioning in 
> already-existing fuse structures?

/me is a n00b, but isn't that a simple matter of making sure that new
revisions change the structure size, and then you can key off of that?

> > > (Especially because I suspect that interleaving is the norm for memory,
> > > whereas we try to avoid that for disk filesystems.)
> > 
> > So interleaved extents are just like normal ones except they repeat,
> > right?  What about adding a special "repeat last N extent
> > descriptions" type of extent?
> 
> It's a bit more than that. The comment at [1] makes it possible to understand
> the scheme, but I'd be happy to talk through it with you on a call if that
> seems helpful.
> 
> An interleaved extent stripes data spread across N memory devices in raid 0
> format; the space from each device is described by a single simple extent 
> (so it's contigous), but it's not consumed contiguously - it's consumed in 
> fixed-sized chunks that precess across the devices. Notwithstanding that I 
> couldn't explain it very well when we talked about it at LPC, I think I 
> could make it pretty clear in a pretty brief call now.
> 
> In any case, you have my word that it's actually quite elegant :D
> (seriously, but also with a smile...)

Admittedly the more I think about the interleaving in famfs vs straight
block mappings for disk filesystems, the more I think they ought to be
separate interfaces for code that solves different problems.  Then both
our codebases will remain relatively cohesive.

> > > > But the current implementation does not contemplate partially cached fmaps.
> > > >
> > > > Adding notification could address revoking them post-haste (is that why
> > > > you're thinking about notifications? And if not can you elaborate on what
> > > > you're after there?).
> > >
> > > Yeah, invalidating the mapping cache at random places.  If, say, you
> > > implement a clustered filesystem with iomap, the metadata server could
> > > inform the fuse server on the local node that a certain range of inode X
> > > has been written to, at which point you need to revoke any local leases,
> > > invalidate the pagecache, and invalidate the iomapping cache to force
> > > the client to requery the server.
> > >
> > > Or if your fuse server wants to implement its own weird operations (e.g.
> > > XFS EXCHANGE-RANGE) this would make that possible without needing to
> > > add a bunch of code to fs/fuse/ for the benefit of a single fuse driver.
> > 
> > Wouldn't existing invalidation framework be sufficient?
> > 
> > Thanks,
> > Miklos
> 
> My current thinking is that Darrick's use case doesn't need GET_DAXDEV, but
> famfs does. I think Darrick's use case has one backing device, and that should
> be passed in at mount time. Correct me if you think that might be wrong.

Technically speaking iomap can operate on /any/ block or dax device as
long as you have a reference to them.  Once I get more of the plumbing
sorted out I'll start thinking about how to handle multi-device
filesystems like XFS which can put file data on more than 1 block
device.

I was thinking that the fuse server could just send a REGISTER_DEVICE
notification to the fuse driver (I know, again with the notifications
:)), the kernel replies with a magic cookie, and that's what gets passed
in the {read,write,map}_dev field.

Right now I reconfigured fuse2fs to present itself as a "fuseblk" driver
so that at least we know that inode->i_sb->s_bdev is a valid pointer.
It turns out to be useful because the kernel sends FUSE_DESTROY commands
synchronously during unmount, which avoids the situation where umount
exits but the block device still can't be opened O_EXCL because the fuse
server program is still exiting.  It may be useful for some day wiring
up some of the block device ops to fuse servers.  Though I think it
might conflict with CONFIG_BLK_DEV_WRITE_MOUNTED=y

I just barely got directio writes and pagecache read/write working
through iomap today, though I'm still getting used to the fuse inode
locking model and sorting through the bugs. :)

(I wonder how nasty it would be to pass fds to the fuse kernel driver
from fuseblk servers?)

> Famfs doesn't necessarily have just one backing dev, which means that famfs
> could pass in the *primary* backing dev at mount time, but it would still
> need GET_DAXDEV to get the rest. But if I just use GET_FMAP every time, I
> only need one way to do this.
> 
> I'll add a few more responses to Darrick's reply...

Hehhe onto that message go I.

--D

> 
> Thanks,
> John
> 
> [1] https://github.com/cxl-micron-reskit/famfs-linux/blob/c57553c4ca91f0634f137285840ab25be8a87c30/fs/fuse/famfs_kfmap.h#L13
> 
>

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by Darrick J. Wong 7 months, 1 week ago

On Tue, May 06, 2025 at 06:56:29PM +0200, Miklos Szeredi wrote:
> On Mon, 28 Apr 2025 at 21:00, Darrick J. Wong <djwong@kernel.org> wrote:
> 
> > <nod> I don't know what Miklos' opinion is about having multiple
> > fusecmds that do similar things -- on the one hand keeping yours and my
> > efforts separate explodes the amount of userspace abi that everyone must
> > maintain, but on the other hand it then doesn't couple our projects
> > together, which might be a good thing if it turns out that our domain
> > models are /really/ actually quite different.
> 
> Sharing the interface at least would definitely be worthwhile, as
> there does not seem to be a great deal of difference between the
> generic one and the famfs specific one.  Only implementing part of the
> functionality that the generic one provides would be fine.

Well right now my barely functional prototype exposes this interface
for communicating mappings to the kernel.  I've only gotten as far as
exposing the ->iomap_{begin,end} and ->iomap_ioend calls to the fuse
server with no caching, because the only functions I've implemented so
far are FIEMAP, SEEK_{DATA,HOLE}, and directio.

So basically the kernel sends a FUSE_IOMAP_BEGIN command with the
desired (pos, count) file range to the fuse server, which responds with
a struct fuse_iomap_begin_out object that is translated into a struct
iomap.

The fuse server then responds with a read mapping and a write mapping,
which tell the kernel from where to read data, and where to write data.
As a shortcut, the write mapping can be of type
FUSE_IOMAP_TYPE_PURE_OVERWRITE to avoid having to fill out fields twice.

iomap_end is only called if there were errors while processing the
mapping, or if the fuse server sets FUSE_IOMAP_F_WANT_IOMAP_END.

iomap_ioend is called after read or write IOs complete, so that the
filesystem can update mapping metadata (e.g. unwritten extent
conversion, remapping after an out of place write, ondisk isize update).

Some of the flags here might not be needed or workable; I was merely
cutting and pasting the #defines from iomap.h.

#define FUSE_IOMAP_TYPE_PURE_OVERWRITE	(0xFFFF) /* use read mapping data */
#define FUSE_IOMAP_TYPE_HOLE		0	/* no blocks allocated, need allocation */
#define FUSE_IOMAP_TYPE_DELALLOC	1	/* delayed allocation blocks */
#define FUSE_IOMAP_TYPE_MAPPED		2	/* blocks allocated at @addr */
#define FUSE_IOMAP_TYPE_UNWRITTEN	3	/* blocks allocated at @addr in unwritten state */
#define FUSE_IOMAP_TYPE_INLINE		4	/* data inline in the inode */

#define FUSE_IOMAP_DEV_SBDEV		(0)	/* use superblock bdev */

#define FUSE_IOMAP_F_NEW		(1U << 0)
#define FUSE_IOMAP_F_DIRTY		(1U << 1)
#define FUSE_IOMAP_F_SHARED		(1U << 2)
#define FUSE_IOMAP_F_MERGED		(1U << 3)
#define FUSE_IOMAP_F_XATTR		(1U << 5)
#define FUSE_IOMAP_F_BOUNDARY		(1U << 6)
#define FUSE_IOMAP_F_ANON_WRITE		(1U << 7)

#define FUSE_IOMAP_F_WANT_IOMAP_END	(1U << 15) /* want ->iomap_end call */

#define FUSE_IOMAP_OP_WRITE		(1 << 0) /* writing, must allocate blocks */
#define FUSE_IOMAP_OP_ZERO		(1 << 1) /* zeroing operation, may skip holes */
#define FUSE_IOMAP_OP_REPORT		(1 << 2) /* report extent status, e.g. FIEMAP */
#define FUSE_IOMAP_OP_FAULT		(1 << 3) /* mapping for page fault */
#define FUSE_IOMAP_OP_DIRECT		(1 << 4) /* direct I/O */
#define FUSE_IOMAP_OP_NOWAIT		(1 << 5) /* do not block */
#define FUSE_IOMAP_OP_OVERWRITE_ONLY	(1 << 6) /* only pure overwrites allowed */
#define FUSE_IOMAP_OP_UNSHARE		(1 << 7) /* unshare_file_range */
#define FUSE_IOMAP_OP_ATOMIC		(1 << 9) /* torn-write protection */
#define FUSE_IOMAP_OP_DONTCACHE		(1 << 10) /* dont retain pagecache */

#define FUSE_IOMAP_NULL_ADDR		-1ULL	/* addr is not valid */

struct fuse_iomap_begin_in {
	uint32_t opflags;	/* FUSE_IOMAP_OP_* */
	uint32_t reserved;
	uint64_t ino;		/* matches st_ino provided by getattr/open */
	uint64_t pos;		/* file position, in bytes */
	uint64_t count;		/* operation length, in bytes */
};

struct fuse_iomap_begin_out {
	uint64_t offset;	/* file offset of mapping, bytes */
	uint64_t length;	/* length of both mappings, bytes */

	uint64_t read_addr;	/* disk offset of mapping, bytes */
	uint16_t read_type;	/* FUSE_IOMAP_TYPE_* */
	uint16_t read_flags;	/* FUSE_IOMAP_F_* */
	uint32_t read_dev;	/* FUSE_IOMAP_DEV_* */

	uint64_t write_addr;	/* disk offset of mapping, bytes */
	uint16_t write_type;	/* FUSE_IOMAP_TYPE_* */
	uint16_t write_flags;	/* FUSE_IOMAP_F_* */
	uint32_t write_dev;	/* FUSE_IOMAP_DEV_* */
};

struct fuse_iomap_end_in {
	uint32_t opflags;	/* FUSE_IOMAP_OP_* */
	uint32_t reserved;
	uint64_t ino;		/* matches st_ino provided iomap_begin */
	uint64_t pos;		/* file position, in bytes */
	uint64_t count;		/* operation length, in bytes */
	int64_t written;	/* bytes processed */

	uint64_t map_length;	/* length of mapping, bytes */
	uint64_t map_addr;	/* disk offset of mapping, bytes */
	uint16_t map_type;	/* FUSE_IOMAP_TYPE_* */
	uint16_t map_flags;	/* FUSE_IOMAP_F_* */
	uint32_t map_dev;	/* FUSE_IOMAP_DEV_* */
};

/* out of place write extent */
#define FUSE_IOMAP_IOEND_SHARED		(1U << 0)
/* unwritten extent */
#define FUSE_IOMAP_IOEND_UNWRITTEN	(1U << 1)
/* don't merge into previous ioend */
#define FUSE_IOMAP_IOEND_BOUNDARY	(1U << 2)
/* is direct I/O */
#define FUSE_IOMAP_IOEND_DIRECT		(1U << 3)

/* is append ioend */
#define FUSE_IOMAP_IOEND_APPEND		(1U << 15)

struct fuse_iomap_ioend_in {
	uint16_t ioendflags;	/* FUSE_IOMAP_IOEND_* */
	uint16_t reserved;
	int32_t error;		/* negative errno or 0 */
	uint64_t ino;		/* matches st_ino provided iomap_begin */
	uint64_t pos;		/* file position, in bytes */
	uint64_t addr;		/* disk offset of new mapping, in bytes */
	uint32_t written;	/* bytes processed */
	uint32_t reserved1;
};

> > (Especially because I suspect that interleaving is the norm for memory,
> > whereas we try to avoid that for disk filesystems.)
> 
> So interleaved extents are just like normal ones except they repeat,
> right?  What about adding a special "repeat last N extent
> descriptions" type of extent?

Yeah, I suppose a mapping cache could do that.  From talking to John
last week, it sounds like the mappings are supposed to be static for the
life of the file, as opposed to ext* where truncates and fallocate can
appear at any time.

One thing I forgot to ask John -- can there be multiple sets of
interleaved mappings per file?  e.g. the first 32g of a file are split
between 4 memory controllers, whereas the next 64g are split between 4
different domains?

> > > But the current implementation does not contemplate partially cached fmaps.
> > >
> > > Adding notification could address revoking them post-haste (is that why
> > > you're thinking about notifications? And if not can you elaborate on what
> > > you're after there?).
> >
> > Yeah, invalidating the mapping cache at random places.  If, say, you
> > implement a clustered filesystem with iomap, the metadata server could
> > inform the fuse server on the local node that a certain range of inode X
> > has been written to, at which point you need to revoke any local leases,
> > invalidate the pagecache, and invalidate the iomapping cache to force
> > the client to requery the server.
> >
> > Or if your fuse server wants to implement its own weird operations (e.g.
> > XFS EXCHANGE-RANGE) this would make that possible without needing to
> > add a bunch of code to fs/fuse/ for the benefit of a single fuse driver.
> 
> Wouldn't existing invalidation framework be sufficient?

I'm a little confused, are you talking about FUSE_NOTIFY_INVAL_INODE?
If so, then I think that's the wrong layer -- INVAL_INODE invalidates
the page cache, whereas I'm talking about caching the file space
mappings that iomap uses to construct bios for disk IO, and possibly
wanting to invalidate parts of that cache to force the kernel to upcall
the fuse server for a new mapping.

(Obviously this only applies to fuse servers for ondisk filesystems.)

--D

> Thanks,
> Miklos
>

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by Miklos Szeredi 7 months ago

On Thu, 8 May 2025 at 17:56, Darrick J. Wong <djwong@kernel.org> wrote:

> Well right now my barely functional prototype exposes this interface
> for communicating mappings to the kernel.  I've only gotten as far as
> exposing the ->iomap_{begin,end} and ->iomap_ioend calls to the fuse
> server with no caching, because the only functions I've implemented so
> far are FIEMAP, SEEK_{DATA,HOLE}, and directio.
>
> So basically the kernel sends a FUSE_IOMAP_BEGIN command with the
> desired (pos, count) file range to the fuse server, which responds with
> a struct fuse_iomap_begin_out object that is translated into a struct
> iomap.
>
> The fuse server then responds with a read mapping and a write mapping,
> which tell the kernel from where to read data, and where to write data.

So far so good.

The iomap layer is non-caching, right?   This means that e.g. a
direct_io request spanning two extents will result in two separate
requests, since one FUSE_IOMAP_BEGIN can only return one extent.

And the next direct_io request may need to repeat the query for the
same extent as the previous one if the I/O boundary wasn't on the
extent boundary (which is likely).

So some sort of caching would make sense, but seeing the multitude of
FUSE_IOMAP_OP_ types I'm not clearly seeing how that would look.

> I'm a little confused, are you talking about FUSE_NOTIFY_INVAL_INODE?
> If so, then I think that's the wrong layer -- INVAL_INODE invalidates
> the page cache, whereas I'm talking about caching the file space
> mappings that iomap uses to construct bios for disk IO, and possibly
> wanting to invalidate parts of that cache to force the kernel to upcall
> the fuse server for a new mapping.

Maybe I'm confused, as the layering is not very clear in my head yet.

But in your example you did say that the data as well as the
mapping needs to be invalidated, so I thought that the simplest thing
to do is to just invalidate the cached mapping from
FUSE_NOTIFY_INVAL_INODE as well.

Thanks,
Miklos

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by Darrick J. Wong 7 months ago

On Tue, May 13, 2025 at 11:14:55AM +0200, Miklos Szeredi wrote:
> On Thu, 8 May 2025 at 17:56, Darrick J. Wong <djwong@kernel.org> wrote:
> 
> > Well right now my barely functional prototype exposes this interface
> > for communicating mappings to the kernel.  I've only gotten as far as
> > exposing the ->iomap_{begin,end} and ->iomap_ioend calls to the fuse
> > server with no caching, because the only functions I've implemented so
> > far are FIEMAP, SEEK_{DATA,HOLE}, and directio.
> >
> > So basically the kernel sends a FUSE_IOMAP_BEGIN command with the
> > desired (pos, count) file range to the fuse server, which responds with
> > a struct fuse_iomap_begin_out object that is translated into a struct
> > iomap.
> >
> > The fuse server then responds with a read mapping and a write mapping,
> > which tell the kernel from where to read data, and where to write data.
> 
> So far so good.
> 
> The iomap layer is non-caching, right?   This means that e.g. a
> direct_io request spanning two extents will result in two separate
> requests, since one FUSE_IOMAP_BEGIN can only return one extent.

Originally it wasn't supposed to be cached at all.  Then history taught
us a lesson. :P

In hindsight, there needs to be coordination of the space mapping
manipulations that go on between pagecache writes and reclaim writeback.
Pagecache write can get an unwritten iomap, then go to sleep while it
tries to get a folio.  In the meantime, writeback can find the folio for
that range, write it back to the disk (which converts unwritten to
written) and reclaim the folio.  Now the first process wakes up and
grabs a new folio.  Because its unwritten mapping is now stale, it must
not start zeroing that folio; it needs to go get a new mapping.

So iomap still doesn't need caching per se, but it needs writer threads
to revalidate the mapping after locking a folio.  The reason for caching
iomaps under the fuse_inode somewhere is that I don't want the
revalidations to have to jump all the way out to userspace with a folio
lock held.

That said, on a VM on this 12 year old workstation, I can get about
2.0GB/s direct writes in fuse2fs and 2.2GB/s in kernel ext4, and that's
with initiating iomap_begin/end/ioends with no caching of the mappings.
Pagecache writes run at about 1.9GB/s through fuse2fs and 1.5GB/s
through the kernel, but only if I tweak fuse to use large folios and a
relatively unconstrained bdi.  2GB/s might be enough IO for anyone. ;)

> And the next direct_io request may need to repeat the query for the
> same extent as the previous one if the I/O boundary wasn't on the
> extent boundary (which is likely).
> 
> So some sort of caching would make sense, but seeing the multitude of
> FUSE_IOMAP_OP_ types I'm not clearly seeing how that would look.

Yeah, it's confusing.  The design doc tries to clarify this, but this is
roughly what we need for fuse:

FUSE_IOMAP_OP_WRITE being set means we're writing to the file.
FUSE_IOMAP_OP_ZERO being set means we're zeroing the file.
Neither of those being set means we're reading the file.

(3 different operations)

FUSE_IOMAP_OP_DIRECT being set means directio, and it not being set
means pagecache.

(and one flag, for 6 different types of IO)

FUSE_IOMAP_OP_REPORT is set all by itself for things like FIEMAP and
SEEK_DATA/HOLE.

> > I'm a little confused, are you talking about FUSE_NOTIFY_INVAL_INODE?
> > If so, then I think that's the wrong layer -- INVAL_INODE invalidates
> > the page cache, whereas I'm talking about caching the file space
> > mappings that iomap uses to construct bios for disk IO, and possibly
> > wanting to invalidate parts of that cache to force the kernel to upcall
> > the fuse server for a new mapping.
> 
> Maybe I'm confused, as the layering is not very clear in my head yet.
> 
> But in your example you did say that the data as well as the
> mapping needs to be invalidated, so I thought that the simplest thing
> to do is to just invalidate the cached mapping from
> FUSE_NOTIFY_INVAL_INODE as well.

For now I want to keep the two invalidation types separate while I build
out more of the prototype so that I can be more sure that I haven't
broken any existing code. :)

The mapping invalidation might be more useful for things like FICLONE on
weird filesystems where the file allocation unit size is larger than the
block size and we actually need to invalidate more mappings than the vfs
knows about.

But I'm only 80% sure of that, as I'm still figuring out how to create a
notification and send it from fuse2fs and haven't gotten to the caching
layer yet.

--D

> Thanks,
> Miklos
>

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by Miklos Szeredi 7 months ago

On Thu, 15 May 2025 at 04:06, Darrick J. Wong <djwong@kernel.org> wrote:

> Yeah, it's confusing.  The design doc tries to clarify this, but this is
> roughly what we need for fuse:
>
> FUSE_IOMAP_OP_WRITE being set means we're writing to the file.
> FUSE_IOMAP_OP_ZERO being set means we're zeroing the file.
> Neither of those being set means we're reading the file.
>
> (3 different operations)

Okay, I get why these need to be distinct cases.

Am I right that only the read case is sanely cacheable?

> FUSE_IOMAP_OP_DIRECT being set means directio, and it not being set
> means pagecache.
>
> (and one flag, for 6 different types of IO)

Why does this make a difference?

Okay, maybe I can imagine different allocation strategies.  Which
means that it only matters for the write case?

> FUSE_IOMAP_OP_REPORT is set all by itself for things like FIEMAP and
> SEEK_DATA/HOLE.

Which should again always be the same as the read case, no?

Thanks,
Miklos

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by Darrick J. Wong 7 months ago

On Fri, May 16, 2025 at 12:06:44PM +0200, Miklos Szeredi wrote:
> On Thu, 15 May 2025 at 04:06, Darrick J. Wong <djwong@kernel.org> wrote:
> 
> > Yeah, it's confusing.  The design doc tries to clarify this, but this is
> > roughly what we need for fuse:
> >
> > FUSE_IOMAP_OP_WRITE being set means we're writing to the file.
> > FUSE_IOMAP_OP_ZERO being set means we're zeroing the file.
> > Neither of those being set means we're reading the file.
> >
> > (3 different operations)
> 
> Okay, I get why these need to be distinct cases.
> 
> > Am I right that only the read case is sanely cacheable?

That depends on the filesystem.  Old filesystems (e.g. the ones that
don't support out of place writes or unwritten extents) most likely can
cache mappings for writes and zeroing.  Filesystems with static mappings
(like zonefs which are convenient wrappers around hardware) can cache
most everything too.

My next step for this prototype is to go build a real cache and make
fuse2fs manage the cache, which puts the filesystem in charge of
maintaining the cache however is appropriate for the design.

> > FUSE_IOMAP_OP_DIRECT being set means directio, and it not being set
> > means pagecache.
> >
> > (and one flag, for 6 different types of IO)
> 
> Why does this make a difference?

Different allocation strategies -- we can use delayed allocation for
pagecache writes, whereas with direct writes we must have real disk
space.

> Okay, maybe I can imagine different allocation strategies.  Which
> means that it only matters for the write case?

Probably.  I don't see why a directio read would be any different from a
pagecache read(ahead) but the distinction exists for the in-kernel
iomap callers.

> > FUSE_IOMAP_OP_REPORT is set all by itself for things like FIEMAP and
> > SEEK_DATA/HOLE.
> 
> Which should again always be the same as the read case, no?

Not entirely -- if the fuse driver is doing weird caching things with
file data blocks, a read requires it to invalidate its own cache,
whereas it needn't do anything for a mapping report.  fuse2fs is guilty
of this, because it does ... crazy things.

Also for now I don't support read/write to inline data files, though I
think it would be possible to use the FUSE_READ/FUSE_WRITE for that...
as soon as I find a filesystem where inline data for regular files isn't
a giant trash fire and can be QAd properly.

--D

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by Darrick J. Wong 7 months, 3 weeks ago

On Sun, Apr 20, 2025 at 08:33:40PM -0500, John Groves wrote:
> On completion of GET_FMAP message/response, setup the full famfs
> metadata such that it's possible to handle read/write/mmap directly to
> dax. Note that the devdax_iomap plumbing is not in yet...
> 
> Update MAINTAINERS for the new files.
> 
> Signed-off-by: John Groves <john@groves.net>
> ---
>  MAINTAINERS               |   9 +
>  fs/fuse/Makefile          |   2 +-
>  fs/fuse/dir.c             |   3 +
>  fs/fuse/famfs.c           | 344 ++++++++++++++++++++++++++++++++++++++
>  fs/fuse/famfs_kfmap.h     |  63 +++++++
>  fs/fuse/fuse_i.h          |  16 +-
>  fs/fuse/inode.c           |   2 +-
>  include/uapi/linux/fuse.h |  42 +++++
>  8 files changed, 477 insertions(+), 4 deletions(-)
>  create mode 100644 fs/fuse/famfs.c
>  create mode 100644 fs/fuse/famfs_kfmap.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 00e94bec401e..2a5a7e0e8b28 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8808,6 +8808,15 @@ F:	Documentation/networking/failover.rst
>  F:	include/net/failover.h
>  F:	net/core/failover.c
>  
> +FAMFS
> +M:	John Groves <jgroves@micron.com>
> +M:	John Groves <John@Groves.net>
> +L:	linux-cxl@vger.kernel.org
> +L:	linux-fsdevel@vger.kernel.org
> +S:	Supported
> +F:	fs/fuse/famfs.c
> +F:	fs/fuse/famfs_kfmap.h
> +
>  FANOTIFY
>  M:	Jan Kara <jack@suse.cz>
>  R:	Amir Goldstein <amir73il@gmail.com>
> diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
> index 3f0f312a31c1..65a12975d734 100644
> --- a/fs/fuse/Makefile
> +++ b/fs/fuse/Makefile
> @@ -16,5 +16,5 @@ fuse-$(CONFIG_FUSE_DAX) += dax.o
>  fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
>  fuse-$(CONFIG_SYSCTL) += sysctl.o
>  fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
> -
> +fuse-$(CONFIG_FUSE_FAMFS_DAX) += famfs.o
>  virtiofs-y := virtio_fs.o
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index ae135c55b9f6..b28a1e912d6b 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -405,6 +405,9 @@ fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid)
>  	fmap_size = args.out_args[0].size;
>  	pr_notice("%s: nodei=%lld fmap_size=%ld\n", __func__, nodeid, fmap_size);
>  
> +	/* Convert fmap into in-memory format and hang from inode */
> +	famfs_file_init_dax(fm, inode, fmap_buf, fmap_size);
> +
>  	return 0;
>  }
>  #endif
> diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
> new file mode 100644
> index 000000000000..e62c047d0950
> --- /dev/null
> +++ b/fs/fuse/famfs.c
> @@ -0,0 +1,344 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * famfs - dax file system for shared fabric-attached memory
> + *
> + * Copyright 2023-2025 Micron Technology, Inc.
> + *
> + * This file system, originally based on ramfs the dax support from xfs,
> + * is intended to allow multiple host systems to mount a common file system
> + * view of dax files that map to shared memory.
> + */
> +
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include <linux/dax.h>
> +#include <linux/iomap.h>
> +#include <linux/path.h>
> +#include <linux/namei.h>
> +#include <linux/string.h>
> +
> +#include "famfs_kfmap.h"
> +#include "fuse_i.h"
> +
> +
> +void
> +__famfs_meta_free(void *famfs_meta)
> +{
> +	struct famfs_file_meta *fmap = famfs_meta;
> +
> +	if (!fmap)
> +		return;
> +
> +	if (fmap) {
> +		switch (fmap->fm_extent_type) {
> +		case SIMPLE_DAX_EXTENT:
> +			kfree(fmap->se);
> +			break;
> +		case INTERLEAVED_EXTENT:

Are interleaved extents not DAX extents?  Why does one constant refer to
DAX but the other does not?

> +			if (fmap->ie)
> +				kfree(fmap->ie->ie_strips);
> +
> +			kfree(fmap->ie);
> +			break;
> +		default:
> +			pr_err("%s: invalid fmap type\n", __func__);
> +			break;
> +		}
> +	}
> +	kfree(fmap);
> +}
> +
> +static int
> +famfs_check_ext_alignment(struct famfs_meta_simple_ext *se)
> +{
> +	int errs = 0;
> +
> +	if (se->dev_index != 0)
> +		errs++;
> +
> +	/* TODO: pass in alignment so we can support the other page sizes */
> +	if (!IS_ALIGNED(se->ext_offset, PMD_SIZE))
> +		errs++;
> +
> +	if (!IS_ALIGNED(se->ext_len, PMD_SIZE))
> +		errs++;
> +
> +	return errs;
> +}
> +
> +/**
> + * famfs_meta_alloc() - Allocate famfs file metadata
> + * @metap:       Pointer to an mcache_map_meta pointer
> + * @ext_count:  The number of extents needed
> + */
> +static int
> +famfs_meta_alloc_v3(

Err, what's with "v3"?  This is a new fs, right?

> +	void *fmap_buf,
> +	size_t fmap_buf_size,
> +	struct famfs_file_meta **metap)
> +{
> +	struct famfs_file_meta *meta = NULL;
> +	struct fuse_famfs_fmap_header *fmh;
> +	size_t extent_total = 0;
> +	size_t next_offset = 0;
> +	int errs = 0;
> +	int i, j;
> +	int rc;
> +
> +	fmh = (struct fuse_famfs_fmap_header *)fmap_buf;
> +
> +	/* Move past fmh in fmap_buf */
> +	next_offset += sizeof(*fmh);
> +	if (next_offset > fmap_buf_size) {
> +		pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
> +		       __func__, __LINE__, next_offset, fmap_buf_size);
> +		rc = -EINVAL;
> +		goto errout;
> +	}
> +
> +	if (fmh->nextents < 1) {
> +		pr_err("%s: nextents %d < 1\n", __func__, fmh->nextents);
> +		rc = -EINVAL;
> +		goto errout;
> +	}
> +
> +	if (fmh->nextents > FUSE_FAMFS_MAX_EXTENTS) {
> +		pr_err("%s: nextents %d > max (%d) 1\n",
> +		       __func__, fmh->nextents, FUSE_FAMFS_MAX_EXTENTS);
> +		rc = -E2BIG;
> +		goto errout;
> +	}
> +
> +	meta = kzalloc(sizeof(*meta), GFP_KERNEL);
> +	if (!meta)
> +		return -ENOMEM;
> +	meta->error = false;
> +
> +	meta->file_type = fmh->file_type;
> +	meta->file_size = fmh->file_size;
> +	meta->fm_extent_type = fmh->ext_type;
> +
> +	switch (fmh->ext_type) {
> +	case FUSE_FAMFS_EXT_SIMPLE: {
> +		struct fuse_famfs_simple_ext *se_in;
> +
> +		se_in = (struct fuse_famfs_simple_ext *)(fmap_buf + next_offset);
> +
> +		/* Move past simple extents */
> +		next_offset += fmh->nextents * sizeof(*se_in);
> +		if (next_offset > fmap_buf_size) {
> +			pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
> +			       __func__, __LINE__, next_offset, fmap_buf_size);
> +			rc = -EINVAL;
> +			goto errout;
> +		}
> +
> +		meta->fm_nextents = fmh->nextents;
> +
> +		meta->se = kcalloc(meta->fm_nextents, sizeof(*(meta->se)),
> +				   GFP_KERNEL);
> +		if (!meta->se) {
> +			rc = -ENOMEM;
> +			goto errout;
> +		}
> +
> +		if ((meta->fm_nextents > FUSE_FAMFS_MAX_EXTENTS) ||

FUSE_FAMFS_MAX_EXTENTS is 2?  I gather that simple files in famfs refer
to contiguous regions, but why two mappings?

> +		    (meta->fm_nextents < 1)) {
> +			rc = -EINVAL;
> +			goto errout;
> +		}
> +
> +		for (i = 0; i < fmh->nextents; i++) {
> +			meta->se[i].dev_index  = se_in[i].se_devindex;
> +			meta->se[i].ext_offset = se_in[i].se_offset;
> +			meta->se[i].ext_len    = se_in[i].se_len;
> +
> +			/* Record bitmap of referenced daxdev indices */
> +			meta->dev_bitmap |= (1 << meta->se[i].dev_index);
> +
> +			errs += famfs_check_ext_alignment(&meta->se[i]);

Shouldn't you bail out at the first bad mapping?

> +			extent_total += meta->se[i].ext_len;
> +		}

I took a look at what's already in uapi/linux/fuse.h and saw that
there are two operations -- FUSE_{SETUP,REMOVE}MAPPING.  Those two fuse
upcalls seem to manage an interval tree in struct fuse_inode_dax, which
is used to feed fuse_iomap_begin.  Can you reuse this existing uapi
instead of defining a new one that's already pretty similar?

I'm wondering why create all this new code when fuse/dax.c already seems
to have the ability to cache mappings and pass them to dax_iomap_rw
without restrictions on the number of mappings and all that?

Maybe you're trying to avoid runtime upcalls, but then I would think
that you could teach the fuse/dax.c mapping code to pin the mappings
if there aren't that many of them in the first place, rather than
reinventing mappings?

It occurred to me (perhaps naively) that maybe you created FUSE_GETFMAP
because of this interleaving thing because it's probably faster to
upload a template for that than it would be to upload a large number of
mappings.  But I don't really grok why the interleaving exists, though I
guess it's for memory controllers interleaving memory devices or
something for better throughput?

I also see that famfs_meta_to_dax_offset does a linear walk of the
mapping array, which seems like it will be inefficient when
there are many mappings.

> +		break;
> +	}
> +
> +	case FUSE_FAMFS_EXT_INTERLEAVE: {
> +		s64 size_remainder = meta->file_size;
> +		struct fuse_famfs_iext *ie_in;
> +		int niext = fmh->nextents;
> +
> +		meta->fm_niext = niext;
> +
> +		/* Allocate interleaved extent */
> +		meta->ie = kcalloc(niext, sizeof(*(meta->ie)), GFP_KERNEL);
> +		if (!meta->ie) {
> +			rc = -ENOMEM;
> +			goto errout;
> +		}
> +
> +		/*
> +		 * Each interleaved extent has a simple extent list of strips.
> +		 * Outer loop is over separate interleaved extents

Hmm, so there's no checking on fmh->nextents here, so I guess we can
have as many sets of interleaved extents as we want?  Each with up to 16
simple mappings?

--D

> +		 */
> +		for (i = 0; i < niext; i++) {
> +			u64 nstrips;
> +			struct fuse_famfs_simple_ext *sie_in;
> +
> +			/* ie_in = one interleaved extent in fmap_buf */
> +			ie_in = (struct fuse_famfs_iext *)
> +				(fmap_buf + next_offset);
> +
> +			/* Move past one interleaved extent header in fmap_buf */
> +			next_offset += sizeof(*ie_in);
> +			if (next_offset > fmap_buf_size) {
> +				pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
> +				       __func__, __LINE__, next_offset, fmap_buf_size);
> +				rc = -EINVAL;
> +				goto errout;
> +			}
> +
> +			nstrips = ie_in->ie_nstrips;
> +			meta->ie[i].fie_chunk_size = ie_in->ie_chunk_size;
> +			meta->ie[i].fie_nstrips    = ie_in->ie_nstrips;
> +			meta->ie[i].fie_nbytes     = ie_in->ie_nbytes;
> +
> +			if (!meta->ie[i].fie_nbytes) {
> +				pr_err("%s: zero-length interleave!\n",
> +				       __func__);
> +				rc = -EINVAL;
> +				goto errout;
> +			}
> +
> +			/* sie_in = the strip extents in fmap_buf */
> +			sie_in = (struct fuse_famfs_simple_ext *)
> +				(fmap_buf + next_offset);
> +
> +			/* Move past strip extents in fmap_buf */
> +			next_offset += nstrips * sizeof(*sie_in);
> +			if (next_offset > fmap_buf_size) {
> +				pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
> +				       __func__, __LINE__, next_offset, fmap_buf_size);
> +				rc = -EINVAL;
> +				goto errout;
> +			}
> +
> +			if ((nstrips > FUSE_FAMFS_MAX_STRIPS) || (nstrips < 1)) {
> +				pr_err("%s: invalid nstrips=%lld (max=%d)\n",
> +				       __func__, nstrips,
> +				       FUSE_FAMFS_MAX_STRIPS);
> +				errs++;
> +			}
> +
> +			/* Allocate strip extent array */
> +			meta->ie[i].ie_strips = kcalloc(ie_in->ie_nstrips,
> +					sizeof(meta->ie[i].ie_strips[0]),
> +							GFP_KERNEL);
> +			if (!meta->ie[i].ie_strips) {
> +				rc = -ENOMEM;
> +				goto errout;
> +			}
> +
> +			/* Inner loop is over strips */
> +			for (j = 0; j < nstrips; j++) {
> +				struct famfs_meta_simple_ext *strips_out;
> +				u64 devindex = sie_in[j].se_devindex;
> +				u64 offset   = sie_in[j].se_offset;
> +				u64 len      = sie_in[j].se_len;
> +
> +				strips_out = meta->ie[i].ie_strips;
> +				strips_out[j].dev_index  = devindex;
> +				strips_out[j].ext_offset = offset;
> +				strips_out[j].ext_len    = len;
> +
> +				/* Record bitmap of referenced daxdev indices */
> +				meta->dev_bitmap |= (1 << devindex);
> +
> +				extent_total += len;
> +				errs += famfs_check_ext_alignment(&strips_out[j]);
> +				size_remainder -= len;
> +			}
> +		}
> +
> +		if (size_remainder > 0) {
> +			/* Sum of interleaved extent sizes is less than file size! */
> +			pr_err("%s: size_remainder %lld (0x%llx)\n",
> +			       __func__, size_remainder, size_remainder);
> +			rc = -EINVAL;
> +			goto errout;
> +		}
> +		break;
> +	}
> +
> +	default:
> +		pr_err("%s: invalid ext_type %d\n", __func__, fmh->ext_type);
> +		rc = -EINVAL;
> +		goto errout;
> +	}
> +
> +	if (errs > 0) {
> +		pr_err("%s: %d alignment errors found\n", __func__, errs);
> +		rc = -EINVAL;
> +		goto errout;
> +	}
> +
> +	/* More sanity checks */
> +	if (extent_total < meta->file_size) {
> +		pr_err("%s: file size %ld larger than map size %ld\n",
> +		       __func__, meta->file_size, extent_total);
> +		rc = -EINVAL;
> +		goto errout;
> +	}
> +
> +	*metap = meta;
> +
> +	return 0;
> +errout:
> +	__famfs_meta_free(meta);
> +	return rc;
> +}
> +
> +int
> +famfs_file_init_dax(
> +	struct fuse_mount *fm,
> +	struct inode *inode,
> +	void *fmap_buf,
> +	size_t fmap_size)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct famfs_file_meta *meta = NULL;
> +	int rc;
> +
> +	if (fi->famfs_meta) {
> +		pr_notice("%s: i_no=%ld fmap_size=%ld ALREADY INITIALIZED\n",
> +			  __func__,
> +			  inode->i_ino, fmap_size);
> +		return -EEXIST;
> +	}
> +
> +	rc = famfs_meta_alloc_v3(fmap_buf, fmap_size, &meta);
> +	if (rc)
> +		goto errout;
> +
> +	/* Publish the famfs metadata on fi->famfs_meta */
> +	inode_lock(inode);
> +	if (fi->famfs_meta) {
> +		rc = -EEXIST; /* file already has famfs metadata */
> +	} else {
> +		if (famfs_meta_set(fi, meta) != NULL) {
> +			pr_err("%s: file already had metadata\n", __func__);
> +			rc = -EALREADY;
> +			goto errout;
> +		}
> +		i_size_write(inode, meta->file_size);
> +		inode->i_flags |= S_DAX;
> +	}
> +	inode_unlock(inode);
> +
> + errout:
> +	if (rc)
> +		__famfs_meta_free(meta);
> +
> +	return rc;
> +}
> +
> diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h
> new file mode 100644
> index 000000000000..ce785d76719c
> --- /dev/null
> +++ b/fs/fuse/famfs_kfmap.h
> @@ -0,0 +1,63 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * famfs - dax file system for shared fabric-attached memory
> + *
> + * Copyright 2023-2025 Micron Technology, Inc.
> + */
> +#ifndef FAMFS_KFMAP_H
> +#define FAMFS_KFMAP_H
> +
> +/*
> + * These structures are the in-memory metadata format for famfs files. Metadata
> + * retrieved via the GET_FMAP response is converted to this format for use in
> + * resolving file mapping faults.
> + */
> +
> +enum famfs_file_type {
> +	FAMFS_REG,
> +	FAMFS_SUPERBLOCK,
> +	FAMFS_LOG,
> +};
> +
> +/* We anticipate the possiblity of supporting additional types of extents */
> +enum famfs_extent_type {
> +	SIMPLE_DAX_EXTENT,
> +	INTERLEAVED_EXTENT,
> +	INVALID_EXTENT_TYPE,
> +};
> +
> +struct famfs_meta_simple_ext {
> +	u64 dev_index;
> +	u64 ext_offset;
> +	u64 ext_len;
> +};
> +
> +struct famfs_meta_interleaved_ext {
> +	u64 fie_nstrips;
> +	u64 fie_chunk_size;
> +	u64 fie_nbytes;
> +	struct famfs_meta_simple_ext *ie_strips;
> +};
> +
> +/*
> + * Each famfs dax file has this hanging from its fuse_inode->famfs_meta
> + */
> +struct famfs_file_meta {
> +	bool                   error;
> +	enum famfs_file_type   file_type;
> +	size_t                 file_size;
> +	enum famfs_extent_type fm_extent_type;
> +	u64 dev_bitmap; /* bitmap of referenced daxdevs by index */
> +	union { /* This will make code a bit more readable */
> +		struct {
> +			size_t         fm_nextents;
> +			struct famfs_meta_simple_ext  *se;
> +		};
> +		struct {
> +			size_t         fm_niext;
> +			struct famfs_meta_interleaved_ext *ie;
> +		};
> +	};
> +};
> +
> +#endif /* FAMFS_KFMAP_H */
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 437177c2f092..d8e0ac784224 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -1557,11 +1557,18 @@ extern void fuse_sysctl_unregister(void);
>  #endif /* CONFIG_SYSCTL */
>  
>  /* famfs.c */
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> +int amfs_file_init_dax(struct fuse_mount *fm,
> +			     struct inode *inode, void *fmap_buf,
> +			     size_t fmap_size);
> +void __famfs_meta_free(void *map);
> +#endif
> +
>  static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
>  						       void *meta)
>  {
>  #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> -	return xchg(&fi->famfs_meta, meta);
> +	return cmpxchg(&fi->famfs_meta, NULL, meta);
>  #else
>  	return NULL;
>  #endif
> @@ -1569,7 +1576,12 @@ static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
>  
>  static inline void famfs_meta_free(struct fuse_inode *fi)
>  {
> -	/* Stub wil be connected in a subsequent commit */
> +#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
> +	if (fi->famfs_meta != NULL) {
> +		__famfs_meta_free(fi->famfs_meta);
> +		famfs_meta_set(fi, NULL);
> +	}
> +#endif
>  }
>  
>  static inline int fuse_file_famfs(struct fuse_inode *fi)
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 848c8818e6f7..e86bf330117f 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -118,7 +118,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>  		fuse_inode_backing_set(fi, NULL);
>  
>  	if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
> -		famfs_meta_set(fi, NULL);
> +		fi->famfs_meta = NULL; /* XXX new inodes currently not zeroed; why not? */
>  
>  	return &fi->inode;
>  
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index d85fb692cf3b..0f6ff1ffb23d 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -1286,4 +1286,46 @@ struct fuse_uring_cmd_req {
>  	uint8_t padding[6];
>  };
>  
> +/* Famfs fmap message components */
> +
> +#define FAMFS_FMAP_VERSION 1
> +
> +#define FUSE_FAMFS_MAX_EXTENTS 2
> +#define FUSE_FAMFS_MAX_STRIPS 16
> +
> +enum fuse_famfs_file_type {
> +	FUSE_FAMFS_FILE_REG,
> +	FUSE_FAMFS_FILE_SUPERBLOCK,
> +	FUSE_FAMFS_FILE_LOG,
> +};
> +
> +enum famfs_ext_type {
> +	FUSE_FAMFS_EXT_SIMPLE = 0,
> +	FUSE_FAMFS_EXT_INTERLEAVE = 1,
> +};
> +
> +struct fuse_famfs_simple_ext {
> +	uint32_t se_devindex;
> +	uint32_t reserved;
> +	uint64_t se_offset;
> +	uint64_t se_len;
> +};
> +
> +struct fuse_famfs_iext { /* Interleaved extent */
> +	uint32_t ie_nstrips;
> +	uint32_t ie_chunk_size;
> +	uint64_t ie_nbytes; /* Total bytes for this interleaved_ext; sum of strips may be more */
> +	uint64_t reserved;
> +};
> +
> +struct fuse_famfs_fmap_header {
> +	uint8_t file_type; /* enum famfs_file_type */
> +	uint8_t reserved;
> +	uint16_t fmap_version;
> +	uint32_t ext_type; /* enum famfs_ext_type */
> +	uint32_t nextents;
> +	uint32_t reserved0;
> +	uint64_t file_size;
> +	uint64_t reserved1;
> +};
>  #endif /* _LINUX_FUSE_H */
> -- 
> 2.49.0
> 
>

Re: [RFC PATCH 13/19] famfs_fuse: Create files with famfs fmaps

Posted by John Groves 7 months, 3 weeks ago

On 25/04/21 02:57PM, Darrick J. Wong wrote:
> On Sun, Apr 20, 2025 at 08:33:40PM -0500, John Groves wrote:
> > On completion of GET_FMAP message/response, setup the full famfs
> > metadata such that it's possible to handle read/write/mmap directly to
> > dax. Note that the devdax_iomap plumbing is not in yet...
> > 
> > Update MAINTAINERS for the new files.
> > 
> > Signed-off-by: John Groves <john@groves.net>
> > ---
> >  MAINTAINERS               |   9 +
> >  fs/fuse/Makefile          |   2 +-
> >  fs/fuse/dir.c             |   3 +
> >  fs/fuse/famfs.c           | 344 ++++++++++++++++++++++++++++++++++++++
> >  fs/fuse/famfs_kfmap.h     |  63 +++++++
> >  fs/fuse/fuse_i.h          |  16 +-
> >  fs/fuse/inode.c           |   2 +-
> >  include/uapi/linux/fuse.h |  42 +++++
> >  8 files changed, 477 insertions(+), 4 deletions(-)
> >  create mode 100644 fs/fuse/famfs.c
> >  create mode 100644 fs/fuse/famfs_kfmap.h
> > 
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 00e94bec401e..2a5a7e0e8b28 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -8808,6 +8808,15 @@ F:	Documentation/networking/failover.rst
> >  F:	include/net/failover.h
> >  F:	net/core/failover.c
> >  
> > +FAMFS
> > +M:	John Groves <jgroves@micron.com>
> > +M:	John Groves <John@Groves.net>
> > +L:	linux-cxl@vger.kernel.org
> > +L:	linux-fsdevel@vger.kernel.org
> > +S:	Supported
> > +F:	fs/fuse/famfs.c
> > +F:	fs/fuse/famfs_kfmap.h
> > +
> >  FANOTIFY
> >  M:	Jan Kara <jack@suse.cz>
> >  R:	Amir Goldstein <amir73il@gmail.com>
> > diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
> > index 3f0f312a31c1..65a12975d734 100644
> > --- a/fs/fuse/Makefile
> > +++ b/fs/fuse/Makefile
> > @@ -16,5 +16,5 @@ fuse-$(CONFIG_FUSE_DAX) += dax.o
> >  fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
> >  fuse-$(CONFIG_SYSCTL) += sysctl.o
> >  fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
> > -
> > +fuse-$(CONFIG_FUSE_FAMFS_DAX) += famfs.o
> >  virtiofs-y := virtio_fs.o
> > diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> > index ae135c55b9f6..b28a1e912d6b 100644
> > --- a/fs/fuse/dir.c
> > +++ b/fs/fuse/dir.c
> > @@ -405,6 +405,9 @@ fuse_get_fmap(struct fuse_mount *fm, struct inode *inode, u64 nodeid)
> >  	fmap_size = args.out_args[0].size;
> >  	pr_notice("%s: nodei=%lld fmap_size=%ld\n", __func__, nodeid, fmap_size);
> >  
> > +	/* Convert fmap into in-memory format and hang from inode */
> > +	famfs_file_init_dax(fm, inode, fmap_buf, fmap_size);
> > +
> >  	return 0;
> >  }
> >  #endif
> > diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
> > new file mode 100644
> > index 000000000000..e62c047d0950
> > --- /dev/null
> > +++ b/fs/fuse/famfs.c
> > @@ -0,0 +1,344 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * famfs - dax file system for shared fabric-attached memory
> > + *
> > + * Copyright 2023-2025 Micron Technology, Inc.
> > + *
> > + * This file system, originally based on ramfs and the dax support from xfs,
> > + * is intended to allow multiple host systems to mount a common file system
> > + * view of dax files that map to shared memory.
> > + */
> > +
> > +#include <linux/fs.h>
> > +#include <linux/mm.h>
> > +#include <linux/dax.h>
> > +#include <linux/iomap.h>
> > +#include <linux/path.h>
> > +#include <linux/namei.h>
> > +#include <linux/string.h>
> > +
> > +#include "famfs_kfmap.h"
> > +#include "fuse_i.h"
> > +
> > +
> > +void
> > +__famfs_meta_free(void *famfs_meta)
> > +{
> > +	struct famfs_file_meta *fmap = famfs_meta;
> > +
> > +	if (!fmap)
> > +		return;
> > +
> > +	if (fmap) {
> > +		switch (fmap->fm_extent_type) {
> > +		case SIMPLE_DAX_EXTENT:
> > +			kfree(fmap->se);
> > +			break;
> > +		case INTERLEAVED_EXTENT:
> 
> Are interleaved extents not DAX extents?  Why does one constant refer to
> DAX but the other does not?

All extents are DAX. Naming evolved over 2+ years, and could be cleaned up.

> 
> > +			if (fmap->ie)
> > +				kfree(fmap->ie->ie_strips);
> > +
> > +			kfree(fmap->ie);
> > +			break;
> > +		default:
> > +			pr_err("%s: invalid fmap type\n", __func__);
> > +			break;
> > +		}
> > +	}
> > +	kfree(fmap);
> > +}
> > +
> > +static int
> > +famfs_check_ext_alignment(struct famfs_meta_simple_ext *se)
> > +{
> > +	int errs = 0;
> > +
> > +	if (se->dev_index != 0)
> > +		errs++;
> > +
> > +	/* TODO: pass in alignment so we can support the other page sizes */
> > +	if (!IS_ALIGNED(se->ext_offset, PMD_SIZE))
> > +		errs++;
> > +
> > +	if (!IS_ALIGNED(se->ext_len, PMD_SIZE))
> > +		errs++;
> > +
> > +	return errs;
> > +}
> > +
> > +/**
> > + * famfs_meta_alloc() - Allocate famfs file metadata
> > + * @metap:       Pointer to an mcache_map_meta pointer
> > + * @ext_count:  The number of extents needed
> > + */
> > +static int
> > +famfs_meta_alloc_v3(
> 
> Err, what's with "v3"?  This is a new fs, right?


Um, been working on this for 2+ years so there's a not-very-public legacy.
But I agree naming should be cleaned up.

> 
> > +	void *fmap_buf,
> > +	size_t fmap_buf_size,
> > +	struct famfs_file_meta **metap)
> > +{
> > +	struct famfs_file_meta *meta = NULL;
> > +	struct fuse_famfs_fmap_header *fmh;
> > +	size_t extent_total = 0;
> > +	size_t next_offset = 0;
> > +	int errs = 0;
> > +	int i, j;
> > +	int rc;
> > +
> > +	fmh = (struct fuse_famfs_fmap_header *)fmap_buf;
> > +
> > +	/* Move past fmh in fmap_buf */
> > +	next_offset += sizeof(*fmh);
> > +	if (next_offset > fmap_buf_size) {
> > +		pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
> > +		       __func__, __LINE__, next_offset, fmap_buf_size);
> > +		rc = -EINVAL;
> > +		goto errout;
> > +	}
> > +
> > +	if (fmh->nextents < 1) {
> > +		pr_err("%s: nextents %d < 1\n", __func__, fmh->nextents);
> > +		rc = -EINVAL;
> > +		goto errout;
> > +	}
> > +
> > +	if (fmh->nextents > FUSE_FAMFS_MAX_EXTENTS) {
> > +		pr_err("%s: nextents %d > max (%d) 1\n",
> > +		       __func__, fmh->nextents, FUSE_FAMFS_MAX_EXTENTS);
> > +		rc = -E2BIG;
> > +		goto errout;
> > +	}
> > +
> > +	meta = kzalloc(sizeof(*meta), GFP_KERNEL);
> > +	if (!meta)
> > +		return -ENOMEM;
> > +	meta->error = false;
> > +
> > +	meta->file_type = fmh->file_type;
> > +	meta->file_size = fmh->file_size;
> > +	meta->fm_extent_type = fmh->ext_type;
> > +
> > +	switch (fmh->ext_type) {
> > +	case FUSE_FAMFS_EXT_SIMPLE: {
> > +		struct fuse_famfs_simple_ext *se_in;
> > +
> > +		se_in = (struct fuse_famfs_simple_ext *)(fmap_buf + next_offset);
> > +
> > +		/* Move past simple extents */
> > +		next_offset += fmh->nextents * sizeof(*se_in);
> > +		if (next_offset > fmap_buf_size) {
> > +			pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
> > +			       __func__, __LINE__, next_offset, fmap_buf_size);
> > +			rc = -EINVAL;
> > +			goto errout;
> > +		}
> > +
> > +		meta->fm_nextents = fmh->nextents;
> > +
> > +		meta->se = kcalloc(meta->fm_nextents, sizeof(*(meta->se)),
> > +				   GFP_KERNEL);
> > +		if (!meta->se) {
> > +			rc = -ENOMEM;
> > +			goto errout;
> > +		}
> > +
> > +		if ((meta->fm_nextents > FUSE_FAMFS_MAX_EXTENTS) ||
> 
> FUSE_FAMFS_MAX_EXTENTS is 2?  I gather that simple files in famfs refer
> to contiguous regions, but why two mappings?

There is no forward-looking, or even current-term reason why it should be 
limited to 2; But famfs files are strictly pre-allocated, so it takes some 
special code to test the multi-extent code paths. We do that internally, 
hence 2 (rather than 1).

Where we do exercise much bigger lists of the same extents is in interleaved
setups - where the limit is higher.

But dialing it up or even removing the limit provided the GET_FMAP message
validates should be fine.

> 
> > +		    (meta->fm_nextents < 1)) {
> > +			rc = -EINVAL;
> > +			goto errout;
> > +		}
> > +
> > +		for (i = 0; i < fmh->nextents; i++) {
> > +			meta->se[i].dev_index  = se_in[i].se_devindex;
> > +			meta->se[i].ext_offset = se_in[i].se_offset;
> > +			meta->se[i].ext_len    = se_in[i].se_len;
> > +
> > +			/* Record bitmap of referenced daxdev indices */
> > +			meta->dev_bitmap |= (1 << meta->se[i].dev_index);
> > +
> > +			errs += famfs_check_ext_alignment(&meta->se[i]);
> 
> Shouldn't you bail out at the first bad mapping?

Probably yes; need to dredge old memory about this...

> 
> > +			extent_total += meta->se[i].ext_len;
> > +		}
> 
> I took a look at what's already in uapi/linux/fuse.h and saw that
> there are two operations -- FUSE_{SETUP,REMOVE}MAPPING.  Those two fuse
> upcalls seem to manage an interval tree in struct fuse_inode_dax, which
> is used to feed fuse_iomap_begin.  Can you reuse this existing uapi
> instead of defining a new one that's already pretty similar?

OK, so the pre-existing DAX stuff in fuse is for virtiofs, which is doing
a very narrow thing (which I don't understand completely, but Stefan is
on this thread - though if I were him I might not be paying attention :)
My net assessment: the pre-existing fuse dax stuff was not a viable platform
for a file system with many files.

I initially implemented famfs as a standalone file system (patches easy
to find, and there are branches in my github kernel repos - including one
called famfs_dual that has BOTH). The existing DAX stuff in fuse is quite
different from the fs-dax interface that xfs uses - and has no notify_failure
etc.

> 
> I'm wondering why create all this new code when fuse/dax.c already seems
> to have the ability to cache mappings and pass them to dax_iomap_rw
> without restrictions on the number of mappings and all that?
> 
> Maybe you're trying to avoid runtime upcalls, but then I would think
> that you could teach the fuse/dax.c mapping code to pin the mappings
> if there aren't that many of them in the first place, rather than
> reinventing mappings?
> 
> It occurred to me (perhaps naively) that maybe you created FUSE_GETFMAP
> because of this interleaving thing because it's probably faster to
> upload a template for that than it would be to upload a large number of
> mappings.  But I don't really grok why the interleaving exists, though I
> guess it's for memory controllers interleaving memory devices or
> something for better throughput?

In famfsv1 (the standalone version), user space "pushed" mappings into
the kernel, but fuse doesn't do it that way. It wants to do readdir, lookup,
etc. So GET_FMAP was the answer I came up with - and so far it works fine.

> 
> I also see that famfs_meta_to_dax_offset does a linear walk of the
> mapping array, which does not seem like it will be efficient when
> there are many mappings.

Right, that's no big deal. And if there's only one extent (or if the extents
are fixed-size), it's order 1.

> 
> > +		break;
> > +	}
> > +
> > +	case FUSE_FAMFS_EXT_INTERLEAVE: {
> > +		s64 size_remainder = meta->file_size;
> > +		struct fuse_famfs_iext *ie_in;
> > +		int niext = fmh->nextents;
> > +
> > +		meta->fm_niext = niext;
> > +
> > +		/* Allocate interleaved extent */
> > +		meta->ie = kcalloc(niext, sizeof(*(meta->ie)), GFP_KERNEL);
> > +		if (!meta->ie) {
> > +			rc = -ENOMEM;
> > +			goto errout;
> > +		}
> > +
> > +		/*
> > +		 * Each interleaved extent has a simple extent list of strips.
> > +		 * Outer loop is over separate interleaved extents
> 
> Hmm, so there's no checking on fmh->nextents here, so I guess we can
> have as many sets of interleaved extents as we want?  Each with up to 16
> simple mappings?
> 
> --D

OK, so I'm remembering a bit more about the legacy around extent limits. 
There are some MVP simplifications in the famfs metadata log format 
(which is orthogonal to the message and in-memory metadata formats here). 
An fmap in the log (a third format, but there is at least one more :-/) 
is a fully dimensioned compound structure that you can call sizeof on. 
So that is the second reason (in addition to preallocation) why we didn't 
need many extents.

Also, when we resolve file offsets to dax offsets, limit and validity
checking was already done when the GET_FMAP message was ingested.

I think for fuse famfs, that can be relaxed and ignored - especially if 
you're gonna test it :D.

Thanks for the review eyeballs, and let me know if you wanna talk through
some of this stuff.

Regards,
John