[RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs

John Groves posted 18 patches 3 months ago
[RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by John Groves 3 months ago
* FUSE_DAX_FMAP flag in INIT request/reply

* fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
  famfs-enabled connection

Signed-off-by: John Groves <john@groves.net>
---
 fs/fuse/fuse_i.h          |  3 +++
 fs/fuse/inode.c           | 14 ++++++++++++++
 include/uapi/linux/fuse.h |  4 ++++
 3 files changed, 21 insertions(+)

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 9d87ac48d724..a592c1002861 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -873,6 +873,9 @@ struct fuse_conn {
 	/* Use io_uring for communication */
 	unsigned int io_uring;
 
+	/* dev_dax_iomap support for famfs */
+	unsigned int famfs_iomap:1;
+
 	/** Maximum stack depth for passthrough backing files */
 	int max_stack_depth;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 29147657a99f..e48e11c3f9f3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 			}
 			if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
 				fc->io_uring = 1;
+			if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
+			    flags & FUSE_DAX_FMAP) {
+				/* XXX: Should also check that fuse server
+				 * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
+				 * since it is directing the kernel to access
+				 * dax memory directly - but this function
+				 * appears not to be called in fuse server
+				 * process context (b/c even if it drops
+				 * those capabilities, they are held here).
+				 */
+				fc->famfs_iomap = 1;
+			}
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -1450,6 +1462,8 @@ void fuse_send_init(struct fuse_mount *fm)
 		flags |= FUSE_SUBMOUNTS;
 	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
 		flags |= FUSE_PASSTHROUGH;
+	if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+		flags |= FUSE_DAX_FMAP;
 
 	/*
 	 * This is just an information flag for fuse server. No need to check
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 5e0eb41d967e..6c384640c79b 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -229,6 +229,8 @@
  *    - FUSE_URING_IN_OUT_HEADER_SZ
  *    - FUSE_URING_OP_IN_OUT_SZ
  *    - enum fuse_uring_cmd
+ *  7.43
+ *    - Add FUSE_DAX_FMAP capability - ability to handle in-kernel fsdax maps
  */
 
 #ifndef _LINUX_FUSE_H
@@ -435,6 +437,7 @@ struct fuse_file_lock {
  *		    of the request ID indicates resend requests
  * FUSE_ALLOW_IDMAP: allow creation of idmapped mounts
  * FUSE_OVER_IO_URING: Indicate that client supports io-uring
+ * FUSE_DAX_FMAP: kernel supports dev_dax_iomap (aka famfs) fmaps
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -482,6 +485,7 @@ struct fuse_file_lock {
 #define FUSE_DIRECT_IO_RELAX	FUSE_DIRECT_IO_ALLOW_MMAP
 #define FUSE_ALLOW_IDMAP	(1ULL << 40)
 #define FUSE_OVER_IO_URING	(1ULL << 41)
+#define FUSE_DAX_FMAP		(1ULL << 42)
 
 /**
  * CUSE INIT request/reply flags
-- 
2.49.0
Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by Amir Goldstein 3 months ago
On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
>
> * FUSE_DAX_FMAP flag in INIT request/reply
>
> * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
>   famfs-enabled connection
>
> Signed-off-by: John Groves <john@groves.net>
> ---
>  fs/fuse/fuse_i.h          |  3 +++
>  fs/fuse/inode.c           | 14 ++++++++++++++
>  include/uapi/linux/fuse.h |  4 ++++
>  3 files changed, 21 insertions(+)
>
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 9d87ac48d724..a592c1002861 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -873,6 +873,9 @@ struct fuse_conn {
>         /* Use io_uring for communication */
>         unsigned int io_uring;
>
> +       /* dev_dax_iomap support for famfs */
> +       unsigned int famfs_iomap:1;
> +

pls move up to the bit fields members.

>         /** Maximum stack depth for passthrough backing files */
>         int max_stack_depth;
>
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 29147657a99f..e48e11c3f9f3 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
>                         }
>                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
>                                 fc->io_uring = 1;
> +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> +                           flags & FUSE_DAX_FMAP) {
> +                               /* XXX: Should also check that fuse server
> +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> +                                * since it is directing the kernel to access
> +                                * dax memory directly - but this function
> +                                * appears not to be called in fuse server
> +                                * process context (b/c even if it drops
> +                                * those capabilities, they are held here).
> +                                */
> +                               fc->famfs_iomap = 1;
> +                       }

1. As long as the mapping requests are checking capabilities we should be ok
    Right?
2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?
3. Darrick mentioned the need for a synchronous INIT variant for his work on
    blockdev iomap support [1]

I also wonder how much of your patches and Darrick's patches end up
being an overlap?

Thanks,
Amir.

[1] https://lore.kernel.org/linux-fsdevel/20250613174413.GM6138@frogsfrogsfrogs/
Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by John Groves 3 months ago
On 25/07/04 09:54AM, Amir Goldstein wrote:
> On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
> >
> > * FUSE_DAX_FMAP flag in INIT request/reply
> >
> > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> >   famfs-enabled connection
> >
> > Signed-off-by: John Groves <john@groves.net>
> > ---
> >  fs/fuse/fuse_i.h          |  3 +++
> >  fs/fuse/inode.c           | 14 ++++++++++++++
> >  include/uapi/linux/fuse.h |  4 ++++
> >  3 files changed, 21 insertions(+)
> >
> > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > index 9d87ac48d724..a592c1002861 100644
> > --- a/fs/fuse/fuse_i.h
> > +++ b/fs/fuse/fuse_i.h
> > @@ -873,6 +873,9 @@ struct fuse_conn {
> >         /* Use io_uring for communication */
> >         unsigned int io_uring;
> >
> > +       /* dev_dax_iomap support for famfs */
> > +       unsigned int famfs_iomap:1;
> > +
> 
> pls move up to the bit fields members.

Oops, done, thanks.

> 
> >         /** Maximum stack depth for passthrough backing files */
> >         int max_stack_depth;
> >
> > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > index 29147657a99f..e48e11c3f9f3 100644
> > --- a/fs/fuse/inode.c
> > +++ b/fs/fuse/inode.c
> > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> >                         }
> >                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> >                                 fc->io_uring = 1;
> > +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > +                           flags & FUSE_DAX_FMAP) {
> > +                               /* XXX: Should also check that fuse server
> > +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > +                                * since it is directing the kernel to access
> > +                                * dax memory directly - but this function
> > +                                * appears not to be called in fuse server
> > +                                * process context (b/c even if it drops
> > +                                * those capabilities, they are held here).
> > +                                */
> > +                               fc->famfs_iomap = 1;
> > +                       }
> 
> 1. As long as the mapping requests are checking capabilities we should be ok
>     Right?

It depends on the definition of "are", or maybe of "mapping requests" ;)

Forgive me if this *is* obvious, but the fuse server capabilities are what
I think need to be checked here - not the app that is accessing a file.

An app accessing a regular file doesn't need permission to do raw access to
the underlying block dev, but the fuse server does - because it is directing
the kernel to access that for apps.

> 2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?

I *think* that's checking the capabilities of the app that is accessing the
file, and not the fuse server. But I might be wrong - I have not pulled very
hard on that thread yet.

> 3. Darrick mentioned the need for a synchronic INIT variant for his work on
>     blockdev iomap support [1]

I'm not sure that's the same thing (Darrick?), but I do think Darrick's
use case probably needs to check capabilities for a server that is sending
apps (via files) off to access extents of block devices.

> 
> I also wonder how much of your patches and Darrick's patches end up
> being an overlap?

Darrick and I spent some time hashing through this, and came to the conclusion
that the actual overlap is slim-to-none. 

> 
> Thanks,
> Amir.
> 
> [1] https://lore.kernel.org/linux-fsdevel/20250613174413.GM6138@frogsfrogsfrogs/

Thank you!
John
Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by Darrick J. Wong 3 months ago
On Fri, Jul 04, 2025 at 08:39:59AM -0500, John Groves wrote:
> On 25/07/04 09:54AM, Amir Goldstein wrote:
> > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
> > >
> > > * FUSE_DAX_FMAP flag in INIT request/reply
> > >
> > > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> > >   famfs-enabled connection
> > >
> > > Signed-off-by: John Groves <john@groves.net>
> > > ---
> > >  fs/fuse/fuse_i.h          |  3 +++
> > >  fs/fuse/inode.c           | 14 ++++++++++++++
> > >  include/uapi/linux/fuse.h |  4 ++++
> > >  3 files changed, 21 insertions(+)
> > >
> > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > > index 9d87ac48d724..a592c1002861 100644
> > > --- a/fs/fuse/fuse_i.h
> > > +++ b/fs/fuse/fuse_i.h
> > > @@ -873,6 +873,9 @@ struct fuse_conn {
> > >         /* Use io_uring for communication */
> > >         unsigned int io_uring;
> > >
> > > +       /* dev_dax_iomap support for famfs */
> > > +       unsigned int famfs_iomap:1;
> > > +
> > 
> > pls move up to the bit fields members.
> 
> Oops, done, thanks.
> 
> > 
> > >         /** Maximum stack depth for passthrough backing files */
> > >         int max_stack_depth;
> > >
> > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > > index 29147657a99f..e48e11c3f9f3 100644
> > > --- a/fs/fuse/inode.c
> > > +++ b/fs/fuse/inode.c
> > > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> > >                         }
> > >                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> > >                                 fc->io_uring = 1;
> > > +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > > +                           flags & FUSE_DAX_FMAP) {
> > > +                               /* XXX: Should also check that fuse server
> > > +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > > +                                * since it is directing the kernel to access
> > > +                                * dax memory directly - but this function
> > > +                                * appears not to be called in fuse server
> > > +                                * process context (b/c even if it drops
> > > +                                * those capabilities, they are held here).
> > > +                                */
> > > +                               fc->famfs_iomap = 1;
> > > +                       }
> > 
> > 1. As long as the mapping requests are checking capabilities we should be ok
> >     Right?
> 
> It depends on the definition of "are", or maybe of "mapping requests" ;)
> 
> Forgive me if this *is* obvious, but the fuse server capabilities are what
> I think need to be checked here - not the app that it accessing a file.
> 
> An app accessing a regular file doesn't need permission to do raw access to
> the underlying block dev, but the fuse server does - becuase it is directing
> the kernel to access that for apps.
> 
> > 2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?
> 
> I *think* that's checking the capabilities of the app that is accessing the
> file, and not the fuse server. But I might be wrong - I have not pulled very
> hard on that thread yet.

The init reply should be processed in the context of the fuse server.
At that point the kernel hasn't exposed the fs to user programs, so
(AFAICT) there won't be any other programs accessing that fuse mount.

> > 3. Darrick mentioned the need for a synchronic INIT variant for his work on
> >     blockdev iomap support [1]
> 
> I'm not sure that's the same thing (Darrick?), but I do think Darrick's
> use case probably needs to check capabilities for a server that is sending
> apps (via files) off to access extents of block devices.

I don't know either, Miklos hasn't responded to my questions.  I think
the motivation for a synchronous 

As for fuse/iomap, I only need to ask the kernel if iomap support
is available before calling ext2fs_open2() because the iomap question
has some implications for how we open the ext4 filesystem.

> > I also wonder how much of your patches and Darrick's patches end up
> > being an overlap?
> 
> Darrick and I spent some time hashing through this, and came to the conclusion
> that the actual overlap is slim-to-none. 

Yeah.  The neat thing about FMAPs is that you can establish repeating
patterns, which is useful for interleaved DRAM/pmem devices.  Disk
filesystems don't do repeating patterns, so they'd much rather manage
non-repeating mappings.

--D

> > 
> > Thanks,
> > Amir.
> > 
> > [1] https://lore.kernel.org/linux-fsdevel/20250613174413.GM6138@frogsfrogsfrogs/
> 
> Thank you!
> John
> 
Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by John Groves 3 months ago
On 25/07/07 10:39AM, Darrick J. Wong wrote:
> On Fri, Jul 04, 2025 at 08:39:59AM -0500, John Groves wrote:
> > On 25/07/04 09:54AM, Amir Goldstein wrote:
> > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
> > > >
> > > > * FUSE_DAX_FMAP flag in INIT request/reply
> > > >
> > > > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> > > >   famfs-enabled connection
> > > >
> > > > Signed-off-by: John Groves <john@groves.net>
> > > > ---
> > > >  fs/fuse/fuse_i.h          |  3 +++
> > > >  fs/fuse/inode.c           | 14 ++++++++++++++
> > > >  include/uapi/linux/fuse.h |  4 ++++
> > > >  3 files changed, 21 insertions(+)
> > > >
> > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > > > index 9d87ac48d724..a592c1002861 100644
> > > > --- a/fs/fuse/fuse_i.h
> > > > +++ b/fs/fuse/fuse_i.h
> > > > @@ -873,6 +873,9 @@ struct fuse_conn {
> > > >         /* Use io_uring for communication */
> > > >         unsigned int io_uring;
> > > >
> > > > +       /* dev_dax_iomap support for famfs */
> > > > +       unsigned int famfs_iomap:1;
> > > > +
> > > 
> > > pls move up to the bit fields members.
> > 
> > Oops, done, thanks.
> > 
> > > 
> > > >         /** Maximum stack depth for passthrough backing files */
> > > >         int max_stack_depth;
> > > >
> > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > > > index 29147657a99f..e48e11c3f9f3 100644
> > > > --- a/fs/fuse/inode.c
> > > > +++ b/fs/fuse/inode.c
> > > > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> > > >                         }
> > > >                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> > > >                                 fc->io_uring = 1;
> > > > +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > > > +                           flags & FUSE_DAX_FMAP) {
> > > > +                               /* XXX: Should also check that fuse server
> > > > +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > > > +                                * since it is directing the kernel to access
> > > > +                                * dax memory directly - but this function
> > > > +                                * appears not to be called in fuse server
> > > > +                                * process context (b/c even if it drops
> > > > +                                * those capabilities, they are held here).
> > > > +                                */
> > > > +                               fc->famfs_iomap = 1;
> > > > +                       }
> > > 
> > > 1. As long as the mapping requests are checking capabilities we should be ok
> > >     Right?
> > 
> > It depends on the definition of "are", or maybe of "mapping requests" ;)
> > 
> > Forgive me if this *is* obvious, but the fuse server capabilities are what
> > I think need to be checked here - not the app that it accessing a file.
> > 
> > An app accessing a regular file doesn't need permission to do raw access to
> > the underlying block dev, but the fuse server does - becuase it is directing
> > the kernel to access that for apps.
> > 
> > > 2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?
> > 
> > I *think* that's checking the capabilities of the app that is accessing the
> > file, and not the fuse server. But I might be wrong - I have not pulled very
> > hard on that thread yet.
> 
> The init reply should be processed in the context of the fuse server.
> At that point the kernel hasn't exposed the fs to user programs, so
> (AFAICT) there won't be any other programs accessing that fuse mount.

Hmm. It would be good if you're right about that. My fuse server *is* running
as root, and when I check those capabilities in process_init_reply(), I
find those capabilities. So far so good.

Then I added code to my fuse server to drop those capabilities prior to
starting the fuse session (prctl(PR_CAPBSET_DROP, CAP_SYS_RAWIO) and
prctl(PR_CAPBSET_DROP, CAP_SYS_ADMIN)). I expected (hoped?) to see those
capabilities disappear in process_init_reply() - but they did not disappear.

I'm all ears if somebody can see a flaw in my logic here. Otherwise, the
capabilities need to be stashed away before the reply is processed, when
fs/fuse *is* running in fuse server context.

I'm somewhat surprised if that isn't already happening somewhere...

> 
> > > 3. Darrick mentioned the need for a synchronic INIT variant for his work on
> > >     blockdev iomap support [1]
> > 
> > I'm not sure that's the same thing (Darrick?), but I do think Darrick's
> > use case probably needs to check capabilities for a server that is sending
> > apps (via files) off to access extents of block devices.
> 
> I don't know either, Miklos hasn't responded to my questions.  I think
> the motivation for a synchronous 

?

> 
> As for fuse/iomap, I just only need to ask the kernel if iomap support
> is available before calling ext2fs_open2() because the iomap question
> has some implications for how we open the ext4 filesystem.
> 
> > > I also wonder how much of your patches and Darrick's patches end up
> > > being an overlap?
> > 
> > Darrick and I spent some time hashing through this, and came to the conclusion
> > that the actual overlap is slim-to-none. 
> 
> Yeah.  The neat thing about FMAPs is that you can establish repeating
> patterns, which is useful for interleaved DRAM/pmem devices.  Disk
> filesystems don't do repeating patterns, so they'd much rather manage
> non-repeating mappings.

Right. Interleaving is critical to how we use memory, so fmaps are designed
to support it.

Tangent: at some point a broader-than-just-me discussion of how block devices
have the device mapper, but memory has no such layout tools, might be good
to have. Without such a thing (which might or might not be possible/practical),
it's essential that famfs do the interleaving. Lacking a mapper layer also
means that we need dax to provide a clean "device abstraction" (meaning
a single CXL allocation [which has a uuid/tag] needs to appear as a single
dax device whether or not it's HPA-contiguous).

Cheers,
John

Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by Darrick J. Wong 3 months ago
On Tue, Jul 08, 2025 at 07:02:03AM -0500, John Groves wrote:
> On 25/07/07 10:39AM, Darrick J. Wong wrote:
> > On Fri, Jul 04, 2025 at 08:39:59AM -0500, John Groves wrote:
> > > On 25/07/04 09:54AM, Amir Goldstein wrote:
> > > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
> > > > >
> > > > > * FUSE_DAX_FMAP flag in INIT request/reply
> > > > >
> > > > > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> > > > >   famfs-enabled connection
> > > > >
> > > > > Signed-off-by: John Groves <john@groves.net>
> > > > > ---
> > > > >  fs/fuse/fuse_i.h          |  3 +++
> > > > >  fs/fuse/inode.c           | 14 ++++++++++++++
> > > > >  include/uapi/linux/fuse.h |  4 ++++
> > > > >  3 files changed, 21 insertions(+)
> > > > >
> > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > > > > index 9d87ac48d724..a592c1002861 100644
> > > > > --- a/fs/fuse/fuse_i.h
> > > > > +++ b/fs/fuse/fuse_i.h
> > > > > @@ -873,6 +873,9 @@ struct fuse_conn {
> > > > >         /* Use io_uring for communication */
> > > > >         unsigned int io_uring;
> > > > >
> > > > > +       /* dev_dax_iomap support for famfs */
> > > > > +       unsigned int famfs_iomap:1;
> > > > > +
> > > > 
> > > > pls move up to the bit fields members.
> > > 
> > > Oops, done, thanks.
> > > 
> > > > 
> > > > >         /** Maximum stack depth for passthrough backing files */
> > > > >         int max_stack_depth;
> > > > >
> > > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > > > > index 29147657a99f..e48e11c3f9f3 100644
> > > > > --- a/fs/fuse/inode.c
> > > > > +++ b/fs/fuse/inode.c
> > > > > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> > > > >                         }
> > > > >                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> > > > >                                 fc->io_uring = 1;
> > > > > +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > > > > +                           flags & FUSE_DAX_FMAP) {
> > > > > +                               /* XXX: Should also check that fuse server
> > > > > +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > > > > +                                * since it is directing the kernel to access
> > > > > +                                * dax memory directly - but this function
> > > > > +                                * appears not to be called in fuse server
> > > > > +                                * process context (b/c even if it drops
> > > > > +                                * those capabilities, they are held here).
> > > > > +                                */
> > > > > +                               fc->famfs_iomap = 1;
> > > > > +                       }
> > > > 
> > > > 1. As long as the mapping requests are checking capabilities we should be ok
> > > >     Right?
> > > 
> > > It depends on the definition of "are", or maybe of "mapping requests" ;)
> > > 
> > > Forgive me if this *is* obvious, but the fuse server capabilities are what
> > > I think need to be checked here - not the app that it accessing a file.
> > > 
> > > An app accessing a regular file doesn't need permission to do raw access to
> > > the underlying block dev, but the fuse server does - becuase it is directing
> > > the kernel to access that for apps.
> > > 
> > > > 2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?
> > > 
> > > I *think* that's checking the capabilities of the app that is accessing the
> > > file, and not the fuse server. But I might be wrong - I have not pulled very
> > > hard on that thread yet.
> > 
> > The init reply should be processed in the context of the fuse server.
> > At that point the kernel hasn't exposed the fs to user programs, so
> > (AFAICT) there won't be any other programs accessing that fuse mount.
> 
> Hmm. It would be good if you're right about that. My fuse server *is* running
> as root, and when I check those capabilities in process_init_reply(), I
> find those capabilities. So far so good.
> 
> Then I added code to my fuse server to drop those capabilities prior to
> starting the fuse session (prctl(PR_CAPBSET_DROP, CAP_SYS_RAWIO) and 
> prctl(PR_CAPBSET_DROP, CAP_SYS_ADMIN). I expected (hoped?) to see those 
> capabilities disappear in process_init_reply() - but they did not disappear.
> 
> I'm all ears if somebody can see a flaw in my logic here. Otherwise, the
> capabilities need to be stashed away before the reply is processsed, when 
> fs/fuse *is* running in fuse server context.
> 
> I'm somewhat surprised if that isn't already happening somewhere...

Hrm.  I *thought* that since FUSE_INIT isn't queued as a background
command, it should still execute in the same process context as the fuse
server.

OTOH it also occurs to me that I have this code in fuse_send_init:

	if (has_capability_noaudit(current, CAP_SYS_RAWIO))
		flags |= FUSE_IOMAP | FUSE_IOMAP_DIRECTIO | FUSE_IOMAP_PAGECACHE;
	...
	ia->in.flags = flags;
	ia->in.flags2 = flags >> 32;

which means that we only advertise iomap support in FUSE_INIT if the
process running fuse_fill_super (which you hope is the fuse server)
actually has CAP_SYS_RAWIO.  Would that work for you?  Or are you
dropping privileges before you even open /dev/fuse?

Note: I might decide to relax that approach later on, since iomap
requires you to have opened a block device ... which implies that the
process had read/write access to start with; and maybe we're ok with
unprivileged fuse2fs servers running on a chmod 666 block device?

<shrug> always easier to /relax/ the privilege checks. :)

> > > > 3. Darrick mentioned the need for a synchronic INIT variant for his work on
> > > >     blockdev iomap support [1]
> > > 
> > > I'm not sure that's the same thing (Darrick?), but I do think Darrick's
> > > use case probably needs to check capabilities for a server that is sending
> > > apps (via files) off to access extents of block devices.
> > 
> > I don't know either, Miklos hasn't responded to my questions.  I think
> > the motivation for a synchronous 
> 
> ?

..."I don't know what his motivations for synchronous FUSE_INIT are."

I guess I fubard vim. :(

> > As for fuse/iomap, I just only need to ask the kernel if iomap support
> > is available before calling ext2fs_open2() because the iomap question
> > has some implications for how we open the ext4 filesystem.
> > 
> > > > I also wonder how much of your patches and Darrick's patches end up
> > > > being an overlap?
> > > 
> > > Darrick and I spent some time hashing through this, and came to the conclusion
> > > that the actual overlap is slim-to-none. 
> > 
> > Yeah.  The neat thing about FMAPs is that you can establish repeating
> > patterns, which is useful for interleaved DRAM/pmem devices.  Disk
> > filesystems don't do repeating patterns, so they'd much rather manage
> > non-repeating mappings.
> 
> Right. Interleaving is critical to how we use memory, so fmaps are designed
> to support it.
> 
> Tangent: at some point a broader-than-just-me discussion of how block devices
> have the device mapper, but memory has no such layout tools, might be good
> to have. Without such a thing (which might or might not be possible/practical),
> it's essential that famfs do the interleaving. Lacking a mapper layer also
> means that we need dax to provide a clean "device abstraction" (meaning
> a single CXL allocation [which has a uuid/tag] needs to appear as a single
> dax device whether or not it's HPA-contiguous).

Well it's not as simple as device-mapper, where we can intercept struct
bio and remap/split it to our heart's content.  I guess you could do
that with an iovec...?  Would be sorta amusing if you could software
RAID10 some DRAM. :P

--D

> Cheers,
> John
> 
> 
Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by John Groves 2 months, 4 weeks ago
On 25/07/08 06:53PM, Darrick J. Wong wrote:
> On Tue, Jul 08, 2025 at 07:02:03AM -0500, John Groves wrote:
> > On 25/07/07 10:39AM, Darrick J. Wong wrote:
> > > On Fri, Jul 04, 2025 at 08:39:59AM -0500, John Groves wrote:
> > > > On 25/07/04 09:54AM, Amir Goldstein wrote:
> > > > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
> > > > > >
> > > > > > * FUSE_DAX_FMAP flag in INIT request/reply
> > > > > >
> > > > > > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> > > > > >   famfs-enabled connection
> > > > > >
> > > > > > Signed-off-by: John Groves <john@groves.net>
> > > > > > ---
> > > > > >  fs/fuse/fuse_i.h          |  3 +++
> > > > > >  fs/fuse/inode.c           | 14 ++++++++++++++
> > > > > >  include/uapi/linux/fuse.h |  4 ++++
> > > > > >  3 files changed, 21 insertions(+)
> > > > > >
> > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > > > > > index 9d87ac48d724..a592c1002861 100644
> > > > > > --- a/fs/fuse/fuse_i.h
> > > > > > +++ b/fs/fuse/fuse_i.h
> > > > > > @@ -873,6 +873,9 @@ struct fuse_conn {
> > > > > >         /* Use io_uring for communication */
> > > > > >         unsigned int io_uring;
> > > > > >
> > > > > > +       /* dev_dax_iomap support for famfs */
> > > > > > +       unsigned int famfs_iomap:1;
> > > > > > +
> > > > > 
> > > > > pls move up to the bit fields members.
> > > > 
> > > > Oops, done, thanks.
> > > > 
> > > > > 
> > > > > >         /** Maximum stack depth for passthrough backing files */
> > > > > >         int max_stack_depth;
> > > > > >
> > > > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > > > > > index 29147657a99f..e48e11c3f9f3 100644
> > > > > > --- a/fs/fuse/inode.c
> > > > > > +++ b/fs/fuse/inode.c
> > > > > > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> > > > > >                         }
> > > > > >                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> > > > > >                                 fc->io_uring = 1;
> > > > > > +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > > > > > +                           flags & FUSE_DAX_FMAP) {
> > > > > > +                               /* XXX: Should also check that fuse server
> > > > > > +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > > > > > +                                * since it is directing the kernel to access
> > > > > > +                                * dax memory directly - but this function
> > > > > > +                                * appears not to be called in fuse server
> > > > > > +                                * process context (b/c even if it drops
> > > > > > +                                * those capabilities, they are held here).
> > > > > > +                                */
> > > > > > +                               fc->famfs_iomap = 1;
> > > > > > +                       }
> > > > > 
> > > > > 1. As long as the mapping requests are checking capabilities we should be ok
> > > > >     Right?
> > > > 
> > > > It depends on the definition of "are", or maybe of "mapping requests" ;)
> > > > 
> > > > Forgive me if this *is* obvious, but the fuse server capabilities are what
> > > > I think need to be checked here - not the app that it accessing a file.
> > > > 
> > > > An app accessing a regular file doesn't need permission to do raw access to
> > > > the underlying block dev, but the fuse server does - becuase it is directing
> > > > the kernel to access that for apps.
> > > > 
> > > > > 2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?
> > > > 
> > > > I *think* that's checking the capabilities of the app that is accessing the
> > > > file, and not the fuse server. But I might be wrong - I have not pulled very
> > > > hard on that thread yet.
> > > 
> > > The init reply should be processed in the context of the fuse server.
> > > At that point the kernel hasn't exposed the fs to user programs, so
> > > (AFAICT) there won't be any other programs accessing that fuse mount.
> > 
> > Hmm. It would be good if you're right about that. My fuse server *is* running
> > as root, and when I check those capabilities in process_init_reply(), I
> > find those capabilities. So far so good.
> > 
> > Then I added code to my fuse server to drop those capabilities prior to
> > starting the fuse session (prctl(PR_CAPBSET_DROP, CAP_SYS_RAWIO) and 
> > prctl(PR_CAPBSET_DROP, CAP_SYS_ADMIN). I expected (hoped?) to see those 
> > capabilities disappear in process_init_reply() - but they did not disappear.
> > 
> > I'm all ears if somebody can see a flaw in my logic here. Otherwise, the
> > capabilities need to be stashed away before the reply is processsed, when 
> > fs/fuse *is* running in fuse server context.
> > 
> > I'm somewhat surprised if that isn't already happening somewhere...
> 
> Hrm.  I *thought* that since FUSE_INIT isn't queued as a background
> command, it should still execute in the same process context as the fuse
> server.
> 
> OTOH it also occurs to me that I have this code in fuse_send_init:
> 
> 	if (has_capability_noaudit(current, CAP_SYS_RAWIO))
> 		flags |= FUSE_IOMAP | FUSE_IOMAP_DIRECTIO | FUSE_IOMAP_PAGECACHE;
> 	...
> 	ia->in.flags = flags;
> 	ia->in.flags2 = flags >> 32;
> 
> which means that we only advertise iomap support in FUSE_INIT if the
> process running fuse_fill_super (which you hope is the fuse server)
> actually has CAP_SYS_RAWIO.  Would that work for you?  Or are you
> dropping privileges before you even open /dev/fuse?

Ah - that might be the answer. I will check if dropped capabilities 
disappear in fuse_send_init. If so, I can work with that - not advertising 
the famfs capability unless the capability is present at that point looks 
like a perfectly good option. Thanks for that idea!

> 
> Note: I might decide to relax that approach later on, since iomap
> requires you to have opened a block device ... which implies that the
> process had read/write access to start with; and maybe we're ok with
> unprivileged fuse2fs servers running on a chmod 666 block device?
> 
> <shrug> always easier to /relax/ the privilege checks. :)

My policy on security is that I'm against it...

> 
> > > > > 3. Darrick mentioned the need for a synchronic INIT variant for his work on
> > > > >     blockdev iomap support [1]
> > > > 
> > > > I'm not sure that's the same thing (Darrick?), but I do think Darrick's
> > > > use case probably needs to check capabilities for a server that is sending
> > > > apps (via files) off to access extents of block devices.
> > > 
> > > I don't know either, Miklos hasn't responded to my questions.  I think
> > > the motivation for a synchronous 
> > 
> > ?
> 
> ..."I don't know what his motivations for synchronous FUSE_INIT are."
> 
> I guess I fubard vim. :(

So I'm not alone...

> 
> > > As for fuse/iomap, I just only need to ask the kernel if iomap support
> > > is available before calling ext2fs_open2() because the iomap question
> > > has some implications for how we open the ext4 filesystem.
> > > 
> > > > > I also wonder how much of your patches and Darrick's patches end up
> > > > > being an overlap?
> > > > 
> > > > Darrick and I spent some time hashing through this, and came to the conclusion
> > > > that the actual overlap is slim-to-none. 
> > > 
> > > Yeah.  The neat thing about FMAPs is that you can establish repeating
> > > patterns, which is useful for interleaved DRAM/pmem devices.  Disk
> > > filesystems don't do repeating patterns, so they'd much rather manage
> > > non-repeating mappings.
> > 
> > Right. Interleaving is critical to how we use memory, so fmaps are designed
> > to support it.
> > 
> > Tangent: at some point a broader-than-just-me discussion of how block devices
> > have the device mapper, but memory has no such layout tools, might be good
> > to have. Without such a thing (which might or might not be possible/practical),
> > it's essential that famfs do the interleaving. Lacking a mapper layer also
> > means that we need dax to provide a clean "device abstraction" (meaning
> > a single CXL allocation [which has a uuid/tag] needs to appear as a single
> > dax device whether or not it's HPA-contiguous).
> 
> Well it's not as simple as device-mapper, where we can intercept struct
> bio and remap/split it to our heart's content.  I guess you could do
> that with an iovec...?  Would be sorta amusing if you could software
> RAID10 some DRAM. :P

SW RAID, and mapper in general, has a "store and forward" property (or maybe
"store, transmogrify, and forward") that doesn't really work for memory. 
It's vma's (and files) that can remap memory address regions. Layered vma's 
anyone? I need to think about whether that's utter nonsense, or just mostly 
nonsense.

Continuing to think about this...

Thanks!
John


Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by John Groves 1 month, 3 weeks ago
On 25/07/10 08:32PM, John Groves wrote:
> On 25/07/08 06:53PM, Darrick J. Wong wrote:
> > On Tue, Jul 08, 2025 at 07:02:03AM -0500, John Groves wrote:
> > > On 25/07/07 10:39AM, Darrick J. Wong wrote:
> > > > On Fri, Jul 04, 2025 at 08:39:59AM -0500, John Groves wrote:
> > > > > On 25/07/04 09:54AM, Amir Goldstein wrote:
> > > > > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
> > > > > > >
> > > > > > > * FUSE_DAX_FMAP flag in INIT request/reply
> > > > > > >
> > > > > > > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> > > > > > >   famfs-enabled connection
> > > > > > >
> > > > > > > Signed-off-by: John Groves <john@groves.net>
> > > > > > > ---
> > > > > > >  fs/fuse/fuse_i.h          |  3 +++
> > > > > > >  fs/fuse/inode.c           | 14 ++++++++++++++
> > > > > > >  include/uapi/linux/fuse.h |  4 ++++
> > > > > > >  3 files changed, 21 insertions(+)
> > > > > > >
> > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > > > > > > index 9d87ac48d724..a592c1002861 100644
> > > > > > > --- a/fs/fuse/fuse_i.h
> > > > > > > +++ b/fs/fuse/fuse_i.h
> > > > > > > @@ -873,6 +873,9 @@ struct fuse_conn {
> > > > > > >         /* Use io_uring for communication */
> > > > > > >         unsigned int io_uring;
> > > > > > >
> > > > > > > +       /* dev_dax_iomap support for famfs */
> > > > > > > +       unsigned int famfs_iomap:1;
> > > > > > > +
> > > > > > 
> > > > > > pls move up to the bit fields members.
> > > > > 
> > > > > Oops, done, thanks.
> > > > > 
> > > > > > 
> > > > > > >         /** Maximum stack depth for passthrough backing files */
> > > > > > >         int max_stack_depth;
> > > > > > >
> > > > > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > > > > > > index 29147657a99f..e48e11c3f9f3 100644
> > > > > > > --- a/fs/fuse/inode.c
> > > > > > > +++ b/fs/fuse/inode.c
> > > > > > > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> > > > > > >                         }
> > > > > > >                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> > > > > > >                                 fc->io_uring = 1;
> > > > > > > +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > > > > > > +                           flags & FUSE_DAX_FMAP) {
> > > > > > > +                               /* XXX: Should also check that fuse server
> > > > > > > +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > > > > > > +                                * since it is directing the kernel to access
> > > > > > > +                                * dax memory directly - but this function
> > > > > > > +                                * appears not to be called in fuse server
> > > > > > > +                                * process context (b/c even if it drops
> > > > > > > +                                * those capabilities, they are held here).
> > > > > > > +                                */
> > > > > > > +                               fc->famfs_iomap = 1;
> > > > > > > +                       }
> > > > > > 
> > > > > > 1. As long as the mapping requests are checking capabilities we should be ok
> > > > > >     Right?
> > > > > 
> > > > > It depends on the definition of "are", or maybe of "mapping requests" ;)
> > > > > 
> > > > > Forgive me if this *is* obvious, but the fuse server capabilities are what
> > > > > I think need to be checked here - not the app that it accessing a file.
> > > > > 
> > > > > An app accessing a regular file doesn't need permission to do raw access to
> > > > > the underlying block dev, but the fuse server does - becuase it is directing
> > > > > the kernel to access that for apps.
> > > > > 
> > > > > > 2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?
> > > > > 
> > > > > I *think* that's checking the capabilities of the app that is accessing the
> > > > > file, and not the fuse server. But I might be wrong - I have not pulled very
> > > > > hard on that thread yet.
> > > > 
> > > > The init reply should be processed in the context of the fuse server.
> > > > At that point the kernel hasn't exposed the fs to user programs, so
> > > > (AFAICT) there won't be any other programs accessing that fuse mount.
> > > 
> > > Hmm. It would be good if you're right about that. My fuse server *is* running
> > > as root, and when I check those capabilities in process_init_reply(), I
> > > find those capabilities. So far so good.
> > > 
> > > Then I added code to my fuse server to drop those capabilities prior to
> > > starting the fuse session (prctl(PR_CAPBSET_DROP, CAP_SYS_RAWIO) and 
> > > prctl(PR_CAPBSET_DROP, CAP_SYS_ADMIN). I expected (hoped?) to see those 
> > > capabilities disappear in process_init_reply() - but they did not disappear.
> > > 
> > > I'm all ears if somebody can see a flaw in my logic here. Otherwise, the
> > > capabilities need to be stashed away before the reply is processsed, when 
> > > fs/fuse *is* running in fuse server context.
> > > 
> > > I'm somewhat surprised if that isn't already happening somewhere...
> > 
> > Hrm.  I *thought* that since FUSE_INIT isn't queued as a background
> > command, it should still execute in the same process context as the fuse
> > server.
> > 
> > OTOH it also occurs to me that I have this code in fuse_send_init:
> > 
> > 	if (has_capability_noaudit(current, CAP_SYS_RAWIO))
> > 		flags |= FUSE_IOMAP | FUSE_IOMAP_DIRECTIO | FUSE_IOMAP_PAGECACHE;
> > 	...
> > 	ia->in.flags = flags;
> > 	ia->in.flags2 = flags >> 32;
> > 
> > which means that we only advertise iomap support in FUSE_INIT if the
> > process running fuse_fill_super (which you hope is the fuse server)
> > actually has CAP_SYS_RAWIO.  Would that work for you?  Or are you
> > dropping privileges before you even open /dev/fuse?
> 
> Ah - that might be the answer. I will check if dropped capabilities 
> disappear in fuse_send_init. If so, I can work with that - not advertising 
> the famfs capability unless the capability is present at that point looks 
> like a perfectly good option. Thanks for that idea!

Review: the famfs fuse server directs the kernel to provide access to raw
(memory) devices, so it should be required to have the
CAP_SYS_RAWIO capability. fs/fuse needs to detect this at init time,
and fail the connection/mount if the capability is missing.

I initially attempted to do this verification in process_init_reply(), but
that doesn't run in the fuse server process context.

I am now checking the capability in fuse_send_init(), and not advertising
the FUSE_DAX_FMAP capability (in in_args->flags[2]) unless the server has 
CAP_SYS_RAWIO.

That requires that process_init_reply() reject FUSE_DAX_FMAP from a server
if FUSE_DAX_FMAP was not set in in_args->flags[2]. process_init_reply() was
not previously checking the in_args, but no big deal - this works.

This leads to an apparent dilemma in libfuse. In fuse_lowlevel_ops->init(),
I should check for (flags & FUSE_DAX_FMAP), and fail the connection if
that capability is not on offer. But fuse_lowlevel_ops->init() doesn't
have an obvious way to fail the connection. 

How should I do that? Hoping Bernd, Amir or the other libfuse people may 
have "the answer" (tm).

And of course if any of this doesn't sound like the way to go, let me know...

Thanks!
John

Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by Darrick J. Wong 1 month, 3 weeks ago
On Mon, Aug 11, 2025 at 01:30:53PM -0500, John Groves wrote:
> On 25/07/10 08:32PM, John Groves wrote:
> > On 25/07/08 06:53PM, Darrick J. Wong wrote:
> > > On Tue, Jul 08, 2025 at 07:02:03AM -0500, John Groves wrote:
> > > > On 25/07/07 10:39AM, Darrick J. Wong wrote:
> > > > > On Fri, Jul 04, 2025 at 08:39:59AM -0500, John Groves wrote:
> > > > > > On 25/07/04 09:54AM, Amir Goldstein wrote:
> > > > > > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
> > > > > > > >
> > > > > > > > * FUSE_DAX_FMAP flag in INIT request/reply
> > > > > > > >
> > > > > > > > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> > > > > > > >   famfs-enabled connection
> > > > > > > >
> > > > > > > > Signed-off-by: John Groves <john@groves.net>
> > > > > > > > ---
> > > > > > > >  fs/fuse/fuse_i.h          |  3 +++
> > > > > > > >  fs/fuse/inode.c           | 14 ++++++++++++++
> > > > > > > >  include/uapi/linux/fuse.h |  4 ++++
> > > > > > > >  3 files changed, 21 insertions(+)
> > > > > > > >
> > > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > > > > > > > index 9d87ac48d724..a592c1002861 100644
> > > > > > > > --- a/fs/fuse/fuse_i.h
> > > > > > > > +++ b/fs/fuse/fuse_i.h
> > > > > > > > @@ -873,6 +873,9 @@ struct fuse_conn {
> > > > > > > >         /* Use io_uring for communication */
> > > > > > > >         unsigned int io_uring;
> > > > > > > >
> > > > > > > > +       /* dev_dax_iomap support for famfs */
> > > > > > > > +       unsigned int famfs_iomap:1;
> > > > > > > > +
> > > > > > > 
> > > > > > > pls move up to the bit fields members.
> > > > > > 
> > > > > > Oops, done, thanks.
> > > > > > 
> > > > > > > 
> > > > > > > >         /** Maximum stack depth for passthrough backing files */
> > > > > > > >         int max_stack_depth;
> > > > > > > >
> > > > > > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > > > > > > > index 29147657a99f..e48e11c3f9f3 100644
> > > > > > > > --- a/fs/fuse/inode.c
> > > > > > > > +++ b/fs/fuse/inode.c
> > > > > > > > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> > > > > > > >                         }
> > > > > > > >                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> > > > > > > >                                 fc->io_uring = 1;
> > > > > > > > +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > > > > > > > +                           flags & FUSE_DAX_FMAP) {
> > > > > > > > +                               /* XXX: Should also check that fuse server
> > > > > > > > +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > > > > > > > +                                * since it is directing the kernel to access
> > > > > > > > +                                * dax memory directly - but this function
> > > > > > > > +                                * appears not to be called in fuse server
> > > > > > > > +                                * process context (b/c even if it drops
> > > > > > > > +                                * those capabilities, they are held here).
> > > > > > > > +                                */
> > > > > > > > +                               fc->famfs_iomap = 1;
> > > > > > > > +                       }
> > > > > > > 
> > > > > > > 1. As long as the mapping requests are checking capabilities we should be ok
> > > > > > >     Right?
> > > > > > 
> > > > > > It depends on the definition of "are", or maybe of "mapping requests" ;)
> > > > > > 
> > > > > > Forgive me if this *is* obvious, but the fuse server capabilities are what
> > > > > > I think need to be checked here - not the app that it accessing a file.
> > > > > > 
> > > > > > An app accessing a regular file doesn't need permission to do raw access to
> > > > > > the underlying block dev, but the fuse server does - becuase it is directing
> > > > > > the kernel to access that for apps.
> > > > > > 
> > > > > > > 2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?
> > > > > > 
> > > > > > I *think* that's checking the capabilities of the app that is accessing the
> > > > > > file, and not the fuse server. But I might be wrong - I have not pulled very
> > > > > > hard on that thread yet.
> > > > > 
> > > > > The init reply should be processed in the context of the fuse server.
> > > > > At that point the kernel hasn't exposed the fs to user programs, so
> > > > > (AFAICT) there won't be any other programs accessing that fuse mount.
> > > > 
> > > > Hmm. It would be good if you're right about that. My fuse server *is* running
> > > > as root, and when I check those capabilities in process_init_reply(), I
> > > > find those capabilities. So far so good.
> > > > 
> > > > Then I added code to my fuse server to drop those capabilities prior to
> > > > starting the fuse session (prctl(PR_CAPBSET_DROP, CAP_SYS_RAWIO) and 
> > > > prctl(PR_CAPBSET_DROP, CAP_SYS_ADMIN). I expected (hoped?) to see those 
> > > > capabilities disappear in process_init_reply() - but they did not disappear.
> > > > 
> > > > I'm all ears if somebody can see a flaw in my logic here. Otherwise, the
> > > > capabilities need to be stashed away before the reply is processsed, when 
> > > > fs/fuse *is* running in fuse server context.
> > > > 
> > > > I'm somewhat surprised if that isn't already happening somewhere...
> > > 
> > > Hrm.  I *thought* that since FUSE_INIT isn't queued as a background
> > > command, it should still execute in the same process context as the fuse
> > > server.
> > > 
> > > OTOH it also occurs to me that I have this code in fuse_send_init:
> > > 
> > > 	if (has_capability_noaudit(current, CAP_SYS_RAWIO))
> > > 		flags |= FUSE_IOMAP | FUSE_IOMAP_DIRECTIO | FUSE_IOMAP_PAGECACHE;
> > > 	...
> > > 	ia->in.flags = flags;
> > > 	ia->in.flags2 = flags >> 32;
> > > 
> > > which means that we only advertise iomap support in FUSE_INIT if the
> > > process running fuse_fill_super (which you hope is the fuse server)
> > > actually has CAP_SYS_RAWIO.  Would that work for you?  Or are you
> > > dropping privileges before you even open /dev/fuse?
> > 
> > Ah - that might be the answer. I will check if dropped capabilities 
> > disappear in fuse_send_init. If so, I can work with that - not advertising 
> > the famfs capability unless the capability is present at that point looks 
> > like a perfectly good option. Thanks for that idea!
> 
> Review: the famfs fuse server directs the kernel to provide access to raw
> (memory) devices, so it should should be required to have have the
> CAP_SYS_RAWIO capability. fs/fuse needs to detect this at init time,
> and fail the connection/mount if the capability is missing.
> 
> I initially attempted to do this verification in process_init_reply(), but
> that doesn't run in the fuse server process context.
> 
> I am now checking the capability in fuse_send_init(), and not advertising
> the FUSE_DAX_FMAP capability (in in_args->flags[2]) unless the server has 
> CAP_SYS_RAWIO.
> 
> That requires that process_init_reply() reject FUSE_DAX_FMAP from a server
> if FUSE_DAX_FMAP was not set in in_args->flags[2]. process_init_reply() was
> not previously checking the in_args, but no big deal - this works.
> 
> This leads to an apparent dilemma in libfuse. In fuse_lowlevel_ops->init(),
> I should check for (flags & FUSE_DAX_IOMAP), and fail the connection if
> that capability is not on offer. But fuse_lowlevel_ops->init() doesn't
> have an obvious way to fail the connection. 

Yeah, I really wish it did.  I particularly wish that it had a way to
negotiate all the FUSE_INIT stuff before libfuse daemonizes and starts
up the event loop.  Well, not all of it -- by the time we get to
FUSE_INIT we've basically decided to commit to mounting.

For fuseblk servers this is horrible, because the kernel needs to be
able to open the block device with O_EXCL during the mount() process,
which means you actually have to be able to (re)open the block device
from op_init, which can fail.  Unless there's a way to drop O_EXCL from
an open fd?

The awful way that I handle failure in FUSE_INIT is to call
fuse_session_exit, but that grossly leaves a dead mount in its place.

Hey wait, is this what Miklos was talking about when he mentioned
synchronous initialization?

For iomap I created a discovery ioctl so that you can open /dev/fuse and
ask the kernel about the iomap functionality that it supports, and you
can exit(1) without creating a fuse session.  The one goofy problem with
that is that there's a TOCTOU race if someone else does echo N >
/sys/module/fuse/parameters/enable_iomap, though fuse4fs can always
fall back to non-iomap mode.

--D

> How should I do that? Hoping Bernd, Amir or the other libfuse people may 
> have "the answer" (tm).
> 
> And of course if any of this doesn't sound like the way to go, let me know...
> 
> Thanks!
> John
> 
> 
Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by John Groves 1 month, 3 weeks ago
On 25/08/12 09:37AM, Darrick J. Wong wrote:
> On Mon, Aug 11, 2025 at 01:30:53PM -0500, John Groves wrote:
> > On 25/07/10 08:32PM, John Groves wrote:
> > > On 25/07/08 06:53PM, Darrick J. Wong wrote:
> > > > On Tue, Jul 08, 2025 at 07:02:03AM -0500, John Groves wrote:
> > > > > On 25/07/07 10:39AM, Darrick J. Wong wrote:
> > > > > > On Fri, Jul 04, 2025 at 08:39:59AM -0500, John Groves wrote:
> > > > > > > On 25/07/04 09:54AM, Amir Goldstein wrote:
> > > > > > > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
> > > > > > > > >
> > > > > > > > > * FUSE_DAX_FMAP flag in INIT request/reply
> > > > > > > > >
> > > > > > > > > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> > > > > > > > >   famfs-enabled connection
> > > > > > > > >
> > > > > > > > > Signed-off-by: John Groves <john@groves.net>
> > > > > > > > > ---
> > > > > > > > >  fs/fuse/fuse_i.h          |  3 +++
> > > > > > > > >  fs/fuse/inode.c           | 14 ++++++++++++++
> > > > > > > > >  include/uapi/linux/fuse.h |  4 ++++
> > > > > > > > >  3 files changed, 21 insertions(+)
> > > > > > > > >
> > > > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > > > > > > > > index 9d87ac48d724..a592c1002861 100644
> > > > > > > > > --- a/fs/fuse/fuse_i.h
> > > > > > > > > +++ b/fs/fuse/fuse_i.h
> > > > > > > > > @@ -873,6 +873,9 @@ struct fuse_conn {
> > > > > > > > >         /* Use io_uring for communication */
> > > > > > > > >         unsigned int io_uring;
> > > > > > > > >
> > > > > > > > > +       /* dev_dax_iomap support for famfs */
> > > > > > > > > +       unsigned int famfs_iomap:1;
> > > > > > > > > +
> > > > > > > > 
> > > > > > > > pls move up to the bit fields members.
> > > > > > > 
> > > > > > > Oops, done, thanks.
> > > > > > > 
> > > > > > > > 
> > > > > > > > >         /** Maximum stack depth for passthrough backing files */
> > > > > > > > >         int max_stack_depth;
> > > > > > > > >
> > > > > > > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > > > > > > > > index 29147657a99f..e48e11c3f9f3 100644
> > > > > > > > > --- a/fs/fuse/inode.c
> > > > > > > > > +++ b/fs/fuse/inode.c
> > > > > > > > > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> > > > > > > > >                         }
> > > > > > > > >                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> > > > > > > > >                                 fc->io_uring = 1;
> > > > > > > > > +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > > > > > > > > +                           flags & FUSE_DAX_FMAP) {
> > > > > > > > > +                               /* XXX: Should also check that fuse server
> > > > > > > > > +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > > > > > > > > +                                * since it is directing the kernel to access
> > > > > > > > > +                                * dax memory directly - but this function
> > > > > > > > > +                                * appears not to be called in fuse server
> > > > > > > > > +                                * process context (b/c even if it drops
> > > > > > > > > +                                * those capabilities, they are held here).
> > > > > > > > > +                                */
> > > > > > > > > +                               fc->famfs_iomap = 1;
> > > > > > > > > +                       }
> > > > > > > > 
> > > > > > > > 1. As long as the mapping requests are checking capabilities we should be ok
> > > > > > > >     Right?
> > > > > > > 
> > > > > > > It depends on the definition of "are", or maybe of "mapping requests" ;)
> > > > > > > 
> > > > > > > Forgive me if this *is* obvious, but the fuse server capabilities are what
> > > > > > > I think need to be checked here - not the app that it accessing a file.
> > > > > > > 
> > > > > > > An app accessing a regular file doesn't need permission to do raw access to
> > > > > > > the underlying block dev, but the fuse server does - becuase it is directing
> > > > > > > the kernel to access that for apps.
> > > > > > > 
> > > > > > > > 2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?
> > > > > > > 
> > > > > > > I *think* that's checking the capabilities of the app that is accessing the
> > > > > > > file, and not the fuse server. But I might be wrong - I have not pulled very
> > > > > > > hard on that thread yet.
> > > > > > 
> > > > > > The init reply should be processed in the context of the fuse server.
> > > > > > At that point the kernel hasn't exposed the fs to user programs, so
> > > > > > (AFAICT) there won't be any other programs accessing that fuse mount.
> > > > > 
> > > > > Hmm. It would be good if you're right about that. My fuse server *is* running
> > > > > as root, and when I check those capabilities in process_init_reply(), I
> > > > > find those capabilities. So far so good.
> > > > > 
> > > > > Then I added code to my fuse server to drop those capabilities prior to
> > > > > starting the fuse session (prctl(PR_CAPBSET_DROP, CAP_SYS_RAWIO) and 
> > > > > prctl(PR_CAPBSET_DROP, CAP_SYS_ADMIN). I expected (hoped?) to see those 
> > > > > capabilities disappear in process_init_reply() - but they did not disappear.
> > > > > 
> > > > > I'm all ears if somebody can see a flaw in my logic here. Otherwise, the
> > > > > capabilities need to be stashed away before the reply is processsed, when 
> > > > > fs/fuse *is* running in fuse server context.
> > > > > 
> > > > > I'm somewhat surprised if that isn't already happening somewhere...
> > > > 
> > > > Hrm.  I *thought* that since FUSE_INIT isn't queued as a background
> > > > command, it should still execute in the same process context as the fuse
> > > > server.
> > > > 
> > > > OTOH it also occurs to me that I have this code in fuse_send_init:
> > > > 
> > > > 	if (has_capability_noaudit(current, CAP_SYS_RAWIO))
> > > > 		flags |= FUSE_IOMAP | FUSE_IOMAP_DIRECTIO | FUSE_IOMAP_PAGECACHE;
> > > > 	...
> > > > 	ia->in.flags = flags;
> > > > 	ia->in.flags2 = flags >> 32;
> > > > 
> > > > which means that we only advertise iomap support in FUSE_INIT if the
> > > > process running fuse_fill_super (which you hope is the fuse server)
> > > > actually has CAP_SYS_RAWIO.  Would that work for you?  Or are you
> > > > dropping privileges before you even open /dev/fuse?
> > > 
> > > Ah - that might be the answer. I will check if dropped capabilities 
> > > disappear in fuse_send_init. If so, I can work with that - not advertising 
> > > the famfs capability unless the capability is present at that point looks 
> > > like a perfectly good option. Thanks for that idea!
> > 
> > Review: the famfs fuse server directs the kernel to provide access to raw
> > (memory) devices, so it should should be required to have have the
> > CAP_SYS_RAWIO capability. fs/fuse needs to detect this at init time,
> > and fail the connection/mount if the capability is missing.
> > 
> > I initially attempted to do this verification in process_init_reply(), but
> > that doesn't run in the fuse server process context.
> > 
> > I am now checking the capability in fuse_send_init(), and not advertising
> > the FUSE_DAX_FMAP capability (in in_args->flags[2]) unless the server has 
> > CAP_SYS_RAWIO.
> > 
> > That requires that process_init_reply() reject FUSE_DAX_FMAP from a server
> > if FUSE_DAX_FMAP was not set in in_args->flags[2]. process_init_reply() was
> > not previously checking the in_args, but no big deal - this works.
> > 
> > This leads to an apparent dilemma in libfuse. In fuse_lowlevel_ops->init(),
> > I should check for (flags & FUSE_DAX_IOMAP), and fail the connection if
> > that capability is not on offer. But fuse_lowlevel_ops->init() doesn't
> > have an obvious way to fail the connection. 
> 
> Yeah, I really wish it did.  I particularly wish that it had a way to
> negotiate all the FUSE_INIT stuff before libfuse daemonizes and starts
> up the event loop.  Well, not all of it -- by the time we get to
> FUSE_INIT we've basically decided to commit to mounting.
> 
> For fuseblk servers this is horrible, because the kernel needs to be
> able to open the block device with O_EXCL during the mount() process,
> which means you actually have to be able to (re)open the block device
> from op_init, which can fail.  Unless there's a way to drop O_EXCL from
> an open fd?
> 
> The awful way that I handle failure in FUSE_INIT is to call
> fuse_session_exit, but that grossly leaves a dead mount in its place.
> 
> Hey wait, is this what Mikulas was talking about when he mentioned
> synchronous initialization?
> 
> For iomap I created a discovery ioctl so that you can open /dev/fuse and
> ask the kernel about the iomap functionality that it supports, and you
> can exit(1) without creating a fuse session.  The one goofy problem with
> that is that there's a TOCTOU race if someone else does echo N >
> /sys/module/fuse/parameters/enable_iomap, though fuse4fs can always
> fall back to non-iomap mode.
> 
> --D

Thanks Darrick.

Hmm - synchronous init would be nice.

I tried calling fuse_session_exit(), but the broken mount was not an
improvement over a can't-do-I/O mount - which I get if the kernel rejects 
the capability currently known as FUSE_DAX_FMAP due to lack of CAP_SYS_RAWIO.

In my case, I think letting the mount complete with FUSE_DAX_FMAP rejected
is easier to detect and clean up than a fuse_session_exit() aborted mount.

Famfs mount is a cli operation that does a sequence of stuff before and after
the fork/exec of the famfs fuse server. That fork/exec can't really return 
an error in the conventional sense, so I'm stuck diagnosing whether the 
mount is good (which I already do, but it's a WIP). 

I already have to poll for the .meta files to appear (superblock and log), 
and that can be adapted pretty easily to check whether they can be read 
correctly (which they can't if famfs doesn't have daxdev access).

If mount was synchronous, I'd still need to give the fork/exec enough time
to fail and then detect that. That would probably be cleaner, but not by
a huge amount.

Thanks,
John

<snip>

Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by Darrick J. Wong 1 month, 3 weeks ago
On Wed, Aug 13, 2025 at 08:07:00AM -0500, John Groves wrote:
> On 25/08/12 09:37AM, Darrick J. Wong wrote:
> > On Mon, Aug 11, 2025 at 01:30:53PM -0500, John Groves wrote:
> > > On 25/07/10 08:32PM, John Groves wrote:
> > > > On 25/07/08 06:53PM, Darrick J. Wong wrote:
> > > > > On Tue, Jul 08, 2025 at 07:02:03AM -0500, John Groves wrote:
> > > > > > On 25/07/07 10:39AM, Darrick J. Wong wrote:
> > > > > > > On Fri, Jul 04, 2025 at 08:39:59AM -0500, John Groves wrote:
> > > > > > > > On 25/07/04 09:54AM, Amir Goldstein wrote:
> > > > > > > > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
> > > > > > > > > >
> > > > > > > > > > * FUSE_DAX_FMAP flag in INIT request/reply
> > > > > > > > > >
> > > > > > > > > > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> > > > > > > > > >   famfs-enabled connection
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: John Groves <john@groves.net>
> > > > > > > > > > ---
> > > > > > > > > >  fs/fuse/fuse_i.h          |  3 +++
> > > > > > > > > >  fs/fuse/inode.c           | 14 ++++++++++++++
> > > > > > > > > >  include/uapi/linux/fuse.h |  4 ++++
> > > > > > > > > >  3 files changed, 21 insertions(+)
> > > > > > > > > >
> > > > > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > > > > > > > > > index 9d87ac48d724..a592c1002861 100644
> > > > > > > > > > --- a/fs/fuse/fuse_i.h
> > > > > > > > > > +++ b/fs/fuse/fuse_i.h
> > > > > > > > > > @@ -873,6 +873,9 @@ struct fuse_conn {
> > > > > > > > > >         /* Use io_uring for communication */
> > > > > > > > > >         unsigned int io_uring;
> > > > > > > > > >
> > > > > > > > > > +       /* dev_dax_iomap support for famfs */
> > > > > > > > > > +       unsigned int famfs_iomap:1;
> > > > > > > > > > +
> > > > > > > > > 
> > > > > > > > > pls move up to the bit fields members.
> > > > > > > > 
> > > > > > > > Oops, done, thanks.
> > > > > > > > 
> > > > > > > > > 
> > > > > > > > > >         /** Maximum stack depth for passthrough backing files */
> > > > > > > > > >         int max_stack_depth;
> > > > > > > > > >
> > > > > > > > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > > > > > > > > > index 29147657a99f..e48e11c3f9f3 100644
> > > > > > > > > > --- a/fs/fuse/inode.c
> > > > > > > > > > +++ b/fs/fuse/inode.c
> > > > > > > > > > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> > > > > > > > > >                         }
> > > > > > > > > >                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> > > > > > > > > >                                 fc->io_uring = 1;
> > > > > > > > > > +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > > > > > > > > > +                           flags & FUSE_DAX_FMAP) {
> > > > > > > > > > +                               /* XXX: Should also check that fuse server
> > > > > > > > > > +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > > > > > > > > > +                                * since it is directing the kernel to access
> > > > > > > > > > +                                * dax memory directly - but this function
> > > > > > > > > > +                                * appears not to be called in fuse server
> > > > > > > > > > +                                * process context (b/c even if it drops
> > > > > > > > > > +                                * those capabilities, they are held here).
> > > > > > > > > > +                                */
> > > > > > > > > > +                               fc->famfs_iomap = 1;
> > > > > > > > > > +                       }
> > > > > > > > > 
> > > > > > > > > 1. As long as the mapping requests are checking capabilities we should be ok
> > > > > > > > >     Right?
> > > > > > > > 
> > > > > > > > It depends on the definition of "are", or maybe of "mapping requests" ;)
> > > > > > > > 
> > > > > > > > Forgive me if this *is* obvious, but the fuse server capabilities are what
> > > > > > > > > I think need to be checked here - not the app that is accessing a file.
> > > > > > > > 
> > > > > > > > An app accessing a regular file doesn't need permission to do raw access to
> > > > > > > > > the underlying block dev, but the fuse server does - because it is directing
> > > > > > > > the kernel to access that for apps.
> > > > > > > > 
> > > > > > > > > 2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?
> > > > > > > > 
> > > > > > > > I *think* that's checking the capabilities of the app that is accessing the
> > > > > > > > file, and not the fuse server. But I might be wrong - I have not pulled very
> > > > > > > > hard on that thread yet.
> > > > > > > 
> > > > > > > The init reply should be processed in the context of the fuse server.
> > > > > > > At that point the kernel hasn't exposed the fs to user programs, so
> > > > > > > (AFAICT) there won't be any other programs accessing that fuse mount.
> > > > > > 
> > > > > > Hmm. It would be good if you're right about that. My fuse server *is* running
> > > > > > as root, and when I check those capabilities in process_init_reply(), I
> > > > > > find those capabilities. So far so good.
> > > > > > 
> > > > > > Then I added code to my fuse server to drop those capabilities prior to
> > > > > > starting the fuse session (prctl(PR_CAPBSET_DROP, CAP_SYS_RAWIO) and 
> > > > > > > prctl(PR_CAPBSET_DROP, CAP_SYS_ADMIN)). I expected (hoped?) to see those
> > > > > > capabilities disappear in process_init_reply() - but they did not disappear.
> > > > > > 
> > > > > > I'm all ears if somebody can see a flaw in my logic here. Otherwise, the
> > > > > > > capabilities need to be stashed away before the reply is processed, when
> > > > > > fs/fuse *is* running in fuse server context.
> > > > > > 
> > > > > > I'm somewhat surprised if that isn't already happening somewhere...
> > > > > 
> > > > > Hrm.  I *thought* that since FUSE_INIT isn't queued as a background
> > > > > command, it should still execute in the same process context as the fuse
> > > > > server.
> > > > > 
> > > > > OTOH it also occurs to me that I have this code in fuse_send_init:
> > > > > 
> > > > > 	if (has_capability_noaudit(current, CAP_SYS_RAWIO))
> > > > > 		flags |= FUSE_IOMAP | FUSE_IOMAP_DIRECTIO | FUSE_IOMAP_PAGECACHE;
> > > > > 	...
> > > > > 	ia->in.flags = flags;
> > > > > 	ia->in.flags2 = flags >> 32;
> > > > > 
> > > > > which means that we only advertise iomap support in FUSE_INIT if the
> > > > > process running fuse_fill_super (which you hope is the fuse server)
> > > > > actually has CAP_SYS_RAWIO.  Would that work for you?  Or are you
> > > > > dropping privileges before you even open /dev/fuse?
> > > > 
> > > > Ah - that might be the answer. I will check if dropped capabilities 
> > > > disappear in fuse_send_init. If so, I can work with that - not advertising 
> > > > the famfs capability unless the capability is present at that point looks 
> > > > like a perfectly good option. Thanks for that idea!
> > > 
> > > Review: the famfs fuse server directs the kernel to provide access to raw
> > > (memory) devices, so it should be required to have the
> > > CAP_SYS_RAWIO capability. fs/fuse needs to detect this at init time,
> > > and fail the connection/mount if the capability is missing.
> > > 
> > > I initially attempted to do this verification in process_init_reply(), but
> > > that doesn't run in the fuse server process context.
> > > 
> > > I am now checking the capability in fuse_send_init(), and not advertising
> > > the FUSE_DAX_FMAP capability (in in_args->flags[2]) unless the server has 
> > > CAP_SYS_RAWIO.
> > > 
> > > That requires that process_init_reply() reject FUSE_DAX_FMAP from a server
> > > if FUSE_DAX_FMAP was not set in in_args->flags[2]. process_init_reply() was
> > > not previously checking the in_args, but no big deal - this works.
> > > 
> > > This leads to an apparent dilemma in libfuse. In fuse_lowlevel_ops->init(),
> > > I should check for (flags & FUSE_DAX_FMAP), and fail the connection if
> > > that capability is not on offer. But fuse_lowlevel_ops->init() doesn't
> > > have an obvious way to fail the connection. 
> > 
> > Yeah, I really wish it did.  I particularly wish that it had a way to
> > negotiate all the FUSE_INIT stuff before libfuse daemonizes and starts
> > up the event loop.  Well, not all of it -- by the time we get to
> > FUSE_INIT we've basically decided to commit to mounting.
> > 
> > For fuseblk servers this is horrible, because the kernel needs to be
> > able to open the block device with O_EXCL during the mount() process,
> > which means you actually have to be able to (re)open the block device
> > from op_init, which can fail.  Unless there's a way to drop O_EXCL from
> > an open fd?
> > 
> > The awful way that I handle failure in FUSE_INIT is to call
> > fuse_session_exit, but that grossly leaves a dead mount in its place.
> > 
> > Hey wait, is this what Mikulas was talking about when he mentioned
> > synchronous initialization?
> > 
> > For iomap I created a discovery ioctl so that you can open /dev/fuse and
> > ask the kernel about the iomap functionality that it supports, and you
> > can exit(1) without creating a fuse session.  The one goofy problem with
> > that is that there's a TOCTOU race if someone else does echo N >
> > /sys/module/fuse/parameters/enable_iomap, though fuse4fs can always
> > fall back to non-iomap mode.
> > 
> > --D
> 
> Thanks Darrick.
> 
> Hmm - synchronous init would be nice.
> 
> I tried calling fuse_session_exit(), but the broken mount was not an
> improvement over a can't-do-I/O mount - which I get if the kernel rejects 
> the capability currently known as FUSE_DAX_FMAP due to lack of CAP_SYS_RAWIO.
> 
> In my case, I think letting the mount complete with FUSE_DAX_FMAP rejected
> is easier to detect and clean up than a fuse_session_exit() aborted mount.

Yeah, you can always adjust the fuse server to react to an FMAP
rejection by returning EIO or something.  Though I guess it's nice to
have some detection that you can do prior to calling fuse_main so that
you can print complaints and exit(1) while the user is still paying
attention. :)

--D

> Famfs mount is a cli operation that does a sequence of stuff before and after
> the fork/exec of the famfs fuse server. That fork/exec can't really return 
> an error in the conventional sense, so I'm stuck diagnosing whether the 
> mount is good (which I already do, but it's a WIP). 
> 
> I already have to poll for the .meta files to appear (superblock and log), 
> and that can be adapted pretty easily to check whether they can be read 
> correctly (which they can't if famfs doesn't have daxdev access).
> 
> If mount was synchronous, I'd still need to give the fork/exec enough time
> to fail and then detect that. That would probably be cleaner, but not by
> a huge amount.
> 
> Thanks,
> John
> 
> <snip>
> 
> 
Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by Darrick J. Wong 2 months, 3 weeks ago
On Thu, Jul 10, 2025 at 08:32:13PM -0500, John Groves wrote:
> On 25/07/08 06:53PM, Darrick J. Wong wrote:
> > On Tue, Jul 08, 2025 at 07:02:03AM -0500, John Groves wrote:
> > > On 25/07/07 10:39AM, Darrick J. Wong wrote:
> > > > On Fri, Jul 04, 2025 at 08:39:59AM -0500, John Groves wrote:
> > > > > On 25/07/04 09:54AM, Amir Goldstein wrote:
> > > > > > On Thu, Jul 3, 2025 at 8:51 PM John Groves <John@groves.net> wrote:
> > > > > > >
> > > > > > > * FUSE_DAX_FMAP flag in INIT request/reply
> > > > > > >
> > > > > > > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> > > > > > >   famfs-enabled connection
> > > > > > >
> > > > > > > Signed-off-by: John Groves <john@groves.net>
> > > > > > > ---
> > > > > > >  fs/fuse/fuse_i.h          |  3 +++
> > > > > > >  fs/fuse/inode.c           | 14 ++++++++++++++
> > > > > > >  include/uapi/linux/fuse.h |  4 ++++
> > > > > > >  3 files changed, 21 insertions(+)
> > > > > > >
> > > > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > > > > > > index 9d87ac48d724..a592c1002861 100644
> > > > > > > --- a/fs/fuse/fuse_i.h
> > > > > > > +++ b/fs/fuse/fuse_i.h
> > > > > > > @@ -873,6 +873,9 @@ struct fuse_conn {
> > > > > > >         /* Use io_uring for communication */
> > > > > > >         unsigned int io_uring;
> > > > > > >
> > > > > > > +       /* dev_dax_iomap support for famfs */
> > > > > > > +       unsigned int famfs_iomap:1;
> > > > > > > +
> > > > > > 
> > > > > > pls move up to the bit fields members.
> > > > > 
> > > > > Oops, done, thanks.
> > > > > 
> > > > > > 
> > > > > > >         /** Maximum stack depth for passthrough backing files */
> > > > > > >         int max_stack_depth;
> > > > > > >
> > > > > > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > > > > > > index 29147657a99f..e48e11c3f9f3 100644
> > > > > > > --- a/fs/fuse/inode.c
> > > > > > > +++ b/fs/fuse/inode.c
> > > > > > > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> > > > > > >                         }
> > > > > > >                         if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> > > > > > >                                 fc->io_uring = 1;
> > > > > > > +                       if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > > > > > > +                           flags & FUSE_DAX_FMAP) {
> > > > > > > +                               /* XXX: Should also check that fuse server
> > > > > > > +                                * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > > > > > > +                                * since it is directing the kernel to access
> > > > > > > +                                * dax memory directly - but this function
> > > > > > > +                                * appears not to be called in fuse server
> > > > > > > +                                * process context (b/c even if it drops
> > > > > > > +                                * those capabilities, they are held here).
> > > > > > > +                                */
> > > > > > > +                               fc->famfs_iomap = 1;
> > > > > > > +                       }
> > > > > > 
> > > > > > 1. As long as the mapping requests are checking capabilities we should be ok
> > > > > >     Right?
> > > > > 
> > > > > It depends on the definition of "are", or maybe of "mapping requests" ;)
> > > > > 
> > > > > Forgive me if this *is* obvious, but the fuse server capabilities are what
> > > > > I think need to be checked here - not the app that is accessing a file.
> > > > > 
> > > > > An app accessing a regular file doesn't need permission to do raw access to
> > > > > the underlying block dev, but the fuse server does - because it is directing
> > > > > the kernel to access that for apps.
> > > > > 
> > > > > > 2. What's the deal with capable(CAP_SYS_ADMIN) in process_init_limits then?
> > > > > 
> > > > > I *think* that's checking the capabilities of the app that is accessing the
> > > > > file, and not the fuse server. But I might be wrong - I have not pulled very
> > > > > hard on that thread yet.
> > > > 
> > > > The init reply should be processed in the context of the fuse server.
> > > > At that point the kernel hasn't exposed the fs to user programs, so
> > > > (AFAICT) there won't be any other programs accessing that fuse mount.
> > > 
> > > Hmm. It would be good if you're right about that. My fuse server *is* running
> > > as root, and when I check those capabilities in process_init_reply(), I
> > > find those capabilities. So far so good.
> > > 
> > > Then I added code to my fuse server to drop those capabilities prior to
> > > starting the fuse session (prctl(PR_CAPBSET_DROP, CAP_SYS_RAWIO) and 
> > > prctl(PR_CAPBSET_DROP, CAP_SYS_ADMIN)). I expected (hoped?) to see those
> > > capabilities disappear in process_init_reply() - but they did not disappear.
> > > 
> > > I'm all ears if somebody can see a flaw in my logic here. Otherwise, the
> > > capabilities need to be stashed away before the reply is processed, when
> > > fs/fuse *is* running in fuse server context.
> > > 
> > > I'm somewhat surprised if that isn't already happening somewhere...
> > 
> > Hrm.  I *thought* that since FUSE_INIT isn't queued as a background
> > command, it should still execute in the same process context as the fuse
> > server.
> > 
> > OTOH it also occurs to me that I have this code in fuse_send_init:
> > 
> > 	if (has_capability_noaudit(current, CAP_SYS_RAWIO))
> > 		flags |= FUSE_IOMAP | FUSE_IOMAP_DIRECTIO | FUSE_IOMAP_PAGECACHE;
> > 	...
> > 	ia->in.flags = flags;
> > 	ia->in.flags2 = flags >> 32;
> > 
> > which means that we only advertise iomap support in FUSE_INIT if the
> > process running fuse_fill_super (which you hope is the fuse server)
> > actually has CAP_SYS_RAWIO.  Would that work for you?  Or are you
> > dropping privileges before you even open /dev/fuse?
> 
> Ah - that might be the answer. I will check if dropped capabilities 
> disappear in fuse_send_init. If so, I can work with that - not advertising 
> the famfs capability unless the capability is present at that point looks 
> like a perfectly good option. Thanks for that idea!

I thought of another twist -- what about a fuse server that runs with no
special privilege and is passed an open fd to a dax/block device?  Maybe
you're right that we need no explicit capability checks -- an open fd is
sufficient.

> > Note: I might decide to relax that approach later on, since iomap
> > requires you to have opened a block device ... which implies that the
> > process had read/write access to start with; and maybe we're ok with
> > unprivileged fuse2fs servers running on a chmod 666 block device?
> > 
> > <shrug> always easier to /relax/ the privilege checks. :)
> 
> My policy on security is that I'm against it...
> 
> > 
> > > > > > 3. Darrick mentioned the need for a synchronous INIT variant for his work on
> > > > > >     blockdev iomap support [1]
> > > > > 
> > > > > I'm not sure that's the same thing (Darrick?), but I do think Darrick's
> > > > > use case probably needs to check capabilities for a server that is sending
> > > > > apps (via files) off to access extents of block devices.
> > > > 
> > > > I don't know either, Miklos hasn't responded to my questions.  I think
> > > > the motivation for a synchronous 
> > > 
> > > ?
> > 
> > ..."I don't know what his motivations for synchronous FUSE_INIT are."
> > 
> > I guess I fubard vim. :(
> 
> So I'm not alone...
> 
> > 
> > > > As for fuse/iomap, I just only need to ask the kernel if iomap support
> > > > is available before calling ext2fs_open2() because the iomap question
> > > > has some implications for how we open the ext4 filesystem.
> > > > 
> > > > > > I also wonder how much of your patches and Darrick's patches end up
> > > > > > being an overlap?
> > > > > 
> > > > > Darrick and I spent some time hashing through this, and came to the conclusion
> > > > > that the actual overlap is slim-to-none. 
> > > > 
> > > > Yeah.  The neat thing about FMAPs is that you can establish repeating
> > > > patterns, which is useful for interleaved DRAM/pmem devices.  Disk
> > > > filesystems don't do repeating patterns, so they'd much rather manage
> > > > non-repeating mappings.
> > > 
> > > Right. Interleaving is critical to how we use memory, so fmaps are designed
> > > to support it.
> > > 
> > > Tangent: at some point a broader-than-just-me discussion of how block devices
> > > have the device mapper, but memory has no such layout tools, might be good
> > > to have. Without such a thing (which might or might not be possible/practical),
> > > it's essential that famfs do the interleaving. Lacking a mapper layer also
> > > means that we need dax to provide a clean "device abstraction" (meaning
> > > a single CXL allocation [which has a uuid/tag] needs to appear as a single
> > > dax device whether or not it's HPA-contiguous).
> > 
> > Well it's not as simple as device-mapper, where we can intercept struct
> > bio and remap/split it to our heart's content.  I guess you could do
> > that with an iovec...?  Would be sorta amusing if you could software
> > RAID10 some DRAM. :P
> 
> SW RAID, and mapper in general, has a "store and forward" property (or maybe
> "store, transmogrify, and forward") that doesn't really work for memory. 
> It's vma's (and files) that can remap memory address regions. Layered vma's 
> anyone? I need to think about whether that's utter nonsense, or just mostly 
> nonsense.

Oh but the ability to transmogrify is the key benefit of store and
forward!  Suppose you have to jack into some Klingon battle cruiser...

--D

> Continuing to think about this...
> 
> Thanks!
> John
> 
> 
> 
Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by John Groves 3 months ago
On 25/07/03 01:50PM, John Groves wrote:
> * FUSE_DAX_FMAP flag in INIT request/reply
> 
> * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
>   famfs-enabled connection
> 
> Signed-off-by: John Groves <john@groves.net>
> ---
>  fs/fuse/fuse_i.h          |  3 +++
>  fs/fuse/inode.c           | 14 ++++++++++++++
>  include/uapi/linux/fuse.h |  4 ++++
>  3 files changed, 21 insertions(+)
> 
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 9d87ac48d724..a592c1002861 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -873,6 +873,9 @@ struct fuse_conn {
>  	/* Use io_uring for communication */
>  	unsigned int io_uring;
>  
> +	/* dev_dax_iomap support for famfs */
> +	unsigned int famfs_iomap:1;
> +
>  	/** Maximum stack depth for passthrough backing files */
>  	int max_stack_depth;
>  
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 29147657a99f..e48e11c3f9f3 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
>  			}
>  			if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
>  				fc->io_uring = 1;
> +			if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> +			    flags & FUSE_DAX_FMAP) {
> +				/* XXX: Should also check that fuse server
> +				 * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> +				 * since it is directing the kernel to access
> +				 * dax memory directly - but this function
> +				 * appears not to be called in fuse server
> +				 * process context (b/c even if it drops
> +				 * those capabilities, they are held here).
> +				 */
> +				fc->famfs_iomap = 1;

I think there should be a check here that the fuse server is 
capable(CAP_SYS_RAWIO) (or maybe CAP_SYS_ADMIN), but this function doesn't 
run in fuse server context. A famfs fuse server is providing fmaps, which 
map files to devdax memory, which should not be an unprivileged operation.

1) Does fs/fuse already store the capabilities of the fuse server?
2) If not, where do you suggest I do that, and where do you suggest I store
that info? The only dead-obvious place (to me) that fs/fuse runs in server
context is in fuse_dev_open(), but it doesn't store anything...

@Miklos, I'd appreciate your advice here.

Thanks!
John
Re: [RFC V2 10/18] famfs_fuse: Basic fuse kernel ABI enablement for famfs
Posted by Darrick J. Wong 3 months ago
On Thu, Jul 03, 2025 at 05:45:48PM -0500, John Groves wrote:
> On 25/07/03 01:50PM, John Groves wrote:
> > * FUSE_DAX_FMAP flag in INIT request/reply
> > 
> > * fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
> >   famfs-enabled connection
> > 
> > Signed-off-by: John Groves <john@groves.net>
> > ---
> >  fs/fuse/fuse_i.h          |  3 +++
> >  fs/fuse/inode.c           | 14 ++++++++++++++
> >  include/uapi/linux/fuse.h |  4 ++++
> >  3 files changed, 21 insertions(+)
> > 
> > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> > index 9d87ac48d724..a592c1002861 100644
> > --- a/fs/fuse/fuse_i.h
> > +++ b/fs/fuse/fuse_i.h
> > @@ -873,6 +873,9 @@ struct fuse_conn {
> >  	/* Use io_uring for communication */
> >  	unsigned int io_uring;
> >  
> > +	/* dev_dax_iomap support for famfs */
> > +	unsigned int famfs_iomap:1;
> > +
> >  	/** Maximum stack depth for passthrough backing files */
> >  	int max_stack_depth;
> >  
> > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> > index 29147657a99f..e48e11c3f9f3 100644
> > --- a/fs/fuse/inode.c
> > +++ b/fs/fuse/inode.c
> > @@ -1392,6 +1392,18 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
> >  			}
> >  			if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
> >  				fc->io_uring = 1;
> > +			if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
> > +			    flags & FUSE_DAX_FMAP) {
> > +				/* XXX: Should also check that fuse server
> > +				 * has CAP_SYS_RAWIO and/or CAP_SYS_ADMIN,
> > +				 * since it is directing the kernel to access
> > +				 * dax memory directly - but this function
> > +				 * appears not to be called in fuse server
> > +				 * process context (b/c even if it drops
> > +				 * those capabilities, they are held here).
> > +				 */
> > +				fc->famfs_iomap = 1;
> 
> I think there should be a check here that the fuse server is 
> capable(CAP_SYS_RAWIO) (or maybe CAP_SYS_ADMIN), but this function doesn't 
> run in fuse server context. A famfs fuse server is providing fmaps, which 
> map files to devdax memory, which should not be an unprivileged operation.

I thought process_init_reply /does/ run in the fuse server's context.
It calls process_init_limits, which checks for capable(CAP_SYS_ADMIN)...

--D

> 1) Does fs/fuse already store the capabilities of the fuse server?
> 2) If not, where do you suggest I do that, and where do you suggest I store
> that info? The only dead-obvious place (to me) that fs/fuse runs in server
> context is in fuse_dev_open(), but it doesn't store anything...
> 
> @Miklos, I'd appreciate your advice here.
> 
> Thanks!
> John
> 
>