[PATCH v3 01/22] mm: Add msharefs filesystem

Anthony Yznaga posted 22 patches 1 month, 2 weeks ago
[PATCH v3 01/22] mm: Add msharefs filesystem
Posted by Anthony Yznaga 1 month, 2 weeks ago
From: Khalid Aziz <khalid@kernel.org>

Add a pseudo filesystem that contains files and page table sharing
information that enables processes to share page table entries.
This patch adds the basic filesystem that can be mounted, a
CONFIG_MSHARE option to enable the feature, and documentation.

Signed-off-by: Khalid Aziz <khalid@kernel.org>
Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
---
 Documentation/filesystems/index.rst    |  1 +
 Documentation/filesystems/msharefs.rst | 96 +++++++++++++++++++++++++
 include/uapi/linux/magic.h             |  1 +
 mm/Kconfig                             | 11 +++
 mm/Makefile                            |  4 ++
 mm/mshare.c                            | 97 ++++++++++++++++++++++++++
 6 files changed, 210 insertions(+)
 create mode 100644 Documentation/filesystems/msharefs.rst
 create mode 100644 mm/mshare.c

diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 11a599387266..dcd6605eb228 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -102,6 +102,7 @@ Documentation for filesystem implementations.
    fuse-passthrough
    inotify
    isofs
+   msharefs
    nilfs2
    nfs/index
    ntfs3
diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
new file mode 100644
index 000000000000..3e5b7d531821
--- /dev/null
+++ b/Documentation/filesystems/msharefs.rst
@@ -0,0 +1,96 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================================
+Msharefs - A filesystem to support shared page tables
+=====================================================
+
+What is msharefs?
+-----------------
+
+msharefs is a pseudo filesystem that allows multiple processes to
+share page table entries for shared pages. To enable support for
+msharefs the kernel must be compiled with CONFIG_MSHARE set.
+
+msharefs is typically mounted like this::
+
+	mount -t msharefs none /sys/fs/mshare
+
+A file created on msharefs creates a new shared region where all
+processes mapping that region will map it using shared page table
+entries. Once the size of the region has been established via
+ftruncate() or fallocate(), the region can be mapped into processes
+and ioctls used to map and unmap objects within it. Note that an
+msharefs file is a control file and accessing mapped objects within
+a shared region through read or write of the file is not permitted.
+
+How to use mshare
+-----------------
+
+Here are the basic steps for using mshare:
+
+  1. Mount msharefs on /sys/fs/mshare::
+
+	mount -t msharefs msharefs /sys/fs/mshare
+
+  2. mshare regions have alignment and size requirements: the start
+     address of a region must be aligned to a fixed boundary, and the
+     size of the region must be a multiple of that same value. The
+     value can be obtained by reading the file
+     ``/sys/fs/mshare/mshare_info``, which returns it as a number in
+     text format.
+
+  3. For the process creating an mshare region:
+
+    a. Create a file on /sys/fs/mshare, for example::
+
+        fd = open("/sys/fs/mshare/shareme",
+                        O_RDWR|O_CREAT|O_EXCL, 0600);
+
+    b. Establish the size of the region::
+
+        fallocate(fd, 0, 0, BUF_SIZE);
+
+      or::
+
+        ftruncate(fd, BUF_SIZE);
+
+    c. Map some memory in the region::
+
+	struct mshare_create mcreate;
+
+	mcreate.region_offset = 0;
+	mcreate.size = BUF_SIZE;
+	mcreate.offset = 0;
+	mcreate.prot = PROT_READ | PROT_WRITE;
+	mcreate.flags = MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED;
+	mcreate.fd = -1;
+
+	ioctl(fd, MSHAREFS_CREATE_MAPPING, &mcreate);
+
+    d. Map the mshare region into the process::
+
+	mmap(NULL, BUF_SIZE,
+		PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+    e. Read from and write to the mshare region normally.
+
+
+  4. For processes attaching an mshare region:
+
+    a. Open the msharefs file, for example::
+
+	fd = open("/sys/fs/mshare/shareme", O_RDWR);
+
+    b. Get the size of the mshare region from the file::
+
+        fstat(fd, &sb);
+        mshare_size = sb.st_size;
+
+    c. Map the mshare region into the process::
+
+	mmap(NULL, mshare_size,
+		PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+  5. To delete the mshare region::
+
+		unlink("/sys/fs/mshare/shareme");
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index bb575f3ab45e..e53dd6063cba 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -103,5 +103,6 @@
 #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
 #define PID_FS_MAGIC		0x50494446	/* "PIDF" */
+#define MSHARE_MAGIC		0x4d534852	/* "MSHR" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/mm/Kconfig b/mm/Kconfig
index 4108bcd96784..8b50e9785729 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1400,6 +1400,17 @@ config PT_RECLAIM
 config FIND_NORMAL_PAGE
 	def_bool n
 
+config MSHARE
+	bool "Mshare"
+	depends on MMU
+	help
+	  Enable msharefs: A pseudo filesystem that allows multiple processes
+	  to share kernel resources for mapping shared pages. A file created on
+	  msharefs represents a shared region where all processes mapping that
+	  region will map objects within it with shared page table entries and
+	  VMAs. Ioctls are used to configure and map objects into the shared
+	  region.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index ef54aa615d9d..4af111b29c68 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -48,6 +48,10 @@ ifdef CONFIG_64BIT
 mmu-$(CONFIG_MMU)	+= mseal.o
 endif
 
+ifdef CONFIG_MSHARE
+mmu-$(CONFIG_MMU)	+= mshare.o
+endif
+
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page-writeback.o folio-compat.o \
 			   readahead.o swap.o truncate.o vmscan.o shrinker.o \
diff --git a/mm/mshare.c b/mm/mshare.c
new file mode 100644
index 000000000000..f703af49ec81
--- /dev/null
+++ b/mm/mshare.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Enable cooperating processes to share page tables between
+ * them to reduce the extra memory consumed by multiple copies
+ * of page tables.
+ *
+ * This code adds an in-memory filesystem - msharefs.
+ * msharefs is used to manage page table sharing.
+ *
+ *
+ * Copyright (C) 2024 Oracle Corp. All rights reserved.
+ * Author:	Khalid Aziz <khalid@kernel.org>
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <uapi/linux/magic.h>
+
+static const struct file_operations msharefs_file_operations = {
+	.open			= simple_open,
+};
+
+static const struct super_operations mshare_s_ops = {
+	.statfs		= simple_statfs,
+};
+
+static int
+msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+	struct inode *inode;
+
+	sb->s_blocksize		= PAGE_SIZE;
+	sb->s_blocksize_bits	= PAGE_SHIFT;
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
+	sb->s_magic		= MSHARE_MAGIC;
+	sb->s_op		= &mshare_s_ops;
+	sb->s_time_gran		= 1;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return -ENOMEM;
+
+	inode->i_ino = 1;
+	inode->i_mode = S_IFDIR | 0777;
+	simple_inode_init_ts(inode);
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+	set_nlink(inode, 2);
+
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int
+msharefs_get_tree(struct fs_context *fc)
+{
+	return get_tree_nodev(fc, msharefs_fill_super);
+}
+
+static const struct fs_context_operations msharefs_context_ops = {
+	.get_tree	= msharefs_get_tree,
+};
+
+static int
+mshare_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &msharefs_context_ops;
+	return 0;
+}
+
+static struct file_system_type mshare_fs = {
+	.name			= "msharefs",
+	.init_fs_context	= mshare_init_fs_context,
+	.kill_sb		= kill_litter_super,
+};
+
+static int __init
+mshare_init(void)
+{
+	int ret;
+
+	ret = sysfs_create_mount_point(fs_kobj, "mshare");
+	if (ret)
+		return ret;
+
+	ret = register_filesystem(&mshare_fs);
+	if (ret)
+		sysfs_remove_mount_point(fs_kobj, "mshare");
+
+	return ret;
+}
+
+core_initcall(mshare_init);
-- 
2.47.1
Re: [PATCH v3 01/22] mm: Add msharefs filesystem
Posted by Pedro Falcato 3 weeks, 2 days ago
On Tue, Aug 19, 2025 at 06:03:54PM -0700, Anthony Yznaga wrote:
> From: Khalid Aziz <khalid@kernel.org>
> 
> Add a pseudo filesystem that contains files and page table sharing
> information that enables processes to share page table entries.
> This patch adds the basic filesystem that can be mounted, a
> CONFIG_MSHARE option to enable the feature, and documentation.
> 
> Signed-off-by: Khalid Aziz <khalid@kernel.org>
> Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
> ---
>  Documentation/filesystems/index.rst    |  1 +
>  Documentation/filesystems/msharefs.rst | 96 +++++++++++++++++++++++++
>  include/uapi/linux/magic.h             |  1 +
>  mm/Kconfig                             | 11 +++
>  mm/Makefile                            |  4 ++
>  mm/mshare.c                            | 97 ++++++++++++++++++++++++++
>  6 files changed, 210 insertions(+)
>  create mode 100644 Documentation/filesystems/msharefs.rst
>  create mode 100644 mm/mshare.c
> 
> diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
> index 11a599387266..dcd6605eb228 100644
> --- a/Documentation/filesystems/index.rst
> +++ b/Documentation/filesystems/index.rst
> @@ -102,6 +102,7 @@ Documentation for filesystem implementations.
>     fuse-passthrough
>     inotify
>     isofs
> +   msharefs
>     nilfs2
>     nfs/index
>     ntfs3
> diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
> new file mode 100644
> index 000000000000..3e5b7d531821
> --- /dev/null
> +++ b/Documentation/filesystems/msharefs.rst
> @@ -0,0 +1,96 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +=====================================================
> +Msharefs - A filesystem to support shared page tables
> +=====================================================
> +
> +What is msharefs?
> +-----------------
> +
> +msharefs is a pseudo filesystem that allows multiple processes to
> +share page table entries for shared pages. To enable support for
> +msharefs the kernel must be compiled with CONFIG_MSHARE set.
> +
> +msharefs is typically mounted like this::
> +
> +	mount -t msharefs none /sys/fs/mshare
> +
> +A file created on msharefs creates a new shared region where all
> +processes mapping that region will map it using shared page table
> +entries. Once the size of the region has been established via
> +ftruncate() or fallocate(), the region can be mapped into processes
> +and ioctls used to map and unmap objects within it. Note that an
> +msharefs file is a control file and accessing mapped objects within
> +a shared region through read or write of the file is not permitted.
> +

Welp. I really really don't like this API.
I assume this has been discussed previously, but why do we need a new
magical pseudofs mounted under some random /sys directory?

But, ok, assuming we're thinking about something hugetlbfs-like, that's not too
bad, and programs already know how to use it.

> +How to use mshare
> +-----------------
> +
> +Here are the basic steps for using mshare:
> +
> +  1. Mount msharefs on /sys/fs/mshare::
> +
> +	mount -t msharefs msharefs /sys/fs/mshare
> +
> +  2. mshare regions have alignment and size requirements. Start
> +     address for the region must be aligned to an address boundary and
> +     be a multiple of fixed size. This alignment and size requirement
> +     can be obtained by reading the file ``/sys/fs/mshare/mshare_info``
> +     which returns a number in text format. mshare regions must be
> +     aligned to this boundary and be a multiple of this size.
> +

I don't see why size and alignment need to be taken into consideration by
userspace. You can simply establish a mapping and pad it out.

> +  3. For the process creating an mshare region:
> +
> +    a. Create a file on /sys/fs/mshare, for example::
> +
> +        fd = open("/sys/fs/mshare/shareme",
> +                        O_RDWR|O_CREAT|O_EXCL, 0600);

Ok, makes sense.

> +
> +    b. Establish the size of the region::
> +
> +        fallocate(fd, 0, 0, BUF_SIZE);
> +
> +      or::
> +
> +        ftruncate(fd, BUF_SIZE);
> +

Yep.

> +    c. Map some memory in the region::
> +
> +	struct mshare_create mcreate;
> +
> +	mcreate.region_offset = 0;
> +	mcreate.size = BUF_SIZE;
> +	mcreate.offset = 0;
> +	mcreate.prot = PROT_READ | PROT_WRITE;
> +	mcreate.flags = MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED;
> +	mcreate.fd = -1;
> +
> +	ioctl(fd, MSHAREFS_CREATE_MAPPING, &mcreate);

Why?? Do you want to map mappings in msharefs files, that can themselves be
mapped? Why do we need an ioctl here?

Really, this feature seems very overengineered. If you want to go the fs route,
doing a new pseudofs that's just like hugetlb, but without the hugepages, sounds
like a decent idea. Or enhancing tmpfs to actually support this kind of stuff.
Or properly doing a syscall that can try to attach the page-table-sharing
property to random VMAs.

But I'm wholly opposed to the idea of "mapping a file that itself has more
mappings, mappings which you establish using a magic filesystem and ioctls".

-- 
Pedro
Re: [PATCH v3 01/22] mm: Add msharefs filesystem
Posted by David Hildenbrand 3 weeks, 2 days ago
On 10.09.25 14:14, Pedro Falcato wrote:
> On Tue, Aug 19, 2025 at 06:03:54PM -0700, Anthony Yznaga wrote:
>> From: Khalid Aziz <khalid@kernel.org>
>>
>> Add a pseudo filesystem that contains files and page table sharing
>> information that enables processes to share page table entries.
>> This patch adds the basic filesystem that can be mounted, a
>> CONFIG_MSHARE option to enable the feature, and documentation.
>>
>> Signed-off-by: Khalid Aziz <khalid@kernel.org>
>> Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
>> ---
>>   Documentation/filesystems/index.rst    |  1 +
>>   Documentation/filesystems/msharefs.rst | 96 +++++++++++++++++++++++++
>>   include/uapi/linux/magic.h             |  1 +
>>   mm/Kconfig                             | 11 +++
>>   mm/Makefile                            |  4 ++
>>   mm/mshare.c                            | 97 ++++++++++++++++++++++++++
>>   6 files changed, 210 insertions(+)
>>   create mode 100644 Documentation/filesystems/msharefs.rst
>>   create mode 100644 mm/mshare.c
>>
>> diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
>> index 11a599387266..dcd6605eb228 100644
>> --- a/Documentation/filesystems/index.rst
>> +++ b/Documentation/filesystems/index.rst
>> @@ -102,6 +102,7 @@ Documentation for filesystem implementations.
>>      fuse-passthrough
>>      inotify
>>      isofs
>> +   msharefs
>>      nilfs2
>>      nfs/index
>>      ntfs3
>> diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
>> new file mode 100644
>> index 000000000000..3e5b7d531821
>> --- /dev/null
>> +++ b/Documentation/filesystems/msharefs.rst
>> @@ -0,0 +1,96 @@
>> +.. SPDX-License-Identifier: GPL-2.0
>> +
>> +=====================================================
>> +Msharefs - A filesystem to support shared page tables
>> +=====================================================
>> +
>> +What is msharefs?
>> +-----------------
>> +
>> +msharefs is a pseudo filesystem that allows multiple processes to
>> +share page table entries for shared pages. To enable support for
>> +msharefs the kernel must be compiled with CONFIG_MSHARE set.
>> +
>> +msharefs is typically mounted like this::
>> +
>> +	mount -t msharefs none /sys/fs/mshare
>> +
>> +A file created on msharefs creates a new shared region where all
>> +processes mapping that region will map it using shared page table
>> +entries. Once the size of the region has been established via
>> +ftruncate() or fallocate(), the region can be mapped into processes
>> +and ioctls used to map and unmap objects within it. Note that an
>> +msharefs file is a control file and accessing mapped objects within
>> +a shared region through read or write of the file is not permitted.
>> +
> 
> Welp. I really really don't like this API.
> I assume this has been discussed previously, but why do we need a new
> magical pseudofs mounted under some random /sys directory?
> 
> But, ok, assuming we're thinking about something hugetlbfs like, that's not too
> bad, and programs already know how to use it.
> 
>> +How to use mshare
>> +-----------------
>> +
>> +Here are the basic steps for using mshare:
>> +
>> +  1. Mount msharefs on /sys/fs/mshare::
>> +
>> +	mount -t msharefs msharefs /sys/fs/mshare
>> +
>> +  2. mshare regions have alignment and size requirements. Start
>> +     address for the region must be aligned to an address boundary and
>> +     be a multiple of fixed size. This alignment and size requirement
>> +     can be obtained by reading the file ``/sys/fs/mshare/mshare_info``
>> +     which returns a number in text format. mshare regions must be
>> +     aligned to this boundary and be a multiple of this size.
>> +
> 
> I don't see why size and alignment needs to be taken into consideration by
> userspace. You can simply establish a mapping and pad it out.
> 
>> +  3. For the process creating an mshare region:
>> +
>> +    a. Create a file on /sys/fs/mshare, for example::
>> +
>> +        fd = open("/sys/fs/mshare/shareme",
>> +                        O_RDWR|O_CREAT|O_EXCL, 0600);
> 
> Ok, makes sense.
> 
>> +
>> +    b. Establish the size of the region::
>> +
>> +        fallocate(fd, 0, 0, BUF_SIZE);
>> +
>> +      or::
>> +
>> +        ftruncate(fd, BUF_SIZE);
>> +
> 
> Yep.
> 
>> +    c. Map some memory in the region::
>> +
>> +	struct mshare_create mcreate;
>> +
>> +	mcreate.region_offset = 0;
>> +	mcreate.size = BUF_SIZE;
>> +	mcreate.offset = 0;
>> +	mcreate.prot = PROT_READ | PROT_WRITE;
>> +	mcreate.flags = MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED;
>> +	mcreate.fd = -1;
>> +
>> +	ioctl(fd, MSHAREFS_CREATE_MAPPING, &mcreate);
> 
> Why?? Do you want to map mappings in msharefs files, that can themselves be
> mapped? Why do we need an ioctl here?
> 
> Really, this feature seems very overengineered. If you want to go the fs route,
> doing a new pseudofs that's just like hugetlb, but without the hugepages, sounds
> like a decent idea. Or enhancing tmpfs to actually support this kind of stuff.
> Or properly doing a syscall that can try to attach the page-table-sharing
> property to random VMAs.
> 
> But I'm wholly opposed to the idea of "mapping a file that itself has more
> mappings, mappings which you establish using a magic filesystem and ioctls".

I don't remember the history (it's been a while), but there was 
interest in:

(a) Sharing page tables for smaller files (not just PUD size etc.)

(b) Supporting also ordinary file systems, not just tmpfs

(c) Having a way to update protection of parts of a mapping and
     immediately have it visible to everyone mapping that area.

In the past, I raised that some VM use cases around virtio-fs would be 
interested in having a "VMA container" that can be updated by the parent 
QEMU process, and what gets mapped in there would be immediately visible 
to the other processes.

I recall that initially I pushed for just generalizing the support for 
shared page tables so it could be used for other file systems. I recall 
problems around that, likely around protection changes etc.

So current mshare really is the idea of having a (let's call it) VMA 
container that can be mapped into processes where all processes will 
observe changes performed by other processes.

I agree that it's complicated, and the semantics are very, very, very weird.

-- 
Cheers

David / dhildenb
Re: [PATCH v3 01/22] mm: Add msharefs filesystem
Posted by Liam R. Howlett 3 weeks, 4 days ago
* Anthony Yznaga <anthony.yznaga@oracle.com> [250819 21:04]:
> From: Khalid Aziz <khalid@kernel.org>
> 
> Add a pseudo filesystem that contains files and page table sharing
> information that enables processes to share page table entries.
> This patch adds the basic filesystem that can be mounted, a
> CONFIG_MSHARE option to enable the feature, and documentation.
> 
> Signed-off-by: Khalid Aziz <khalid@kernel.org>
> Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
> ---
>  Documentation/filesystems/index.rst    |  1 +
>  Documentation/filesystems/msharefs.rst | 96 +++++++++++++++++++++++++
>  include/uapi/linux/magic.h             |  1 +
>  mm/Kconfig                             | 11 +++
>  mm/Makefile                            |  4 ++
>  mm/mshare.c                            | 97 ++++++++++++++++++++++++++
>  6 files changed, 210 insertions(+)
>  create mode 100644 Documentation/filesystems/msharefs.rst
>  create mode 100644 mm/mshare.c
> 
> diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
> index 11a599387266..dcd6605eb228 100644
> --- a/Documentation/filesystems/index.rst
> +++ b/Documentation/filesystems/index.rst
> @@ -102,6 +102,7 @@ Documentation for filesystem implementations.
>     fuse-passthrough
>     inotify
>     isofs
> +   msharefs
>     nilfs2
>     nfs/index
>     ntfs3
> diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
> new file mode 100644
> index 000000000000..3e5b7d531821
> --- /dev/null
> +++ b/Documentation/filesystems/msharefs.rst
> @@ -0,0 +1,96 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +=====================================================
> +Msharefs - A filesystem to support shared page tables
> +=====================================================
> +
> +What is msharefs?
> +-----------------
> +
> +msharefs is a pseudo filesystem that allows multiple processes to
> +share page table entries for shared pages. To enable support for
> +msharefs the kernel must be compiled with CONFIG_MSHARE set.
> +
> +msharefs is typically mounted like this::
> +
> +	mount -t msharefs none /sys/fs/mshare
> +
> +A file created on msharefs creates a new shared region where all
> +processes mapping that region will map it using shared page table
> +entries. Once the size of the region has been established via
> +ftruncate() or fallocate(), the region can be mapped into processes
> +and ioctls used to map and unmap objects within it. Note that an
> +msharefs file is a control file and accessing mapped objects within
> +a shared region through read or write of the file is not permitted.
> +
> +How to use mshare
> +-----------------
> +
> +Here are the basic steps for using mshare:
> +
> +  1. Mount msharefs on /sys/fs/mshare::
> +
> +	mount -t msharefs msharefs /sys/fs/mshare
> +
> +  2. mshare regions have alignment and size requirements. Start
> +     address for the region must be aligned to an address boundary and
> +     be a multiple of fixed size. This alignment and size requirement
> +     can be obtained by reading the file ``/sys/fs/mshare/mshare_info``
> +     which returns a number in text format. mshare regions must be
> +     aligned to this boundary and be a multiple of this size.
> +
> +  3. For the process creating an mshare region:
> +
> +    a. Create a file on /sys/fs/mshare, for example::
> +
> +        fd = open("/sys/fs/mshare/shareme",
> +                        O_RDWR|O_CREAT|O_EXCL, 0600);
> +
> +    b. Establish the size of the region::
> +
> +        fallocate(fd, 0, 0, BUF_SIZE);
> +
> +      or::
> +
> +        ftruncate(fd, BUF_SIZE);
> +
> +    c. Map some memory in the region::
> +
> +	struct mshare_create mcreate;
> +
> +	mcreate.region_offset = 0;
> +	mcreate.size = BUF_SIZE;
> +	mcreate.offset = 0;
> +	mcreate.prot = PROT_READ | PROT_WRITE;
> +	mcreate.flags = MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED;
> +	mcreate.fd = -1;
> +
> +	ioctl(fd, MSHAREFS_CREATE_MAPPING, &mcreate);
> +
> +    d. Map the mshare region into the process::
> +
> +	mmap(NULL, BUF_SIZE,
> +		PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> +
> +    e. Write and read to mshared region normally.
> +
> +
> +  4. For processes attaching an mshare region:
> +
> +    a. Open the msharefs file, for example::
> +
> +	fd = open("/sys/fs/mshare/shareme", O_RDWR);
> +
> +    b. Get the size of the mshare region from the file::
> +
> +        fstat(fd, &sb);
> +        mshare_size = sb.st_size;
> +
> +    c. Map the mshare region into the process::
> +
> +	mmap(NULL, mshare_size,
> +		PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> +
> +  5. To delete the mshare region::
> +
> +		unlink("/sys/fs/mshare/shareme");
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index bb575f3ab45e..e53dd6063cba 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -103,5 +103,6 @@
>  #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
>  #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
>  #define PID_FS_MAGIC		0x50494446	/* "PIDF" */
> +#define MSHARE_MAGIC		0x4d534852	/* "MSHR" */
>  
>  #endif /* __LINUX_MAGIC_H__ */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 4108bcd96784..8b50e9785729 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1400,6 +1400,17 @@ config PT_RECLAIM
>  config FIND_NORMAL_PAGE
>  	def_bool n
>  
> +config MSHARE
> +	bool "Mshare"
> +	depends on MMU
> +	help
> +	  Enable msharefs: A pseudo filesystem that allows multiple processes
> +	  to share kernel resources for mapping shared pages. A file created on
> +	  msharefs represents a shared region where all processes mapping that
> +	  region will map objects within it with shared page table entries and
> +	  VMAs. Ioctls are used to configure and map objects into the shared
> +	  region.
> +
>  source "mm/damon/Kconfig"
>  
>  endmenu
> diff --git a/mm/Makefile b/mm/Makefile
> index ef54aa615d9d..4af111b29c68 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -48,6 +48,10 @@ ifdef CONFIG_64BIT
>  mmu-$(CONFIG_MMU)	+= mseal.o
>  endif
>  
> +ifdef CONFIG_MSHARE
> +mmu-$(CONFIG_MMU)	+= mshare.o
> +endif
> +
>  obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
>  			   maccess.o page-writeback.o folio-compat.o \
>  			   readahead.o swap.o truncate.o vmscan.o shrinker.o \
> diff --git a/mm/mshare.c b/mm/mshare.c
> new file mode 100644
> index 000000000000..f703af49ec81
> --- /dev/null
> +++ b/mm/mshare.c
> @@ -0,0 +1,97 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Enable cooperating processes to share page table between
> + * them to reduce the extra memory consumed by multiple copies
> + * of page tables.
> + *
> + * This code adds an in-memory filesystem - msharefs.
> + * msharefs is used to manage page table sharing
> + *
> + *
> + * Copyright (C) 2024 Oracle Corp. All rights reserved.
> + * Author:	Khalid Aziz <khalid@kernel.org>

Probably needs a new year or year range and another author?

> + *
> + */
> +
> +#include <linux/fs.h>
> +#include <linux/fs_context.h>
> +#include <uapi/linux/magic.h>
> +
> +static const struct file_operations msharefs_file_operations = {
> +	.open			= simple_open,
> +};
> +
> +static const struct super_operations mshare_s_ops = {
> +	.statfs		= simple_statfs,
> +};
> +
> +static int
> +msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
> +{
> +	struct inode *inode;
> +
> +	sb->s_blocksize		= PAGE_SIZE;
> +	sb->s_blocksize_bits	= PAGE_SHIFT;
> +	sb->s_maxbytes		= MAX_LFS_FILESIZE;
> +	sb->s_magic		= MSHARE_MAGIC;
> +	sb->s_op		= &mshare_s_ops;
> +	sb->s_time_gran		= 1;
> +
> +	inode = new_inode(sb);
> +	if (!inode)
> +		return -ENOMEM;
> +
> +	inode->i_ino = 1;
> +	inode->i_mode = S_IFDIR | 0777;
> +	simple_inode_init_ts(inode);
> +	inode->i_op = &simple_dir_inode_operations;
> +	inode->i_fop = &simple_dir_operations;
> +	set_nlink(inode, 2);
> +
> +	sb->s_root = d_make_root(inode);
> +	if (!sb->s_root)
> +		return -ENOMEM;

I don't know what the recovery path is here, but what happens to the
inode and its link count when d_make_root() fails?

> +
> +	return 0;
> +}
> +
> +static int
> +msharefs_get_tree(struct fs_context *fc)
> +{
> +	return get_tree_nodev(fc, msharefs_fill_super);
> +}
> +
> +static const struct fs_context_operations msharefs_context_ops = {
> +	.get_tree	= msharefs_get_tree,
> +};
> +
> +static int
> +mshare_init_fs_context(struct fs_context *fc)
> +{
> +	fc->ops = &msharefs_context_ops;
> +	return 0;
> +}
> +
> +static struct file_system_type mshare_fs = {
> +	.name			= "msharefs",
> +	.init_fs_context	= mshare_init_fs_context,
> +	.kill_sb		= kill_litter_super,
> +};
> +
> +static int __init
> +mshare_init(void)
> +{
> +	int ret;
> +
> +	ret = sysfs_create_mount_point(fs_kobj, "mshare");
> +	if (ret)
> +		return ret;
> +
> +	ret = register_filesystem(&mshare_fs);
> +	if (ret)
> +		sysfs_remove_mount_point(fs_kobj, "mshare");
> +
> +	return ret;
> +}
> +
> +core_initcall(mshare_init);
> -- 
> 2.47.1
>
Re: [PATCH v3 01/22] mm: Add msharefs filesystem
Posted by Anthony Yznaga 3 weeks, 4 days ago

On 9/8/25 11:29 AM, Liam R. Howlett wrote:
> * Anthony Yznaga <anthony.yznaga@oracle.com> [250819 21:04]:
>> From: Khalid Aziz <khalid@kernel.org>
>>
>> Add a pseudo filesystem that contains files and page table sharing
>> information that enables processes to share page table entries.
>> This patch adds the basic filesystem that can be mounted, a
>> CONFIG_MSHARE option to enable the feature, and documentation.
>>
>> Signed-off-by: Khalid Aziz <khalid@kernel.org>
>> Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
>> ---
>>   Documentation/filesystems/index.rst    |  1 +
>>   Documentation/filesystems/msharefs.rst | 96 +++++++++++++++++++++++++
>>   include/uapi/linux/magic.h             |  1 +
>>   mm/Kconfig                             | 11 +++
>>   mm/Makefile                            |  4 ++
>>   mm/mshare.c                            | 97 ++++++++++++++++++++++++++
>>   6 files changed, 210 insertions(+)
>>   create mode 100644 Documentation/filesystems/msharefs.rst
>>   create mode 100644 mm/mshare.c
>>
>> diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
>> index 11a599387266..dcd6605eb228 100644
>> --- a/Documentation/filesystems/index.rst
>> +++ b/Documentation/filesystems/index.rst
>> @@ -102,6 +102,7 @@ Documentation for filesystem implementations.
>>      fuse-passthrough
>>      inotify
>>      isofs
>> +   msharefs
>>      nilfs2
>>      nfs/index
>>      ntfs3
>> diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
>> new file mode 100644
>> index 000000000000..3e5b7d531821
>> --- /dev/null
>> +++ b/Documentation/filesystems/msharefs.rst
>> @@ -0,0 +1,96 @@
>> +.. SPDX-License-Identifier: GPL-2.0
>> +
>> +=====================================================
>> +Msharefs - A filesystem to support shared page tables
>> +=====================================================
>> +
>> +What is msharefs?
>> +-----------------
>> +
>> +msharefs is a pseudo filesystem that allows multiple processes to
>> +share page table entries for shared pages. To enable support for
>> +msharefs the kernel must be compiled with CONFIG_MSHARE set.
>> +
>> +msharefs is typically mounted like this::
>> +
>> +	mount -t msharefs none /sys/fs/mshare
>> +
>> +A file created on msharefs creates a new shared region where all
>> +processes mapping that region will map it using shared page table
>> +entries. Once the size of the region has been established via
>> +ftruncate() or fallocate(), the region can be mapped into processes
>> +and ioctls used to map and unmap objects within it. Note that an
>> +msharefs file is a control file and accessing mapped objects within
>> +a shared region through read or write of the file is not permitted.
>> +
>> +How to use mshare
>> +-----------------
>> +
>> +Here are the basic steps for using mshare:
>> +
>> +  1. Mount msharefs on /sys/fs/mshare::
>> +
>> +	mount -t msharefs msharefs /sys/fs/mshare
>> +
>> +  2. mshare regions have alignment and size requirements. The start
>> +     address of a region must be aligned to a fixed boundary, and the
>> +     size of the region must be a multiple of that same value. This
>> +     alignment and size requirement can be obtained by reading the file
>> +     ``/sys/fs/mshare/mshare_info``, which returns a number in text
>> +     format.
>> +
>> +  3. For the process creating an mshare region:
>> +
>> +    a. Create a file on /sys/fs/mshare, for example::
>> +
>> +        fd = open("/sys/fs/mshare/shareme",
>> +                        O_RDWR|O_CREAT|O_EXCL, 0600);
>> +
>> +    b. Establish the size of the region::
>> +
>> +        fallocate(fd, 0, 0, BUF_SIZE);
>> +
>> +      or::
>> +
>> +        ftruncate(fd, BUF_SIZE);
>> +
>> +    c. Map some memory in the region::
>> +
>> +	struct mshare_create mcreate;
>> +
>> +	mcreate.region_offset = 0;
>> +	mcreate.size = BUF_SIZE;
>> +	mcreate.offset = 0;
>> +	mcreate.prot = PROT_READ | PROT_WRITE;
>> +	mcreate.flags = MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED;
>> +	mcreate.fd = -1;
>> +
>> +	ioctl(fd, MSHAREFS_CREATE_MAPPING, &mcreate);
>> +
>> +    d. Map the mshare region into the process::
>> +
>> +	mmap(NULL, BUF_SIZE,
>> +		PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>> +
>> +    e. Read from and write to the mshare region normally.
>> +
>> +
>> +  4. For processes attaching an mshare region:
>> +
>> +    a. Open the msharefs file, for example::
>> +
>> +	fd = open("/sys/fs/mshare/shareme", O_RDWR);
>> +
>> +    b. Get the size of the mshare region from the file::
>> +
>> +        fstat(fd, &sb);
>> +        mshare_size = sb.st_size;
>> +
>> +    c. Map the mshare region into the process::
>> +
>> +	mmap(NULL, mshare_size,
>> +		PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>> +
>> +  5. To delete the mshare region::
>> +
>> +		unlink("/sys/fs/mshare/shareme");
>> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
>> index bb575f3ab45e..e53dd6063cba 100644
>> --- a/include/uapi/linux/magic.h
>> +++ b/include/uapi/linux/magic.h
>> @@ -103,5 +103,6 @@
>>   #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
>>   #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
>>   #define PID_FS_MAGIC		0x50494446	/* "PIDF" */
>> +#define MSHARE_MAGIC		0x4d534852	/* "MSHR" */
>>   
>>   #endif /* __LINUX_MAGIC_H__ */
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index 4108bcd96784..8b50e9785729 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -1400,6 +1400,17 @@ config PT_RECLAIM
>>   config FIND_NORMAL_PAGE
>>   	def_bool n
>>   
>> +config MSHARE
>> +	bool "Mshare"
>> +	depends on MMU
>> +	help
>> +	  Enable msharefs: A pseudo filesystem that allows multiple processes
>> +	  to share kernel resources for mapping shared pages. A file created on
>> +	  msharefs represents a shared region where all processes mapping that
>> +	  region will map objects within it with shared page table entries and
>> +	  VMAs. Ioctls are used to configure and map objects into the shared
>> +	  region.
>> +
>>   source "mm/damon/Kconfig"
>>   
>>   endmenu
>> diff --git a/mm/Makefile b/mm/Makefile
>> index ef54aa615d9d..4af111b29c68 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -48,6 +48,10 @@ ifdef CONFIG_64BIT
>>   mmu-$(CONFIG_MMU)	+= mseal.o
>>   endif
>>   
>> +ifdef CONFIG_MSHARE
>> +mmu-$(CONFIG_MMU)	+= mshare.o
>> +endif
>> +
>>   obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
>>   			   maccess.o page-writeback.o folio-compat.o \
>>   			   readahead.o swap.o truncate.o vmscan.o shrinker.o \
>> diff --git a/mm/mshare.c b/mm/mshare.c
>> new file mode 100644
>> index 000000000000..f703af49ec81
>> --- /dev/null
>> +++ b/mm/mshare.c
>> @@ -0,0 +1,97 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Enable cooperating processes to share page tables between
>> + * them to reduce the extra memory consumed by multiple copies
>> + * of page tables.
>> + *
>> + * This code adds an in-memory filesystem - msharefs.
>> + * msharefs is used to manage page table sharing
>> + *
>> + *
>> + * Copyright (C) 2024 Oracle Corp. All rights reserved.
>> + * Author:	Khalid Aziz <khalid@kernel.org>
> 
> Probably needs a new year or year range and another author?

Yes. I'll make sure the next series is updated.

> 
>> + *
>> + */
>> +
>> +#include <linux/fs.h>
>> +#include <linux/fs_context.h>
>> +#include <uapi/linux/magic.h>
>> +
>> +static const struct file_operations msharefs_file_operations = {
>> +	.open			= simple_open,
>> +};
>> +
>> +static const struct super_operations mshare_s_ops = {
>> +	.statfs		= simple_statfs,
>> +};
>> +
>> +static int
>> +msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
>> +{
>> +	struct inode *inode;
>> +
>> +	sb->s_blocksize		= PAGE_SIZE;
>> +	sb->s_blocksize_bits	= PAGE_SHIFT;
>> +	sb->s_maxbytes		= MAX_LFS_FILESIZE;
>> +	sb->s_magic		= MSHARE_MAGIC;
>> +	sb->s_op		= &mshare_s_ops;
>> +	sb->s_time_gran		= 1;
>> +
>> +	inode = new_inode(sb);
>> +	if (!inode)
>> +		return -ENOMEM;
>> +
>> +	inode->i_ino = 1;
>> +	inode->i_mode = S_IFDIR | 0777;
>> +	simple_inode_init_ts(inode);
>> +	inode->i_op = &simple_dir_inode_operations;
>> +	inode->i_fop = &simple_dir_operations;
>> +	set_nlink(inode, 2);
>> +
>> +	sb->s_root = d_make_root(inode);
>> +	if (!sb->s_root)
>> +		return -ENOMEM;
> 
> I don't know the recovery here, but what about inode and inode link
> count?

If d_make_root() returns NULL it will already have called iput(inode),
which drops the last reference and takes care of freeing the inode.

> 
>> +
>> +	return 0;
>> +}
>> +
>> +static int
>> +msharefs_get_tree(struct fs_context *fc)
>> +{
>> +	return get_tree_nodev(fc, msharefs_fill_super);
>> +}
>> +
>> +static const struct fs_context_operations msharefs_context_ops = {
>> +	.get_tree	= msharefs_get_tree,
>> +};
>> +
>> +static int
>> +mshare_init_fs_context(struct fs_context *fc)
>> +{
>> +	fc->ops = &msharefs_context_ops;
>> +	return 0;
>> +}
>> +
>> +static struct file_system_type mshare_fs = {
>> +	.name			= "msharefs",
>> +	.init_fs_context	= mshare_init_fs_context,
>> +	.kill_sb		= kill_litter_super,
>> +};
>> +
>> +static int __init
>> +mshare_init(void)
>> +{
>> +	int ret;
>> +
>> +	ret = sysfs_create_mount_point(fs_kobj, "mshare");
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = register_filesystem(&mshare_fs);
>> +	if (ret)
>> +		sysfs_remove_mount_point(fs_kobj, "mshare");
>> +
>> +	return ret;
>> +}
>> +
>> +core_initcall(mshare_init);
>> -- 
>> 2.47.1
>>