From: Hongzhen Luo <hongzhen@linux.alibaba.com>
Currently, reading files with different paths (or names) but identical
content consumes multiple copies of the page cache, even though the
cached content is the same. For example, reading identical files (e.g.,
*.so files) from two different minor versions of container images costs
multiple copies of the same page cache, since different containers have
different mount points. Therefore, sharing the page cache for files with
the same content can save memory.

This introduces the page cache sharing feature in erofs. It allocates a
deduplicated inode and uses its page cache as the shared cache. Reads
for files with identical content will ultimately be routed to the page
cache of the deduplicated inode. In this way, a single page cache
satisfies multiple read requests for different files with the same
content.

We introduce the inode_share mount option to enable the page cache
sharing mode at mount time.
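For example (image names, mount points and the domain name below are
illustrative only, and the images are assumed to carry the on-disk
ishare xattrs), two images mounted within the same domain can share
page cache for identical files:

  mount -t erofs -o inode_share,domain_id=d0 app_v1.erofs /mnt/v1
  mount -t erofs -o inode_share,domain_id=d0 app_v2.erofs /mnt/v2

Reading the same *.so file from /mnt/v1 and /mnt/v2 is then served
from a single shared page cache copy.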
Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
---
Documentation/filesystems/erofs.rst | 5 +
fs/erofs/Makefile | 1 +
fs/erofs/inode.c | 24 +----
fs/erofs/internal.h | 57 ++++++++++
fs/erofs/ishare.c | 161 ++++++++++++++++++++++++++++
fs/erofs/super.c | 56 +++++++++-
fs/erofs/xattr.c | 34 ++++++
fs/erofs/xattr.h | 3 +
8 files changed, 316 insertions(+), 25 deletions(-)
create mode 100644 fs/erofs/ishare.c
diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst
index 08194f194b94..27d3caa3c73c 100644
--- a/Documentation/filesystems/erofs.rst
+++ b/Documentation/filesystems/erofs.rst
@@ -128,7 +128,12 @@ device=%s Specify a path to an extra device to be used together.
fsid=%s Specify a filesystem image ID for Fscache back-end.
domain_id=%s Specify a domain ID in fscache mode so that different images
with the same blobs under a given domain ID can share storage.
+ Also used for inode page sharing mode which defines a sharing
+ domain.
fsoffset=%llu Specify block-aligned filesystem offset for the primary device.
+inode_share Enable inode page sharing for this filesystem. Inodes with
+ identical content within the same domain ID can share the
+ page cache.
=================== =========================================================
Sysfs Entries
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 549abc424763..a80e1762b607 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -10,3 +10,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
+erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += ishare.o
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index bce98c845a18..202cbbb4eada 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -203,7 +203,6 @@ static int erofs_read_inode(struct inode *inode)
static int erofs_fill_inode(struct inode *inode)
{
- struct erofs_inode *vi = EROFS_I(inode);
int err;
trace_erofs_fill_inode(inode);
@@ -235,28 +234,7 @@ static int erofs_fill_inode(struct inode *inode)
}
mapping_set_large_folios(inode->i_mapping);
- if (erofs_inode_is_data_compressed(vi->datalayout)) {
-#ifdef CONFIG_EROFS_FS_ZIP
- DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT,
- erofs_info, inode->i_sb,
- "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
- inode->i_mapping->a_ops = &z_erofs_aops;
-#else
- err = -EOPNOTSUPP;
-#endif
- } else {
- inode->i_mapping->a_ops = &erofs_aops;
-#ifdef CONFIG_EROFS_FS_ONDEMAND
- if (erofs_is_fscache_mode(inode->i_sb))
- inode->i_mapping->a_ops = &erofs_fscache_access_aops;
-#endif
-#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
- if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb)))
- inode->i_mapping->a_ops = &erofs_fileio_aops;
-#endif
- }
-
- return err;
+ return erofs_inode_set_aops(inode, inode, false);
}
/*
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index ec79e8b44d3b..15945e3308b8 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -179,6 +179,7 @@ struct erofs_sb_info {
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
#define EROFS_MOUNT_DIRECT_IO 0x00000100
+#define EROFS_MOUNT_INODE_SHARE 0x00000200
#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
@@ -269,6 +270,11 @@ static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
/* default readahead size of directories */
#define EROFS_DIR_RA_BYTES 16384
+struct erofs_inode_fingerprint {
+ u8 *opaque;
+ int size;
+};
+
struct erofs_inode {
erofs_nid_t nid;
@@ -304,6 +310,18 @@ struct erofs_inode {
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+ struct list_head ishare_list;
+ union {
+ /* for each anon shared inode */
+ struct {
+ struct erofs_inode_fingerprint fingerprint;
+ spinlock_t ishare_lock;
+ };
+ /* for each real inode */
+ struct inode *sharedinode;
+ };
+#endif
/* the corresponding vfs inode */
struct inode vfs_inode;
};
@@ -410,6 +428,7 @@ extern const struct inode_operations erofs_dir_iops;
extern const struct file_operations erofs_file_fops;
extern const struct file_operations erofs_dir_fops;
+extern const struct file_operations erofs_ishare_fops;
extern const struct iomap_ops z_erofs_iomap_report_ops;
@@ -455,6 +474,32 @@ static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count)
return NULL;
}
+static inline int erofs_inode_set_aops(struct inode *inode,
+ struct inode *realinode, bool no_fscache)
+{
+ if (erofs_inode_is_data_compressed(EROFS_I(realinode)->datalayout)) {
+#ifdef CONFIG_EROFS_FS_ZIP
+ DO_ONCE_LITE_IF(realinode->i_blkbits != PAGE_SHIFT,
+ erofs_info, realinode->i_sb,
+ "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
+ inode->i_mapping->a_ops = &z_erofs_aops;
+#else
+ return -EOPNOTSUPP;
+#endif
+ } else {
+ inode->i_mapping->a_ops = &erofs_aops;
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (!no_fscache && erofs_is_fscache_mode(realinode->i_sb))
+ inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+#endif
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (erofs_is_fileio_mode(EROFS_SB(realinode->i_sb)))
+ inode->i_mapping->a_ops = &erofs_fileio_aops;
+#endif
+ }
+ return 0;
+}
+
int erofs_register_sysfs(struct super_block *sb);
void erofs_unregister_sysfs(struct super_block *sb);
int __init erofs_init_sysfs(void);
@@ -541,6 +586,18 @@ static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) {
static inline void erofs_fscache_submit_bio(struct bio *bio) {}
#endif
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+int __init erofs_init_ishare(void);
+void erofs_exit_ishare(void);
+bool erofs_ishare_fill_inode(struct inode *inode);
+void erofs_ishare_free_inode(struct inode *inode);
+#else
+static inline int erofs_init_ishare(void) { return 0; }
+static inline void erofs_exit_ishare(void) {}
+static inline bool erofs_ishare_fill_inode(struct inode *inode) { return false; }
+static inline void erofs_ishare_free_inode(struct inode *inode) {}
+#endif
+
long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg);
diff --git a/fs/erofs/ishare.c b/fs/erofs/ishare.c
new file mode 100644
index 000000000000..6b710c935afb
--- /dev/null
+++ b/fs/erofs/ishare.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#include <linux/xxhash.h>
+#include <linux/mount.h>
+#include "internal.h"
+#include "xattr.h"
+
+#include "../internal.h"
+
+static struct vfsmount *erofs_ishare_mnt;
+
+static int erofs_ishare_iget5_eq(struct inode *inode, void *data)
+{
+ struct erofs_inode_fingerprint *fp1 = &EROFS_I(inode)->fingerprint;
+ struct erofs_inode_fingerprint *fp2 = data;
+
+ return fp1->size == fp2->size &&
+ !memcmp(fp1->opaque, fp2->opaque, fp2->size);
+}
+
+static int erofs_ishare_iget5_set(struct inode *inode, void *data)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ vi->fingerprint = *(struct erofs_inode_fingerprint *)data;
+ INIT_LIST_HEAD(&vi->ishare_list);
+ spin_lock_init(&vi->ishare_lock);
+ return 0;
+}
+
+bool erofs_ishare_fill_inode(struct inode *inode)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_inode_fingerprint fp;
+ struct inode *sharedinode;
+ unsigned long hash;
+
+ if (erofs_xattr_fill_inode_fingerprint(&fp, inode, sbi->domain_id))
+ return false;
+ hash = xxh32(fp.opaque, fp.size, 0);
+ sharedinode = iget5_locked(erofs_ishare_mnt->mnt_sb, hash,
+ erofs_ishare_iget5_eq, erofs_ishare_iget5_set,
+ &fp);
+ if (!sharedinode) {
+ kfree(fp.opaque);
+ return false;
+ }
+
+ if (inode_state_read_once(sharedinode) & I_NEW) {
+ if (erofs_inode_set_aops(sharedinode, inode, true)) {
+ iget_failed(sharedinode);
+ kfree(fp.opaque);
+ return false;
+ }
+ sharedinode->i_mode = vi->vfs_inode.i_mode;
+ sharedinode->i_size = vi->vfs_inode.i_size;
+ unlock_new_inode(sharedinode);
+ } else {
+ kfree(fp.opaque);
+ }
+ vi->sharedinode = sharedinode;
+ INIT_LIST_HEAD(&vi->ishare_list);
+ spin_lock(&EROFS_I(sharedinode)->ishare_lock);
+ list_add(&vi->ishare_list, &EROFS_I(sharedinode)->ishare_list);
+ spin_unlock(&EROFS_I(sharedinode)->ishare_lock);
+ return true;
+}
+
+void erofs_ishare_free_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct inode *sharedinode = vi->sharedinode;
+
+ if (!sharedinode)
+ return;
+ spin_lock(&EROFS_I(sharedinode)->ishare_lock);
+ list_del(&vi->ishare_list);
+ spin_unlock(&EROFS_I(sharedinode)->ishare_lock);
+ iput(sharedinode);
+ vi->sharedinode = NULL;
+}
+
+static int erofs_ishare_file_open(struct inode *inode, struct file *file)
+{
+ struct inode *sharedinode = EROFS_I(inode)->sharedinode;
+ struct file *realfile;
+
+ if (file->f_flags & O_DIRECT)
+ return -EINVAL;
+ realfile = alloc_empty_backing_file(O_RDONLY|O_NOATIME, current_cred());
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
+ ihold(sharedinode);
+ realfile->f_op = &erofs_file_fops;
+ realfile->f_inode = sharedinode;
+ realfile->f_mapping = sharedinode->i_mapping;
+ path_get(&file->f_path);
+ backing_file_set_user_path(realfile, &file->f_path);
+
+ file_ra_state_init(&realfile->f_ra, file->f_mapping);
+ realfile->private_data = EROFS_I(inode);
+ file->private_data = realfile;
+ return 0;
+}
+
+static int erofs_ishare_file_release(struct inode *inode, struct file *file)
+{
+ struct file *realfile = file->private_data;
+
+ iput(realfile->f_inode);
+ fput(realfile);
+ file->private_data = NULL;
+ return 0;
+}
+
+static ssize_t erofs_ishare_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct file *realfile = iocb->ki_filp->private_data;
+ struct kiocb dedup_iocb;
+ ssize_t nread;
+
+ if (!iov_iter_count(to))
+ return 0;
+ kiocb_clone(&dedup_iocb, iocb, realfile);
+ nread = filemap_read(&dedup_iocb, to, 0);
+ iocb->ki_pos = dedup_iocb.ki_pos;
+ return nread;
+}
+
+static int erofs_ishare_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct file *realfile = file->private_data;
+
+ vma_set_file(vma, realfile);
+ return generic_file_readonly_mmap(file, vma);
+}
+
+const struct file_operations erofs_ishare_fops = {
+ .open = erofs_ishare_file_open,
+ .llseek = generic_file_llseek,
+ .read_iter = erofs_ishare_file_read_iter,
+ .mmap = erofs_ishare_mmap,
+ .release = erofs_ishare_file_release,
+ .get_unmapped_area = thp_get_unmapped_area,
+ .splice_read = filemap_splice_read,
+};
+
+int __init erofs_init_ishare(void)
+{
+ erofs_ishare_mnt = kern_mount(&erofs_anon_fs_type);
+ return PTR_ERR_OR_ZERO(erofs_ishare_mnt);
+}
+
+void erofs_exit_ishare(void)
+{
+ kern_unmount(erofs_ishare_mnt);
+}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 960da62636ad..1f2b8732b29e 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -396,6 +396,7 @@ static void erofs_default_options(struct erofs_sb_info *sbi)
enum {
Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum,
Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset,
+ Opt_inode_share,
};
static const struct constant_table erofs_param_cache_strategy[] = {
@@ -423,6 +424,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_string("domain_id", Opt_domain_id),
fsparam_flag_no("directio", Opt_directio),
fsparam_u64("fsoffset", Opt_fsoffset),
+ fsparam_flag("inode_share", Opt_inode_share),
{}
};
@@ -551,6 +553,13 @@ static int erofs_fc_parse_param(struct fs_context *fc,
case Opt_fsoffset:
sbi->dif0.fsoff = result.uint_64;
break;
+ case Opt_inode_share:
+#if defined(CONFIG_EROFS_FS_PAGE_CACHE_SHARE)
+ set_opt(&sbi->opt, INODE_SHARE);
+#else
+ errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
+#endif
+ break;
}
return 0;
}
@@ -649,6 +658,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &erofs_sops;
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && test_opt(&sbi->opt, INODE_SHARE)) {
+ errorfc(fc, "FSDAX is not allowed when inode_share is on");
+ return -EINVAL;
+ }
+
sbi->blkszbits = PAGE_SHIFT;
if (!sb->s_bdev) {
/*
@@ -719,6 +733,12 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
erofs_info(sb, "unsupported blocksize for DAX");
clear_opt(&sbi->opt, DAX_ALWAYS);
}
+ if (test_opt(&sbi->opt, INODE_SHARE) && !erofs_sb_has_ishare_xattrs(sbi)) {
+ erofs_info(sb, "on-disk ishare xattrs not found. Turning off inode_share.");
+ clear_opt(&sbi->opt, INODE_SHARE);
+ }
+ if (test_opt(&sbi->opt, INODE_SHARE))
+ erofs_info(sb, "EXPERIMENTAL EROFS page cache share support in use. Use at your own risk!");
sb->s_time_gran = 1;
sb->s_xattr = erofs_xattr_handlers;
@@ -948,10 +968,32 @@ static struct file_system_type erofs_fs_type = {
};
MODULE_ALIAS_FS("erofs");
-#if defined(CONFIG_EROFS_FS_ONDEMAND)
+#if defined(CONFIG_EROFS_FS_ONDEMAND) || defined(CONFIG_EROFS_FS_PAGE_CACHE_SHARE)
+static void erofs_free_anon_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+ kfree(vi->fingerprint.opaque);
+#endif
+ kmem_cache_free(erofs_inode_cachep, vi);
+}
+
+static const struct super_operations erofs_anon_sops = {
+ .alloc_inode = erofs_alloc_inode,
+ .drop_inode = inode_just_drop,
+ .free_inode = erofs_free_anon_inode,
+};
+
static int erofs_anon_init_fs_context(struct fs_context *fc)
{
- return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, EROFS_SUPER_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->ops = &erofs_anon_sops;
+ return 0;
}
struct file_system_type erofs_anon_fs_type = {
@@ -986,6 +1028,10 @@ static int __init erofs_module_init(void)
if (err)
goto sysfs_err;
+ err = erofs_init_ishare();
+ if (err)
+ goto ishare_err;
+
err = register_filesystem(&erofs_fs_type);
if (err)
goto fs_err;
@@ -993,6 +1039,8 @@ static int __init erofs_module_init(void)
return 0;
fs_err:
+ erofs_exit_ishare();
+ishare_err:
erofs_exit_sysfs();
sysfs_err:
z_erofs_exit_subsystem();
@@ -1010,6 +1058,7 @@ static void __exit erofs_module_exit(void)
/* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
rcu_barrier();
+ erofs_exit_ishare();
erofs_exit_sysfs();
z_erofs_exit_subsystem();
erofs_exit_shrinker();
@@ -1062,6 +1111,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",domain_id=%s", sbi->domain_id);
if (sbi->dif0.fsoff)
seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff);
+ if (test_opt(opt, INODE_SHARE))
+ seq_puts(seq, ",inode_share");
return 0;
}
@@ -1072,6 +1123,7 @@ static void erofs_evict_inode(struct inode *inode)
dax_break_layout_final(inode);
#endif
+ erofs_ishare_free_inode(inode);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
}
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index ae61f20cb861..e1709059d3cc 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -577,3 +577,37 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
#endif
+
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+int erofs_xattr_fill_inode_fingerprint(struct erofs_inode_fingerprint *fp,
+ struct inode *inode, const char *domain_id)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+ struct erofs_xattr_prefix_item *prefix;
+ const char *infix;
+ int valuelen, base_index;
+
+ if (!test_opt(&sbi->opt, INODE_SHARE))
+ return -EOPNOTSUPP;
+ if (!sbi->xattr_prefixes)
+ return -EINVAL;
+ prefix = sbi->xattr_prefixes + sbi->ishare_xattr_prefix_id;
+ infix = prefix->prefix->infix;
+ base_index = prefix->prefix->base_index;
+ valuelen = erofs_getxattr(inode, base_index, infix, NULL, 0);
+ if (valuelen <= 0 || valuelen > (1 << sbi->blkszbits))
+ return -EFSCORRUPTED;
+ fp->size = valuelen + (domain_id ? strlen(domain_id) : 0);
+ fp->opaque = kmalloc(fp->size, GFP_KERNEL);
+ if (!fp->opaque)
+ return -ENOMEM;
+ if (valuelen != erofs_getxattr(inode, base_index, infix,
+ fp->opaque, valuelen)) {
+ kfree(fp->opaque);
+ fp->opaque = NULL;
+ return -EFSCORRUPTED;
+ }
+ memcpy(fp->opaque + valuelen, domain_id, fp->size - valuelen);
+ return 0;
+}
+#endif
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 6317caa8413e..bf75a580b8f1 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -67,4 +67,7 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu);
#define erofs_get_acl (NULL)
#endif
+int erofs_xattr_fill_inode_fingerprint(struct erofs_inode_fingerprint *fp,
+ struct inode *inode, const char *domain_id);
+
#endif
--
2.22.0
On 2026/1/16 17:55, Hongbo Li wrote:
> From: Hongzhen Luo <hongzhen@linux.alibaba.com>
>
> Currently, reading files with different paths (or names) but the same
> content will consume multiple copies of the page cache, even if the
> content of these page caches is the same. For example, reading
> identical files (e.g., *.so files) from two different minor versions of
> container images will cost multiple copies of the same page cache,
> since different containers have different mount points. Therefore,
> sharing the page cache for files with the same content can save memory.
>
> This introduces the page cache share feature in erofs. It allocate a
> deduplicated inode and use its page cache as shared. Reads for files
> with identical content will ultimately be routed to the page cache of
> the deduplicated inode. In this way, a single page cache satisfies
> multiple read requests for different files with the same contents.
>
> We introduce inode_share mount option to enable the page sharing mode
> during mounting.
>
> Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
> Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
> ---
> Documentation/filesystems/erofs.rst | 5 +
> fs/erofs/Makefile | 1 +
> fs/erofs/inode.c | 24 +----
> fs/erofs/internal.h | 57 ++++++++++
> fs/erofs/ishare.c | 161 ++++++++++++++++++++++++++++
> fs/erofs/super.c | 56 +++++++++-
> fs/erofs/xattr.c | 34 ++++++
> fs/erofs/xattr.h | 3 +
> 8 files changed, 316 insertions(+), 25 deletions(-)
> create mode 100644 fs/erofs/ishare.c
>
> diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst
> index 08194f194b94..27d3caa3c73c 100644
> --- a/Documentation/filesystems/erofs.rst
> +++ b/Documentation/filesystems/erofs.rst
> @@ -128,7 +128,12 @@ device=%s Specify a path to an extra device to be used together.
> fsid=%s Specify a filesystem image ID for Fscache back-end.
> domain_id=%s Specify a domain ID in fscache mode so that different images
> with the same blobs under a given domain ID can share storage.
> + Also used for inode page sharing mode which defines a sharing
> + domain.
I think either for the existing use or for the page cache sharing
here, `domain_id` should be protected as sensitive
information, so it'd be helpful to protect it in a
separate patch.
And change the description as below:
Specify a trusted domain ID for fscache mode so that
different images with the same blobs, identified by blob IDs,
can share storage within the same trusted domain.
Also used for different filesystems with inode page sharing
enabled to share page cache within the trusted domain.
> fsoffset=%llu Specify block-aligned filesystem offset for the primary device.
> +inode_share Enable inode page sharing for this filesystem. Inodes with
> + identical content within the same domain ID can share the
> + page cache.
> =================== =========================================================
...
> erofs_exit_shrinker();
> @@ -1062,6 +1111,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
> seq_printf(seq, ",domain_id=%s", sbi->domain_id);
I think we shouldn't show `domain_id` to the userspace
entirely.
Also, let's use kfree_sensitive() and no_free_ptr() to
replace the following snippet:

	case Opt_domain_id:
		kfree(sbi->domain_id);		-> kfree_sensitive()
		sbi->domain_id = kstrdup(param->string, GFP_KERNEL);
			-> sbi->domain_id = no_free_ptr(param->string);
		if (!sbi->domain_id)
			return -ENOMEM;
		break;

And replace with kfree_sensitive() for domain_id everywhere.
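(Putting both suggestions together, the parse case might end up roughly
like the following -- an untested sketch, assuming no_free_ptr() is used
here to take over ownership of param->string:)

	case Opt_domain_id:
		kfree_sensitive(sbi->domain_id);
		sbi->domain_id = no_free_ptr(param->string);
		if (!sbi->domain_id)
			return -ENOMEM;
		break;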
Thanks,
Gao Xiang
On 2026/1/20 22:19, Gao Xiang wrote:
> I think we shouldn't show `domain_id` to the userspace
> entirely.
>
> Also, let's use kfree_sensitive() and no_free_ptr() to
> replace the following snippet:
>
> 	case Opt_domain_id:
> 		kfree(sbi->domain_id);		-> kfree_sensitive()
> 		sbi->domain_id = kstrdup(param->string, GFP_KERNEL);
> 			-> sbi->domain_id = no_free_ptr(param->string);
> 		if (!sbi->domain_id)
> 			return -ENOMEM;
> 		break;
>
> And replace with kfree_sensitive() for domain_id everywhere.

Ok, kfree_sensitive()/no_free_ptr() looks good, this way makes
domain_id more reliable.

Thanks,
Hongbo
On 2026/1/20 22:19, Gao Xiang wrote:
>> @@ -1062,6 +1111,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
>>           seq_printf(seq, ",domain_id=%s", sbi->domain_id);
>
> I think we shouldn't show `domain_id` to the userspace
> entirely.

Maybe not bother with the deprecated fscache, just make sure `domain_id`
won't be shown in any form for the page cache sharing feature.
I don't really understand the fingerprint idea. Files with the
same content will point to the same physical disk blocks, so that
should be a much better indicator than a finger print? Also how does
the fingerprint guarantee uniqueness? Is it a cryptographically
secure hash? In here it just seems like an opaque blob.
> +static inline int erofs_inode_set_aops(struct inode *inode,
> + struct inode *realinode, bool no_fscache)
Factoring this out first would be a nice little prep patch.
Also it would probably be much cleaner using IS_ENABLED.
> +static int erofs_ishare_file_open(struct inode *inode, struct file *file)
> +{
> + struct inode *sharedinode = EROFS_I(inode)->sharedinode;
Ok, it looks like this allocates a separate backing file and inode.
On 2026/1/16 23:46, Christoph Hellwig wrote:
> I don't really understand the fingerprint idea. Files with the
> same content will point to the same physical disk blocks, so that
> should be a much better indicator than a finger print? Also how does
> the fingerprint guarantee uniqueness? Is it a cryptographically
> secure hash? In here it just seems like an opaque blob.
>
>> +static inline int erofs_inode_set_aops(struct inode *inode,
>> + struct inode *realinode, bool no_fscache)
>
> Factoring this out first would be a nice little prep patch.
> Also it would probably be much cleaner using IS_ENABLED.
Ok, Thanks for reviewing. I will refine in next version.
Thanks,
Hongbo
>
>> +static int erofs_ishare_file_open(struct inode *inode, struct file *file)
>> +{
>> + struct inode *sharedinode = EROFS_I(inode)->sharedinode;
>
> Ok, it looks like this allocates a separate backing file and inode.
>
On 2026/1/20 20:29, Hongbo Li wrote:
>
>
> On 2026/1/16 23:46, Christoph Hellwig wrote:
>> I don't really understand the fingerprint idea. Files with the
>> same content will point to the same physical disk blocks, so that
>> should be a much better indicator than a finger print? Also how does
>> the fingerprint guarantee uniqueness? Is it a cryptographically
>> secure hash? In here it just seems like an opaque blob.
>>
>>> +static inline int erofs_inode_set_aops(struct inode *inode,
>>> + struct inode *realinode, bool no_fscache)
>>
>> Factoring this out first would be a nice little prep patch.
>> Also it would probably be much cleaner using IS_ENABLED.
>
> Ok, Thanks for reviewing. I will refine in next version.
Sorry I overlooked this point. Factoring this out is a good idea, but we
cannot use IS_ENABLED here, because some aops are not visible when the
relevant config macro is not enabled. So I chose to keep this format
and only factor it out.
Thanks,
Hongbo
>
> Thanks,
> Hongbo
>
>>
>>> +static int erofs_ishare_file_open(struct inode *inode, struct file
>>> *file)
>>> +{
>>> + struct inode *sharedinode = EROFS_I(inode)->sharedinode;
>>
>> Ok, it looks like this allocates a separate backing file and inode.
>>
On Thu, Jan 22, 2026 at 10:48:27PM +0800, Hongbo Li wrote:
> Sorry I overlooked this point. Factoring this out is a good idea, but we
> cannot use IS_ENABLED here, because some aops are not visible when the
> relevant config macro is not enabled. So I chose to keep this format
> and only factor it out.

Is it?  If so just moving the extern outside the ifdef should be easy
enough, but from a quick grep I can't see any such case.
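(For illustration only, a rough IS_ENABLED() variant could look like the
following -- an untested sketch that assumes the various a_ops
declarations and mode helpers are visible in all configurations, which
is exactly the prep work that would be needed:)

static inline int erofs_inode_set_aops(struct inode *inode,
				       struct inode *realinode, bool no_fscache)
{
	if (erofs_inode_is_data_compressed(EROFS_I(realinode)->datalayout)) {
		if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP))
			return -EOPNOTSUPP;
		/* subpage compressed block warning elided for brevity */
		inode->i_mapping->a_ops = &z_erofs_aops;
		return 0;
	}
	inode->i_mapping->a_ops = &erofs_aops;
	if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !no_fscache &&
	    erofs_is_fscache_mode(realinode->i_sb))
		inode->i_mapping->a_ops = &erofs_fscache_access_aops;
	if (IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) &&
	    erofs_is_fileio_mode(EROFS_SB(realinode->i_sb)))
		inode->i_mapping->a_ops = &erofs_fileio_aops;
	return 0;
}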
Hi Christoph,
On 2026/1/16 23:46, Christoph Hellwig wrote:
> I don't really understand the fingerprint idea. Files with the
> same content will point to the same physical disk blocks, so that
> should be a much better indicator than a finger print? Also how does
Page cache sharing should apply to different EROFS
filesystem images on the same machine too, so the
physical disk block number idea cannot be applied
to this.
> the fingerprint guarantee uniqueness? Is it a cryptographically
> secure hash? In here it just seems like an opaque blob.
Yes, typically it can be a secure hash like sha256,
but it really depends on the users how to use it.
This feature is enabled _only_ when a dedicated mount
option is used, and should be enabled by privileged
mounters, and it's up to the privileged mounters to
guarantee the fingerprint is correct (usually guaranteed
by signatures from image builders since images will be
signed).
Also, different signatures can be isolated by domain
ids, so that data under different domain ids cannot be shared.
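(For reference, this is roughly how the current patch builds and
compares the sharing key; the hash only selects the inode hash bucket,
while a full byte comparison decides equality:)

	/*
	 * key    = <fingerprint xattr value> || <domain_id>
	 * bucket = xxh32(key, size, 0)   (hash hint for iget5_locked())
	 * equal  = same size && !memcmp(key1, key2, size)
	 */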
>
>> +static inline int erofs_inode_set_aops(struct inode *inode,
>> + struct inode *realinode, bool no_fscache)
>
> Factoring this out first would be a nice little prep patch.
> Also it would probably be much cleaner using IS_ENABLED.
>
>> +static int erofs_ishare_file_open(struct inode *inode, struct file *file)
>> +{
>> + struct inode *sharedinode = EROFS_I(inode)->sharedinode;
>
> Ok, it looks like this allocates a separate backing file and inode.
Yes.
Thanks,
Gao Xiang
On Sat, Jan 17, 2026 at 12:21:16AM +0800, Gao Xiang wrote:
> Hi Christoph,
>
> On 2026/1/16 23:46, Christoph Hellwig wrote:
>> I don't really understand the fingerprint idea.  Files with the
>> same content will point to the same physical disk blocks, so that
>> should be a much better indicator than a finger print?  Also how does
>
> Page cache sharing should apply to different EROFS
> filesystem images on the same machine too, so the
> physical disk block number idea cannot be applied
> to this.

Oh.  That's kinda unexpected and adds another twist to the whole scheme.
So in that case the on-disk data actually is duplicated in each image
and then de-duplicated in memory only?  Ewwww...
On 2026/1/19 15:29, Christoph Hellwig wrote:
> On Sat, Jan 17, 2026 at 12:21:16AM +0800, Gao Xiang wrote:
>> Page cache sharing should apply to different EROFS
>> filesystem images on the same machine too, so the
>> physical disk block number idea cannot be applied
>> to this.
>
> Oh.  That's kinda unexpected and adds another twist to the whole scheme.
> So in that case the on-disk data actually is duplicated in each image
> and then de-duplicated in memory only?  Ewwww...

On-disk deduplication is decoupled from this feature:

 - EROFS can share the same blocks in blobs (multiple devices) among
   different images, so that on-disk data can be shared by referring
   to the same blobs;

 - On-disk data won't be duplicated in an image if reflink is enabled
   for the backing fses; userspace mounters can trigger background GCs
   to deduplicate the identical blocks.

I just tried to say EROFS doesn't limit what the real meaning of
`fingerprint` is (they can be serialized integer numbers, for example,
defined by a specific image publisher, or a specific secure hash.
Currently, "mkfs.erofs" will generate sha256 for each file), but leaves
that to the image builders:

 1) if `fingerprint` is distributed as an on-disk part of signed images,
    as I said, it could be shared within a trusted domain_id (usually
    the same image builder) -- that is the top priority use case,
    together with dm-verity; or

 2) if `fingerprint` is not distributed in the image or the images are
    untrusted (e.g. unknown signatures), image fetchers can scan each
    inode in the golden images to generate an extra minimal EROFS
    metadata-only image with locally calculated `fingerprint`s too,
    which is much like the current ostree way (parse remote files and
    calculate digests).

Thanks,
Gao Xiang
On Mon, Jan 19, 2026 at 03:53:21PM +0800, Gao Xiang wrote:
> I just tried to say EROFS doesn't limit what the real meaning of
> `fingerprint` is (they can be serialized integer numbers, for example,
> defined by a specific image publisher, or a specific secure hash.
> Currently, "mkfs.erofs" will generate sha256 for each file), but leaves
> that to the image builders:

To me this sounds pretty scary, as we have code in the kernel's trust
domain that heavily depends on arbitrary userspace policy decisions.

Similarly the sharing of blocks between different file system
instances opens a lot of questions about trust boundaries and life
time rules.  I don't really have good answers, but writing up the
lifetime and threat models would really help.
On 2026/1/19 16:32, Christoph Hellwig wrote:
> On Mon, Jan 19, 2026 at 03:53:21PM +0800, Gao Xiang wrote:
>> I just tried to say EROFS doesn't limit what the real meaning of
>> `fingerprint` is (they can be serialized integer numbers, for example,
>> defined by a specific image publisher, or a specific secure hash.
>> Currently, "mkfs.erofs" will generate sha256 for each file), but leaves
>> that to the image builders:
>
> To me this sounds pretty scary, as we have code in the kernel's trust
> domain that heavily depends on arbitrary userspace policy decisions.

For example, overlayfs metacopy can also point to arbitrary files;
what's the difference between them?
https://docs.kernel.org/filesystems/overlayfs.html#metadata-only-copy-up

By using metacopy, overlayfs can access arbitrary files as long as the
metacopy has the pointer, so it should be a privileged thing, which is
similar to this feature.

> Similarly the sharing of blocks between different file system
> instances opens a lot of questions about trust boundaries and life
> time rules.  I don't really have good answers, but writing up the
> lifetime and threat models would really help.

Could you give more details about these?  You raised the questions,
but I have no idea where the threats really come from.

As for the lifetime: the blobs themselves are immutable files, so what
do the lifetime rules mean?

And how do you define trust boundaries?  You mean users have no right
to access the data?

I think it's similar: for blockdevice-based filesystems, you mount the
filesystem with a given source, and the mounter should have permission
to it.

For multiple-blob EROFS filesystems, you mount the filesystem with
multiple data sources, and the mounters should have permission to the
block devices and/or backing files too.

I don't quite get the point.

Thanks,
Gao Xiang
On Mon, Jan 19, 2026 at 04:52:54PM +0800, Gao Xiang wrote:
>> To me this sounds pretty scary, as we have code in the kernel's trust
>> domain that heavily depends on arbitrary userspace policy decisions.
>
> For example, overlayfs metacopy can also point to arbitrary files;
> what's the difference between them?
> https://docs.kernel.org/filesystems/overlayfs.html#metadata-only-copy-up
>
> By using metacopy, overlayfs can access arbitrary files as long as the
> metacopy has the pointer, so it should be a privileged thing, which is
> similar to this feature.

Sounds scary too.  But overlayfs' job is to combine underlying files, so
it is expected.  I think it's the mix of erofs being a disk based file
system, and reaching out beyond the device(s) assigned to the file system
instance that makes me feel rather uneasy.

>> Similarly the sharing of blocks between different file system
>> instances opens a lot of questions about trust boundaries and life
>> time rules.  I don't really have good answers, but writing up the
>
> Could you give more details about these?  You raised the questions,
> but I have no idea where the threats really come from.

Right now by default we don't allow any unprivileged mounts.  Now
if people think that say erofs is safe enough and opt into that, it
needs to be clear what the boundaries of that are.  For a file
system limited to a single block device those boundaries are
pretty clear.  For file systems reaching out to the entire system
(or some kind of domain), the scope is much wider.

> As for the lifetime: the blobs themselves are immutable files, so what
> do the lifetime rules mean?

What happens if the blob gets removed, intentionally or accidentally?

> And how do you define trust boundaries?  You mean users have no right
> to access the data?
>
> I think it's similar: for blockdevice-based filesystems, you mount the
> filesystem with a given source, and the mounter should have permission
> to it.

Yes.

> For multiple-blob EROFS filesystems, you mount the filesystem with
> multiple data sources, and the mounters should have permission to the
> block devices and/or backing files too.

And what prevents others from modifying them, or sneaking
unexpected data including unexpected comparison blobs in?
On 2026/1/19 17:22, Christoph Hellwig wrote:
> On Mon, Jan 19, 2026 at 04:52:54PM +0800, Gao Xiang wrote:
>> For example, overlayfs metacopy can also point to arbitrary files;
>> what's the difference between them?
>> https://docs.kernel.org/filesystems/overlayfs.html#metadata-only-copy-up
>>
>> By using metacopy, overlayfs can access arbitrary files as long as the
>> metacopy has the pointer, so it should be a privileged thing, which is
>> similar to this feature.
>
> Sounds scary too.  But overlayfs' job is to combine underlying files, so
> it is expected.  I think it's the mix of erofs being a disk based file

But you could still point to an arbitrary page cache if metacopy is used.

> system, and reaching out beyond the device(s) assigned to the file system
> instance that makes me feel rather uneasy.

You mean the page cache can be shared from other filesystems even if
they are not backed by these devices/files?

I admit yes, that is different: but that is why the new "inode_share"
mount option and the "domain_id" mount option are used.  I think the
filesystems should be regarded as a single super filesystem if
"domain_id" is the same: from the security perspective, much like
subvolumes of a single super filesystem.

And mounting a new filesystem within a "domain_id" can be regarded as
importing data into the super "domain_id" filesystem, and I think only
trusted data within the single domain can be mounted/shared.

> Right now by default we don't allow any unprivileged mounts.  Now
> if people think that say erofs is safe enough and opt into that, it
> needs to be clear what the boundaries of that are.  For a file
> system limited to a single block device those boundaries are
> pretty clear.  For file systems reaching out to the entire system
> (or some kind of domain), the scope is much wider.

Why do multiple devices differ for an immutable fs?  Any filesystem
instance cannot change the primary or external devices/blobs.  All data
is immutable.

>> As for the lifetime: the blobs themselves are immutable files, so what
>> do the lifetime rules mean?
>
> What happens if the blob gets removed, intentionally or accidentally?

The extra device/blob reference is held during the whole mount lifetime,
much like the primary (block) device.  And EROFS is an immutable
filesystem, so the inner blocks within the blob won't go away under
some fs instance either.

> And what prevents others from modifying them, or sneaking
> unexpected data including unexpected comparison blobs in?

I don't think it's different from filesystems with a single device.

First, EROFS instances never modify any underlay devices/blobs: if you
say some other program modifies the device data, yes, it can be changed
externally, but I think it's just like trusted FUSE daemons: an
untrusted FUSE daemon can return arbitrary (meta)data at random times
too.

Thanks,
Gao Xiang
Hi Christoph,
Sorry I didn't phrase things clearly earlier, but I'd still
like to explain the whole idea, as this feature is clearly
useful for containerization. I hope we can reach agreement
on the page cache sharing feature: Christian agreed on this
feature (and I hope still):
https://lore.kernel.org/linux-fsdevel/20260112-begreifbar-hasten-da396ac2759b@brauner
First, let's separate this feature from mounting in user
namespaces (i.e., unprivileged mounts), because this feature
is designed specifically for privileged mounts.
The EROFS page cache sharing feature stems from a current
limitation in the page cache: a file-based folio cannot be
shared across different inode mappings (or different page
indices within the same mapping; if this limitation
were resolved, we could implement a finer-grained page
cache sharing mechanism at the folio level). As you may
know, this patchset dates back to 2023, and as of 2026, I
still see no indication that the page cache infra will
change.
So let's face the reality: this feature introduces
on-disk xattrs called "fingerprints." --- Since they're
just xattrs, the EROFS on-disk format remains unchanged.
A new compat feature bit in the superblock indicates
whether an EROFS image contains such xattrs.
=====
In short: no on-disk format changes are required for
page cache sharing -- only xattrs attached to inodes
in the EROFS image.
Even if finer-grained page cache sharing is implemented
many many years later, existing images will remain
compatible, as we can simply ignore those xattrs.
=====
At runtime, the feature is explicitly enabled via a new
mount option: `inode_share`, which is intended only for
privileged mounters. A `domain_id` must also be specified
to define a trusted domain. This means:
- For regular EROFS mounts (without `inode_share`;
default), no page cache sharing happens for those
images;
- For mounts with `inode_share`, page cache sharing is
allowed only among mounts with the same `domain_id`.
The `domain_id` can be thought of as defining a federated
super-filesystem: the data behind a unique "fingerprint" (e.g.,
a secure hash or UUID) may come from any of the
participating filesystems, but there is only one page cache copy.
EROFS is an immutable, image-based golden filesystem: its
(meta)data is generated entirely in userspace. I consider
it a special class of disk filesystem, so traditional
assumptions about generic read-write filesystems don't
always apply; an image filesystem (especially for
containers) can also have unique features, driven by its
use cases, compared with typical local filesystems.
As for unprivileged mounts, that is another story (clearly
there are different features at least at runtime). First,
I think no one argues about whether mounting in userspace
is useful for containers: I do agree it should have a formal
written threat model in advance. While I'm not a security
expert per se, I'll draft one later separately.
My rough thoughts are:
- Let's not focus entirely on random human bugs,
because I think every practical subsystem has bugs;
the whole threat model focuses on the system design, and
less code doesn't mean anything by itself (it can still be
buggy or even have a system design flaw)
- EROFS only accesses the (meta)data from the source blobs
specified at mount time, even with multi-device support:
mount -t erofs -odevice=[blob],device=[blob],... [source]
An EROFS mount instance never accesses data beyond those
blobs. Moreover, EROFS holds reference counts on these
blobs for the entire lifetime of the mounted filesystem
(so even if a blob is deleted, blobs remain accessible as
orphan/deleted inodes).
- As a strictly immutable filesystem, EROFS never writes to
underlying blobs/devices and thus avoids complicated space
allocation, deallocation, reverse mapping or journaling
writeback consistency issues that writable filesystems
like ext4, XFS, or BTRFS have by design. However, that doesn't
mean EROFS cannot tolerate random (meta)data changes caused
by external users modifying blobs directly.
- External users can modify underlay blobs/devices only when
they have permission to the blobs/devices, so there is no
privilege escalation risk; I think "sneaking in
unexpected data" isn't meaningful here -- you need proper
permissions to alter the source blobs;
So then the only question is whether EROFS's on-disk design
can safely handle arbitrary (even fuzzed) external
modifications. I believe it can: because EROFS doesn't
have any redundant metadata, especially for space allocation,
reverse mapping and journalling like EXT4, XFS and BTRFS have.
Thus, it avoids the kinds of severe inconsistency bugs
seen in generic read-write filesystems; if you say corruption
or inconsistency, you should define the corruption. Almost
all severe inconsistency issues cannot be seen as inconsistencies
in the EROFS on-disk design itself, also see:
https://erofs.docs.kernel.org/en/latest/imagefs.html
- Of course, unprivileged kernel EROFS mounts should start
from a minimal core on-disk format, typically the following:
https://erofs.docs.kernel.org/en/latest/core_ondisk.html
I'll clarify this together with the full security model
later if this feature really gets developed;
- In the end, I don't think various wild non-technical
assumptions make any sense for forming the correct design
of unprivileged mounts. If a real security threat exists, it
should first have a potential attack path written in words
(even in theory), but I can't identify any practical one
based on the design in my mind.
All in all, I'm open to hearing and discussing any potential
threat or valid argument and finding the final answers, but I do
think we should keep the discussion technical rather
than purely about policy as in the previous related threads.
Thanks,
Gao Xiang
On Tue, Jan 20, 2026 at 11:07:48AM +0800, Gao Xiang wrote:
> Sorry I didn't phrase things clearly earlier, but I'd still
> like to explain the whole idea, as this feature is clearly
> useful for containerization. I hope we can reach agreement
> on the page cache sharing feature: Christian agreed on this
> feature (and I hope still):
>
> https://lore.kernel.org/linux-fsdevel/20260112-begreifbar-hasten-da396ac2759b@brauner

He has to ultimately decide.  I do have an uneasy feeling about this.
It's not super informed, as I can't fully keep up, and I'm not the one
in charge, but I hope it is helpful to share my perspective.

> First, let's separate this feature from mounting in user
> namespaces (i.e., unprivileged mounts), because this feature
> is designed specifically for privileged mounts.

Ok.

> The EROFS page cache sharing feature stems from a current
> limitation in the page cache: a file-based folio cannot be
> shared across different inode mappings (or different page
> indices within the same mapping; if this limitation
> were resolved, we could implement a finer-grained page
> cache sharing mechanism at the folio level). As you may
> know, this patchset dates back to 2023,

I didn't..

> and as of 2026, I
> still see no indication that the page cache infra will
> change.

It will be very hard to change unless we move to physical indexing of
the page cache, which has all kinds of downsides.

> So let's face the reality: this feature introduces
> on-disk xattrs called "fingerprints." --- Since they're
> just xattrs, the EROFS on-disk format remains unchanged.

I think the concept of using a backing file of some sort for the shared
pagecache (which I have no problem with at all), vs the imprecise
selection through a free form fingerprint are quite different aspects,
that could be easily separated.  I.e. one could easily imagine using
the data path approach based purely on exact file system metadata.
But that would of course not work with multiple images, which I think
is a key feature here if I'm reading between the lines correctly.

> - Let's not focus entirely on random human bugs,
> because I think every practical subsystem has bugs;
> the whole threat model focuses on the system design, and
> less code doesn't mean anything by itself (it can still be
> buggy or even have a system design flaw)

Yes, threats through malicious actors are much more interesting
here.

> - EROFS only accesses the (meta)data from the source blobs
> specified at mount time, even with multi-device support:
>
> mount -t erofs -odevice=[blob],device=[blob],... [source]

That is an important part that wasn't fully clear to me.
On Tue, Jan 20, 2026 at 07:52:42AM +0100, Christoph Hellwig wrote:
> On Tue, Jan 20, 2026 at 11:07:48AM +0800, Gao Xiang wrote:
>> Sorry I didn't phrase things clearly earlier, but I'd still
>> like to explain the whole idea, as this feature is clearly
>> useful for containerization. I hope we can reach agreement
>> on the page cache sharing feature: Christian agreed on this
>> feature (and I hope still):
>>
>> https://lore.kernel.org/linux-fsdevel/20260112-begreifbar-hasten-da396ac2759b@brauner
>
> He has to ultimately decide.  I do have an uneasy feeling about this.
> It's not super informed, as I can't fully keep up, and I'm not the one
> in charge, but I hope it is helpful to share my perspective.

It always is helpful, Christoph! I appreciate your input.

I'm fine with this feature. But as I've said in person: I still oppose
making any block-based filesystem mountable in unprivileged containers
without any sort of trust mechanism.

I am however open in the future for block devices protected by dm-verity
with the root hash signed by a sufficiently trusted key to be mountable
in unprivileged containers.
Hi Christian,

On 2026/1/20 21:40, Christian Brauner wrote:
> On Tue, Jan 20, 2026 at 07:52:42AM +0100, Christoph Hellwig wrote:
>> He has to ultimately decide.  I do have an uneasy feeling about this.
>> It's not super informed, as I can't fully keep up, and I'm not the one
>> in charge, but I hope it is helpful to share my perspective.
>
> It always is helpful, Christoph! I appreciate your input.

Thanks, I will raise some extra comments for Hongbo to address to make
this feature safer.

> I'm fine with this feature. But as I've said in person: I still oppose
> making any block-based filesystem mountable in unprivileged containers
> without any sort of trust mechanism.

Nevertheless, since Christoph put this topic on the community list, I
had to repeat my own latest thoughts on this on the list for reference.

Anyway, some people would just be nitpicky about the words above as a
policy: they will re-invent new non-block-based trick filesystems (but
with rather odd kernel-parsed metadata designs) for the kernel community.

Honestly, my own idea is that we should find real threats instead of
arbitrary assumptions against different types of filesystems.  The
original question is still what prevents _kernel filesystems with
kernel-parsed metadata_ from being mountable in unprivileged containers.

From my own perspective (in public, without any policy involved), I
think it would be better to get some fair technical points & concerns,
so that we either fully reach agreement that this is a real dead end,
or really overcome some barriers, since this feature is indeed useful.

I will not repeat my thoughts again to annoy folks even further on this
topic, but document them here for reference.

> I am however open in the future for block devices protected by dm-verity
> with the root hash signed by a sufficiently trusted key to be mountable
> in unprivileged containers.

Signed images will be a good start, I fully agree.  No one really argues
with that, and I believe I've mentioned the signed image idea in person
to Christoph and Darrick too.

Thanks,
Gao Xiang
Hi,

Thanks for the reply.

On 2026/1/20 14:52, Christoph Hellwig wrote:
> On Tue, Jan 20, 2026 at 11:07:48AM +0800, Gao Xiang wrote:
>>
>> Hi Christoph,
>>
>> Sorry I didn't phrase things clearly earlier, but I'd still
>> like to explain the whole idea, as this feature is clearly
>> useful for containerization. I hope we can reach agreement
>> on the page cache sharing feature: Christian agreed on this
>> feature (and I hope still):
>>
>> https://lore.kernel.org/linux-fsdevel/20260112-begreifbar-hasten-da396ac2759b@brauner
>
> He has to ultimately decide. I do have an uneasy feeling about this.
> It's not super informed, as I can't fully keep up, and I'm not the one
> in charge, but I hope it is helpful to share my perspective.
>
>> First, let's separate this feature from mounting in user
>> namespaces (i.e., unprivileged mounts), because this feature
>> is designed specifically for privileged mounts.
>
> Ok.
>
>> The EROFS page cache sharing feature stems from a current
>> limitation in the page cache: a file-based folio cannot be
>> shared across different inode mappings (or across different
>> page indexes within the same mapping; if this limitation
>> were resolved, we could implement a finer-grained page
>> cache sharing mechanism at the folio level). As you may
>> know, this patchset dates back to 2023,
>
> I didn't..
>
>> and as of 2026 I
>> still see no indication that the page cache infra will
>> change.
>
> It will be very hard to change unless we move to physical indexing of
> the page cache, which has all kinds of downsides.

I'm not sure that's really needed: I think the final
folio adaptation plan is that folios can be dynamically
allocated? Then why not keep multiple folios for one
piece of physical memory, since folios are not order-0 anymore.

Using physical indexing sounds really inflexible to me,
and it could even be regarded as a regression.

>
>> So let's face the reality: this feature introduces
>> on-disk xattrs called "fingerprints" --- since they're
>> just xattrs, the EROFS on-disk format remains unchanged.
>
> I think the concept of using a backing file of some sort for the shared
> pagecache (which I have no problem with at all), vs the imprecise

In that approach (which Jingbo actually worked on in 2023),
we have to keep the shared data physically contiguous and
even uncompressed, which cannot work for most cases.

On the other side, I do think the `fingerprint` design
is much like persistent NFS file handles in some aspects
(I don't want to equate the two concepts, but they are very
similar) for a single trusted domain: we have to
deal with multiple filesystem sources and mark them in a
unique way within a domain.

> selection through a free form fingerprint are quite different aspects,
> that could be easily separated. I.e. one could easily imagine using
> the data path approach based purely on exact file system metadata.
> But that would of course not work with multiple images, which I think
> is a key feature here if I'm reading between the lines correctly.

EROFS works with golden immutable images; in particular, remote
filesystem images can and will only be used without any modification.
So we have to deal with multiple filesystems on the same machine;
otherwise, _hardlinks_ within a single filesystem could resolve most
page cache sharing issues, but that is not our intention.
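Since the fingerprints mentioned above are carried as plain xattrs, one
way to picture them is as an extended attribute that could be read back
from a mounted image; a purely hypothetical inspection (the xattr name
below is made up, not the key actually defined by the patch) might look
like:

   # hypothetical xattr key, for illustration only
   getfattr -n trusted.erofs.fingerprint -e hex /mnt/image/usr/lib/libfoo.so

Whatever the exact key turns out to be, the value lives in the existing
xattr area, which is why the on-disk format itself stays unchanged.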
>
>> - Let's not focus entirely on random human bugs,
>> because I think every practical subsystem has bugs;
>> the whole threat model focuses on the system design, and
>> less code doesn't mean anything by itself (it can still be
>> buggy or even have a system design flaw).
>
> Yes, threats through malicious actors are much more interesting
> here.

Yes, otherwise we fall into endless, meaningless Rust and code-line
comparisons without any useful discussion of the real system design.

>
>> - EROFS only accesses the (meta)data from the source blobs
>> specified at mount time, even with multi-device support:
>>
>> mount -t erofs -odevice=[blob],device=[blob],... [source]
>
> That is an important part that wasn't fully clear to me.

Okay,

Thanks,
Gao Xiang
On Tue, Jan 20, 2026 at 03:19:21PM +0800, Gao Xiang wrote:
>> It will be very hard to change unless we move to physical indexing of
>> the page cache, which has all kinds of downsides.
>
> I'm not sure that's really needed: I think the final
> folio adaptation plan is that folios can be dynamically
> allocated? Then why not keep multiple folios for one
> piece of physical memory, since folios are not order-0 anymore.

Having multiple folios for the same piece of memory can't work,
as we'd have unsynchronized state.

> Using physical indexing sounds really inflexible to me,
> and it could even be regarded as a regression.

I'm absolutely not arguing for that..

>>> So let's face the reality: this feature introduces
>>> on-disk xattrs called "fingerprints" --- since they're
>>> just xattrs, the EROFS on-disk format remains unchanged.
>>
>> I think the concept of using a backing file of some sort for the shared
>> pagecache (which I have no problem with at all), vs the imprecise
>
> In that approach (which Jingbo actually worked on in 2023),
> we have to keep the shared data physically contiguous and
> even uncompressed, which cannot work for most cases.

Why does that matter?

> On the other side, I do think the `fingerprint` design
> is much like persistent NFS file handles in some aspects
> (I don't want to equate the two concepts, but they are very
> similar) for a single trusted domain: we have to
> deal with multiple filesystem sources and mark them in a
> unique way within a domain.

I don't really think they are similar in any way.
On 2026/1/22 16:33, Christoph Hellwig wrote:
> On Tue, Jan 20, 2026 at 03:19:21PM +0800, Gao Xiang wrote:
>>> It will be very hard to change unless we move to physical indexing of
>>> the page cache, which has all kinds of downsides.
>>
>> I'm not sure that's really needed: I think the final
>> folio adaptation plan is that folios can be dynamically
>> allocated? Then why not keep multiple folios for one
>> piece of physical memory, since folios are not order-0 anymore.
>
> Having multiple folios for the same piece of memory can't work,
> as we'd have unsynchronized state.

Why not just keep the unsynchronized state in one unique place,
and just keep mapping + indexing separated? Anyway, that is just
a wild thought; I will not dig into it.

>
>> Using physical indexing sounds really inflexible to me,
>> and it could even be regarded as a regression.
>
> I'm absolutely not arguing for that..
>
>>>> So let's face the reality: this feature introduces
>>>> on-disk xattrs called "fingerprints" --- since they're
>>>> just xattrs, the EROFS on-disk format remains unchanged.
>>>
>>> I think the concept of using a backing file of some sort for the shared
>>> pagecache (which I have no problem with at all), vs the imprecise
>>
>> In that approach (which Jingbo actually worked on in 2023),
>> we have to keep the shared data physically contiguous and
>> even uncompressed, which cannot work for most cases.
>
> Why does that matter?

Sorry then, I think I don't get the point, but we really
need this for complete page cache sharing on a single
physical machine.

>
>> On the other side, I do think the `fingerprint` design
>> is much like persistent NFS file handles in some aspects
>> (I don't want to equate the two concepts, but they are very
>> similar) for a single trusted domain: we have to
>> deal with multiple filesystem sources and mark them in a
>> unique way within a domain.
>
> I don't really think they are similar in any way.

Why are they not similar? You still need persistent IDs
in inodes across multiple fses; if there were
content-addressable immutable filesystems working on
inodes, they could just use inode hashes as file handles
instead of inode numbers + generations.

Thanks,
Gao Xiang
On Thu, Jan 22, 2026 at 04:40:56PM +0800, Gao Xiang wrote:
>> Having multiple folios for the same piece of memory can't work,
>> as we'd have unsynchronized state.
>
> Why not just keep the unsynchronized state in one unique place,
> and just keep mapping + indexing separated?

That would not just require allocating the folios dynamically, but most
importantly splitting it up. We'd then also need to find a way to chain
the folio_link structures from the main folio. I'm not going to say this
might not happen, but it feels very far out there and might have all
kinds of issues.

>>>> I think the concept of using a backing file of some sort for the shared
>>>> pagecache (which I have no problem with at all), vs the imprecise
>>>
>>> In that approach (which Jingbo actually worked on in 2023),
>>> we have to keep the shared data physically contiguous and
>>> even uncompressed, which cannot work for most cases.
>>
>> Why does that matter?
>
> Sorry then, I think I don't get the point, but we really
> need this for complete page cache sharing on a single
> physical machine.

Why do you need physically contiguous space to share it that way?

>>
>>> On the other side, I do think the `fingerprint` design
>>> is much like persistent NFS file handles in some aspects
>>> (I don't want to equate the two concepts, but they are very
>>> similar) for a single trusted domain: we have to
>>> deal with multiple filesystem sources and mark them in a
>>> unique way within a domain.
>>
>> I don't really think they are similar in any way.
>
> Why are they not similar? You still need persistent IDs
> in inodes across multiple fses; if there were
> content-addressable immutable filesystems working on
> inodes, they could just use inode hashes as file handles
> instead of inode numbers + generations.

Sure, if they are well defined, cryptographically secure hashes. But
that's different from file handles, which don't address content at all,
but are just a handle to a given file that bypasses the path lookup.

>
> Thanks,
> Gao Xiang
---end quoted text---
On 2026/1/23 13:39, Christoph Hellwig wrote:
> On Thu, Jan 22, 2026 at 04:40:56PM +0800, Gao Xiang wrote:
>>> Having multiple folios for the same piece of memory can't work,
>>> as we'd have unsynchronized state.
>>
>> Why not just keep the unsynchronized state in one unique place,
>> and just keep mapping + indexing separated?
>
> That would not just require allocating the folios dynamically, but most
> importantly splitting it up. We'd then also need to find a way to chain
> the folio_link structures from the main folio. I'm not going to say this
> might not happen, but it feels very far out there and might have all
> kinds of issues.

I can see a way, but at least I don't have the resources, and I'm not
even sure it will happen in the foreseeable future, so that is why we
will not wait for per-folio sharing anymore (memory is already becoming
$$$$$$..).

>
>>>>> I think the concept of using a backing file of some sort for the shared
>>>>> pagecache (which I have no problem with at all), vs the imprecise
>>>>
>>>> In that approach (which Jingbo actually worked on in 2023),
>>>> we have to keep the shared data physically contiguous and
>>>> even uncompressed, which cannot work for most cases.
>>>
>>> Why does that matter?
>>
>> Sorry then, I think I don't get the point, but we really
>> need this for complete page cache sharing on a single
>> physical machine.
>
> Why do you need physically contiguous space to share it that way?

Yes, it isn't strictly necessary, but the main goal is to share various
different filesystem images with consensus per-inode content-addressable
IDs, either secure hashes or per-inode UUIDs.

I still think it's very useful, considering that finer-grained page
cache sharing can only exist in our heads for now, so I will go on using
this approach for everyone to save memory (considering AI needs too much
memory and memory is becoming more expensive).

>
>>>
>>>> On the other side, I do think the `fingerprint` design
>>>> is much like persistent NFS file handles in some aspects
>>>> (I don't want to equate the two concepts, but they are very
>>>> similar) for a single trusted domain: we have to
>>>> deal with multiple filesystem sources and mark them in a
>>>> unique way within a domain.
>>>
>>> I don't really think they are similar in any way.
>>
>> Why are they not similar? You still need persistent IDs
>> in inodes across multiple fses; if there were
>> content-addressable immutable filesystems working on
>> inodes, they could just use inode hashes as file handles
>> instead of inode numbers + generations.
>
> Sure, if they are well defined, cryptographically secure hashes. But

EROFS is a golden image filesystem generated purely in userspace;
vendors will use secure hashes or per-vendor-generated per-inode UUIDs.

> that's different from file handles, which don't address content at all,
> but are just a handle to a given file that bypasses the path lookup.

I agree, so I once said _somewhat_ similar. Considering
content-addressable filesystems, of course they could use simplified
secure hashes as file handles in some form.

Thanks,
Gao Xiang

>
>>
>> Thanks,
>> Gao Xiang
> ---end quoted text---
On 2026/1/19 17:38, Gao Xiang wrote:
>
>
> On 2026/1/19 17:22, Christoph Hellwig wrote:
>> On Mon, Jan 19, 2026 at 04:52:54PM +0800, Gao Xiang wrote:
>>>> To me this sounds pretty scary, as we have code in the kernel's trust
>>>> domain that heavily depends on arbitrary userspace policy decisions.
>>>
>>> For example, overlayfs metacopy can also point to
>>> arbitrary files, so what's the difference between them?
>>> https://docs.kernel.org/filesystems/overlayfs.html#metadata-only-copy-up
>>>
>>> By using metacopy, overlayfs can access arbitrary files
>>> as long as the metacopy entry has the pointer, so it should
>>> be a privileged thing, which is similar to this feature.
>>
>> Sounds scary too. But overlayfs' job is to combine underlying files, so
>> it is expected. I think it's the mix of erofs being a disk based file
>
> But you could still point to an arbitrary page cache
> if metacopy is used.
>
>> system, and reaching out beyond the device(s) assigned to the file system
>> instance that makes me feel rather uneasy.
>
> You mean the page cache can be shared from other
> filesystems not even backed by these devices/files?
>
> I admit yes, that could be different: but that
> is why the new "inode_share" and "domain_id"
> mount options are used.
>
> I think the instances should be regarded as a single super
> filesystem if "domain_id" is the same: from the
> security perspective, much like subvolumes of
> a single super filesystem.
>
> And mounting a new filesystem within a "domain_id"
> can be regarded as importing data into the super
> "domain_id" filesystem, and I think only trusted
> data within the single domain can be mounted/shared.
>
>>
>>>>
>>>> Similarly the sharing of blocks between different file system
>>>> instances opens a lot of questions about trust boundaries and life
>>>> time rules. I don't really have good answers, but writing up the
>>>
>>> Could you give more details about these? Since you
>>> raised the questions, but I have no idea where the threats
>>> really come from.
>>
>> Right now by default we don't allow any unprivileged mounts. Now
>> if people think that, say, erofs is safe enough and opt into that,
>> it needs to be clear what the boundaries of that are. For a file
>> system limited to a single block device those boundaries are
>> pretty clear. For file systems reaching out to the entire system
>> (or some kind of domain), the scope is much wider.

btw, I think it's indeed helpful to spell out the boundaries (both for
the on-disk format and for runtime features). But I have to clarify
that a single EROFS filesystem instance won't have access to random
block devices or files. The backing devices or files are specified
explicitly by users when mounting, like:

   mount -odevice=blob1,device=blob2,...,device=blobn-1 blob0 mnt

And these devices / files are opened once at mount time, no more than
that.

May I ask what the difference is between one device/file and a group of
given devices/files? Especially for immutable usage.

Thanks,
Gao Xiang
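For illustration only, the mount-time scoping described in the message
above might look roughly like this in practice (the blob paths, image
names and the domain ID below are hypothetical, not taken from the
patch or the thread):

   # two privileged mounts opting into the same hypothetical sharing domain
   mount -t erofs -odevice=blob1,device=blob2,inode_share,domain_id=vendorA img0 /mnt/ctr-v1
   mount -t erofs -odevice=blob3,inode_share,domain_id=vendorA img1 /mnt/ctr-v2

With the same "domain_id", the two instances form one sharing domain in
the sense discussed above, while each instance still only opens the
devices/files it was explicitly given at mount time.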
On 2026/1/19 15:53, Gao Xiang wrote:
>
>
> On 2026/1/19 15:29, Christoph Hellwig wrote:
>> On Sat, Jan 17, 2026 at 12:21:16AM +0800, Gao Xiang wrote:
>>> Hi Christoph,
>>>
>>> On 2026/1/16 23:46, Christoph Hellwig wrote:
>>>> I don't really understand the fingerprint idea. Files with the
>>>> same content will point to the same physical disk blocks, so that
>>>> should be a much better indicator than a fingerprint? Also how does
>>>
>>> Page cache sharing should apply to different EROFS
>>> filesystem images on the same machine too, so the
>>> physical disk block number idea cannot be applied
>>> to this.
>>
>> Oh. That's kinda unexpected and adds another twist to the whole scheme.
>> So in that case the on-disk data actually is duplicated in each image
>> and then de-duplicated in memory only? Ewwww...
>
> On-disk deduplication is decoupled from this feature:
Of course, first of all:
- Data within a single EROFS image is deduplicated of
course (for example, erofs supports extent-based
chunks);
>
> - EROFS can share the same blocks in blobs (multiple
> devices) among different images, so that on-disk data
This way is like docker layers: common data/layers
can be kept in separate blobs;
> can be shared by referring to the same blobs;
Both deduplication approaches above will be applied to the
golden images which will be transferred on the wire.
>
> - On-disk data won't be deduplicated in the image if reflink
> is enabled for the backing fses; userspace mounters can
> trigger background GCs to deduplicate the identical
> blocks.
And this way is applied at runtime if the underlying
filesystem supports reflink.
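For that runtime reflink path, a minimal sketch (assuming the blobs sit
on a reflink-capable backing filesystem such as XFS or btrfs, that the
duperemove tool is available, and with a made-up path) could be:

   # dedupe identical blocks across blob files via the backing fs (illustration only)
   duperemove -dr /var/lib/containers/erofs-blobs/

i.e. the blocks may stay logically duplicated across blob files, but the
backing filesystem ends up storing identical blocks only once.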
>
> I just tried to say EROFS doesn't limit what the real
> meaning of `fingerprint` is (they can be serialized
> integer numbers, for example, defined by a specific image
> publisher, or a specific secure hash. Currently,
> "mkfs.erofs" will generate a sha256 for each file), but
> leaves that to the image builders:
>
>
> 1) if `fingerprint` is distributed as an on-disk part of
> signed images, as I said, it could be shared within a
> trusted domain_id (usually the same image builder) --
> that is the top-priority case, using dm-verity;
>
> Or
>
> 2) If `fingerprint` is not distributed in the image,
> or images are untrusted (e.g. unknown signatures),
> image fetchers can scan each inode in the golden
> images to generate an extra minimal EROFS
> metadata-only image with locally calculated
> `fingerprint`s too, which is much like the
> current ostree approach (parse remote files and calculate
> digests).
>
> Thanks,
> Gao Xiang
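
As a rough illustration of option 2) above, the local digest calculation
itself could be as simple as a content scan over the fetched tree (the
path and output layout below are made up; how the digests are then
packed into the minimal metadata-only EROFS image is left to the image
fetcher's own tooling):

   # compute a per-file sha256 digest over a fetched image tree (illustration only)
   find /path/to/fetched-image -type f -print0 | xargs -0 sha256sum > per-file.fingerprints

Either way, sharing still only happens within the trusted domain
configured at mount time, as discussed earlier in the thread.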