From nobody Thu Feb 12 20:14:56 2026 Received: from out30-113.freemail.mail.aliyun.com (out30-113.freemail.mail.aliyun.com [115.124.30.113]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3CE6842A93 for ; Sun, 5 Jan 2025 15:12:19 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.113 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1736089944; cv=none; b=OfJhsqoFIMnSxzqirTahQnOtXtt610mG7OuWH0leKbb8b5UsIqBhRLWet91qJ5N9HNFjl9hlX3c6uh7CISbImiBsqGDR4QUWlMoUW8NT91ubiMFUY1Bcsnx3f/+wCQZN1E9GKVEa7UOpUbLVWEUigW6byfszT7q+4RkOXiuLTR0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1736089944; c=relaxed/simple; bh=/jkur354J85qrY5YxVmk7+AUfXj4Mqtl6k5N2oDMkyQ=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=MCeM6MkM5yjXpSTJD5M7Ji1yswgwcIMtQbiBlx70KaAkBh8TpufIahs0mKoSQT1MHZPPLOiGewuLKRqbi9qDUOlpDluE0WQMVsx10lSwoPJkQTLS+ZI/kgn3TccdqW2QfiQdvdE/vE7lOxHzQjvSGunV/WVAIV5TP7wRzw5/PaE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=Ne4BuyVk; arc=none smtp.client-ip=115.124.30.113 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="Ne4BuyVk" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1736089931; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=7JzKr/Q1YXpJrtDxijOF4+BJ1RBdmdJhpP2voDpJsTU=; 
b=Ne4BuyVkwXmLzhtr4+oYU1se+6G/84ZLnpYHb6NNWTJDeNWXs/OvdpHFrrv9gWyzkrALT3KENxkAFU3MbXrbNlfK/mO8IHDSJCiicKFHXviOyK3mQErU3TdDcHH9dEodeyvSHqsDdNIaISdlfZa2yz2p//Ti89t0kilPVP1jdn8= Received: from localhost(mailfrom:hongzhen@linux.alibaba.com fp:SMTPD_---0WMyob0G_1736089930 cluster:ay36) by smtp.aliyun-inc.com; Sun, 05 Jan 2025 23:12:11 +0800 From: Hongzhen Luo To: linux-erofs@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org, Hongzhen Luo Subject: [RFC PATCH v5 1/4] erofs: move `struct erofs_anon_fs_type` to super.c Date: Sun, 5 Jan 2025 23:12:05 +0800 Message-ID: <20250105151208.3797385-2-hongzhen@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250105151208.3797385-1-hongzhen@linux.alibaba.com> References: <20250105151208.3797385-1-hongzhen@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Move the `struct erofs_anon_fs_type` to the super.c and expose it in preparation for the upcoming page cache share feature. Signed-off-by: Hongzhen Luo --- fs/erofs/fscache.c | 13 ------------- fs/erofs/internal.h | 2 ++ fs/erofs/super.c | 13 +++++++++++++ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index ce3d8737df85..ae7bd9ebff38 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -3,7 +3,6 @@ * Copyright (C) 2022, Alibaba Cloud * Copyright (C) 2022, Bytedance Inc. All rights reserved. */ -#include #include #include "internal.h" =20 @@ -13,18 +12,6 @@ static LIST_HEAD(erofs_domain_list); static LIST_HEAD(erofs_domain_cookies_list); static struct vfsmount *erofs_pseudo_mnt; =20 -static int erofs_anon_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 
0 : -ENOMEM; -} - -static struct file_system_type erofs_anon_fs_type =3D { - .owner =3D THIS_MODULE, - .name =3D "pseudo_erofs", - .init_fs_context =3D erofs_anon_init_fs_context, - .kill_sb =3D kill_anon_super, -}; - struct erofs_fscache_io { struct netfs_cache_resources cres; struct iov_iter iter; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 686d835eb533..47004eb89838 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -379,6 +379,8 @@ extern const struct file_operations erofs_dir_fops; =20 extern const struct iomap_ops z_erofs_iomap_report_ops; =20 +extern struct file_system_type erofs_anon_fs_type; + /* flags for erofs_fscache_register_cookie() */ #define EROFS_REG_COOKIE_SHARE 0x0001 #define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 1fc5623c3a4d..25d2c2b44d0a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "xattr.h" =20 #define CREATE_TRACE_POINTS @@ -852,6 +853,18 @@ static struct file_system_type erofs_fs_type =3D { }; MODULE_ALIAS_FS("erofs"); =20 +static int erofs_anon_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 
0 : -ENOMEM; +} + +struct file_system_type erofs_anon_fs_type =3D { + .owner =3D THIS_MODULE, + .name =3D "pseudo_erofs", + .init_fs_context =3D erofs_anon_init_fs_context, + .kill_sb =3D kill_anon_super, +}; + static int __init erofs_module_init(void) { int err; --=20 2.43.5 From nobody Thu Feb 12 20:14:56 2026 Received: from out30-113.freemail.mail.aliyun.com (out30-113.freemail.mail.aliyun.com [115.124.30.113]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7EE81481B3 for ; Sun, 5 Jan 2025 15:12:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.113 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1736089944; cv=none; b=mWOnGqV0e5qDhSUYB3KaYmsrsWEmDhpJJfI5JOVpd9WkzmovCCPDOv5m4+B7U4FINGB+WAFpZEVQAfqMQ4bHTAhPxEXgph6Lka7cv6yzP8NNtVfL2bMJVWm2ThIvO8eiRCMOUQa49wEM1tx9lLS/A0rKfFFYTN2mImdjgsMLKfI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1736089944; c=relaxed/simple; bh=V10STexruaNpYAPWPbHSl3VEXGSA0RdD3BhvK3XX9Ts=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=C8wSRkUIFKxd5iYcJuxXjsCKRBu8kQHgG4LjT9cg9yAl8JCc5KZoOSm3hpEV/DgHTsQ+qbzzFw1M5DW0YAID5SWK0oWl9Sw4TxRy+/+voD6UKf50fBuoNfgyE/kspjXEvsklR3rGBT/FXhLQC7ejEGhBHZwj7PQSWdkvfyem6ks= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=Z0js2Giz; arc=none smtp.client-ip=115.124.30.113 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com 
header.i=@linux.alibaba.com header.b="Z0js2Giz" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1736089933; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=xzy4wpwrY2Ed3xF90C4MLp1BMlntC8ejaDV8Ez8/VfM=; b=Z0js2GizyEYJXURgk/SDLqwRGu+bV65fa25Kxyzn+bxNqJAriQv4hYbqzVpGx+0F/1RDm6D2fCdm9m/el882TSm2je4IRDRjfxt42TA9vniNZM7OrBVejFSlDh7toudI6xmF2rgPdCvmvrgsDY5uoUtWbNeAuz9ZlsiAuBqMfxc= Received: from localhost(mailfrom:hongzhen@linux.alibaba.com fp:SMTPD_---0WMynS4e_1736089931 cluster:ay36) by smtp.aliyun-inc.com; Sun, 05 Jan 2025 23:12:12 +0800 From: Hongzhen Luo To: linux-erofs@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org, Hongzhen Luo Subject: [RFC PATCH v5 2/4] erofs: introduce the page cache share feature Date: Sun, 5 Jan 2025 23:12:06 +0800 Message-ID: <20250105151208.3797385-3-hongzhen@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250105151208.3797385-1-hongzhen@linux.alibaba.com> References: <20250105151208.3797385-1-hongzhen@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Currently, reading files with different paths (or names) but the same content will consume multiple copies of the page cache, even if the content of these page caches is the same. For example, reading identical files (e.g., *.so files) from two different minor versions of container images will cost multiple copies of the same page cache, since different containers have different mount points. Therefore, sharing the page cache for files with the same content can save memory. This introduces the page cache share feature in erofs. During the mkfs phase, the file content is hashed and the hash value is stored in the `trusted.erofs.fingerprint` extended attribute. 
Inodes of files with the same `trusted.erofs.fingerprint` are mapped to the same anonymous inode (indicated by the `ano_inode` field). When a read request occurs, the anonymous inode serves as a "container" whose page cache is shared. The actual iomap operations are still carried out by the original inode that is mapped to the anonymous inode. Signed-off-by: Hongzhen Luo --- fs/erofs/Kconfig | 10 ++ fs/erofs/Makefile | 1 + fs/erofs/internal.h | 4 + fs/erofs/pagecache_share.c | 228 +++++++++++++++++++++++++++++++++++++ fs/erofs/pagecache_share.h | 26 +++++ fs/erofs/super.c | 24 +++- 6 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 fs/erofs/pagecache_share.c create mode 100644 fs/erofs/pagecache_share.h diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 6ea60661fa55..3aa5f946b5f1 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -178,3 +178,13 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI at higher priority. =20 If unsure, say N. + +config EROFS_FS_PAGE_CACHE_SHARE + bool "EROFS page cache share support" + depends on EROFS_FS + default n + help + This permits EROFS to share page cache for files with same + fingerprints. + + If unsure, say N. 
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 4331d53c7109..d035c9063ef8 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -9,3 +9,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) +=3D decompressor_defl= ate.o erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) +=3D decompressor_zstd.o erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) +=3D fileio.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) +=3D fscache.o +erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) +=3D pagecache_share.o \ No newline at end of file diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 47004eb89838..6c87621d86ba 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -280,6 +280,9 @@ struct erofs_inode { }; #endif /* CONFIG_EROFS_FS_ZIP */ }; +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + struct inode *ano_inode; +#endif /* the corresponding vfs inode */ struct inode vfs_inode; }; @@ -376,6 +379,7 @@ extern const struct inode_operations erofs_dir_iops; =20 extern const struct file_operations erofs_file_fops; extern const struct file_operations erofs_dir_fops; +extern const struct file_operations erofs_pcshr_fops; =20 extern const struct iomap_ops z_erofs_iomap_report_ops; =20 diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c new file mode 100644 index 000000000000..703fd17c002c --- /dev/null +++ b/fs/erofs/pagecache_share.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#include +#include +#include +#include +#include "pagecache_share.h" +#include "internal.h" +#include "xattr.h" + +#define PCSHR_FPRT_IDX 4 +#define PCSHR_FPRT_NAME "erofs.fingerprint" +#define PCSHR_FPRT_MAXLEN (sizeof(size_t) + 1024) + +struct erofs_pcshr_counter { + struct mutex mutex; + struct kref ref; + struct vfsmount *mnt; +}; + +struct erofs_pcshr_private { + char fprt[PCSHR_FPRT_MAXLEN]; +}; + +static struct erofs_pcshr_counter mnt_counter =3D { + .mutex =3D __MUTEX_INITIALIZER(mnt_counter.mutex), + .mnt =3D NULL, +}; + +static void 
erofs_pcshr_counter_release(struct kref *ref) +{ + struct erofs_pcshr_counter *counter =3D container_of(ref, + struct erofs_pcshr_counter, ref); + + DBG_BUGON(!counter->mnt); + kern_unmount(counter->mnt); + counter->mnt =3D NULL; +} + +int erofs_pcshr_init_mnt(void) +{ + int ret; + struct vfsmount *tmp; + + mutex_lock(&mnt_counter.mutex); + if (!mnt_counter.mnt) { + tmp =3D kern_mount(&erofs_anon_fs_type); + if (IS_ERR(tmp)) { + ret =3D PTR_ERR(tmp); + goto out; + } + mnt_counter.mnt =3D tmp; + kref_init(&mnt_counter.ref); + } else + kref_get(&mnt_counter.ref); + ret =3D 0; +out: + mutex_unlock(&mnt_counter.mutex); + return ret; +} + +void erofs_pcshr_free_mnt(void) +{ + mutex_lock(&mnt_counter.mutex); + kref_put(&mnt_counter.ref, erofs_pcshr_counter_release); + mutex_unlock(&mnt_counter.mutex); +} + +static int erofs_fprt_eq(struct inode *inode, void *data) +{ + struct erofs_pcshr_private *ano_private =3D inode->i_private; + + return ano_private && memcmp(ano_private->fprt, data, + sizeof(size_t) + *(size_t *)data) =3D=3D 0 ? 
1 : 0; +} + +static int erofs_fprt_set(struct inode *inode, void *data) +{ + struct erofs_pcshr_private *ano_private; + + ano_private =3D kmalloc(sizeof(struct erofs_pcshr_private), GFP_KERNEL); + if (!ano_private) + return -ENOMEM; + memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data); + inode->i_private =3D ano_private; + return 0; +} + +int erofs_pcshr_fill_inode(struct inode *inode) +{ + struct erofs_inode *vi =3D EROFS_I(inode); + /* | fingerprint length | fingerprint content | */ + char fprt[PCSHR_FPRT_MAXLEN]; + struct inode *ano_inode; + unsigned long fprt_hash; + size_t fprt_len; + int ret =3D -1; + + vi->ano_inode =3D NULL; + memset(fprt, 0, sizeof(fprt)); + fprt_len =3D erofs_getxattr(inode, PCSHR_FPRT_IDX, PCSHR_FPRT_NAME, + fprt + sizeof(size_t), PCSHR_FPRT_MAXLEN); + if (fprt_len > 0 && fprt_len <=3D PCSHR_FPRT_MAXLEN) { + *(size_t *)fprt =3D fprt_len; + fprt_hash =3D xxh32(fprt + sizeof(size_t), fprt_len, 0); + ano_inode =3D iget5_locked(mnt_counter.mnt->mnt_sb, fprt_hash, + erofs_fprt_eq, erofs_fprt_set, fprt); + DBG_BUGON(!ano_inode); + vi->ano_inode =3D ano_inode; + if (ano_inode->i_state & I_NEW) { + if (erofs_inode_is_data_compressed(vi->datalayout)) + ano_inode->i_mapping->a_ops =3D &z_erofs_aops; + else + ano_inode->i_mapping->a_ops =3D &erofs_aops; + ano_inode->i_size =3D inode->i_size; + unlock_new_inode(ano_inode); + } + ret =3D 0; + } + return ret; +} + +void erofs_pcshr_free_inode(struct inode *inode) +{ + struct erofs_inode *vi =3D EROFS_I(inode); + + if (S_ISREG(inode->i_mode) && vi->ano_inode) { + iput(vi->ano_inode); + vi->ano_inode =3D NULL; + } +} + +static struct file *erofs_pcshr_alloc_file(struct file *file, + struct inode *ano_inode) +{ + struct file *ano_file; + + ano_file =3D alloc_file_pseudo(ano_inode, mnt_counter.mnt, + "[erofs_pcssh_f]", O_RDONLY, &erofs_file_fops); + if (IS_ERR(ano_file)) + return ano_file; + + file_ra_state_init(&ano_file->f_ra, file->f_mapping); + ano_file->private_data =3D 
EROFS_I(file_inode(file)); + return ano_file; +} + +static int erofs_pcshr_file_open(struct inode *inode, struct file *file) +{ + struct file *ano_file; + struct inode *ano_inode; + struct erofs_inode *vi =3D EROFS_I(inode); + + ano_inode =3D vi->ano_inode; + if (!ano_inode) + return -EINVAL; + + ano_file =3D erofs_pcshr_alloc_file(file, ano_inode); + if (IS_ERR(ano_file)) + return PTR_ERR(ano_file); + + ihold(ano_inode); + file->private_data =3D (void *)ano_file; + return 0; +} + +static int erofs_pcshr_file_release(struct inode *inode, struct file *file) +{ + if (!file->private_data) + return -EINVAL; + + fput((struct file *)file->private_data); + file->private_data =3D NULL; + return 0; +} + +static ssize_t erofs_pcshr_file_read_iter(struct kiocb *iocb, + struct iov_iter *to) +{ + struct inode __maybe_unused *inode =3D file_inode(iocb->ki_filp); + struct file *file, *ano_file; + struct kiocb ano_iocb; + ssize_t res; + + if (!iov_iter_count(to)) + return 0; +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return erofs_file_fops.read_iter(iocb, to); +#endif + if (iocb->ki_flags & IOCB_DIRECT) + return erofs_file_fops.read_iter(iocb, to); + + memcpy(&ano_iocb, iocb, sizeof(struct kiocb)); + file =3D iocb->ki_filp; + ano_file =3D file->private_data; + if (!ano_file) + return -EINVAL; + ano_iocb.ki_filp =3D ano_file; + res =3D filemap_read(&ano_iocb, to, 0); + memcpy(iocb, &ano_iocb, sizeof(struct kiocb)); + iocb->ki_filp =3D file; + file_accessed(file); + return res; +} + +extern const struct vm_operations_struct generic_file_vm_ops; + +static int erofs_pcshr_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *ano_file =3D file->private_data; + + vma_set_file(vma, ano_file); + vma->vm_ops =3D &generic_file_vm_ops; + return 0; +} + +const struct file_operations erofs_pcshr_fops =3D { + .open =3D erofs_pcshr_file_open, + .llseek =3D generic_file_llseek, + .read_iter =3D erofs_pcshr_file_read_iter, + .mmap =3D erofs_pcshr_mmap, + .release =3D 
erofs_pcshr_file_release, + .get_unmapped_area =3D thp_get_unmapped_area, + .splice_read =3D filemap_splice_read, +}; diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h new file mode 100644 index 000000000000..f3889d6889e5 --- /dev/null +++ b/fs/erofs/pagecache_share.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#ifndef __EROFS_PAGECACHE_SHARE_H +#define __EROFS_PAGECACHE_SHARE_H + +#include + +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + +int erofs_pcshr_init_mnt(void); +void erofs_pcshr_free_mnt(void); +int erofs_pcshr_fill_inode(struct inode *inode); +void erofs_pcshr_free_inode(struct inode *inode); + +#else + +static inline int erofs_pcshr_init_mnt(void) { return 0; } +static inline void erofs_pcshr_free_mnt(void) {} +static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1;= } +static inline void erofs_pcshr_free_inode(struct inode *inode) {} + +#endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE + +#endif diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 25d2c2b44d0a..b4ce07dc931c 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -853,9 +853,31 @@ static struct file_system_type erofs_fs_type =3D { }; MODULE_ALIAS_FS("erofs"); =20 +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE +static void erofs_free_anon_inode(struct inode *inode) +{ + kfree(inode->i_private); + inode->i_private =3D NULL; +} +#else +#define erofs_free_anon_inode NULL +#endif + +static const struct super_operations erofs_anon_sops =3D { + .statfs =3D simple_statfs, + .free_inode =3D erofs_free_anon_inode, +}; + + static int erofs_anon_init_fs_context(struct fs_context *fc) { - return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM; + struct pseudo_fs_context *ctx; + + ctx =3D init_pseudo(fc, EROFS_SUPER_MAGIC); + if (ctx) + ctx->ops =3D &erofs_anon_sops; + + return ctx ? 
0 : -ENOMEM; } =20 struct file_system_type erofs_anon_fs_type =3D { --=20 2.43.5 From nobody Thu Feb 12 20:14:56 2026 Received: from out30-124.freemail.mail.aliyun.com (out30-124.freemail.mail.aliyun.com [115.124.30.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A1DA77082E for ; Sun, 5 Jan 2025 15:12:22 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1736089945; cv=none; b=j1S/4aFGCL/xvfMibFnpVO3NprIrVW9va0Qx+fZbyuzclBQF/sBDLfW4ROKKj7PuQgDDiP4fXkE3/tKZrr9GBQexfvt5rvz0vOXrnXP2ObScSaqXzpMS6pGP2+WW0hdXilLBCCOvaE3Ros8uGiLJ3JvpMVWdN4MK26xkuopnyDE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1736089945; c=relaxed/simple; bh=PUy0gA8L3+9uMjQ6vFhqRHPKPuIQyhoPMq5koXpo6IE=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=HB3aLkxfaODyz1w7KS81JcA168SOK/leY0GT9K1ibD3qCVJ5QwqLpeT2r5zgPtj4WUps+suFJJjvzv7f0jLDWWjrdsoBaFenAOW3aelsuQSkR8WlDvVAQ5mvvhJxhgJX0Nx7tvsgOYZHv/iFwUJ7ezcpff4o4tyfEgVKQr04U0M= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=S6zihR0a; arc=none smtp.client-ip=115.124.30.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="S6zihR0a" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1736089934; h=From:To:Subject:Date:Message-ID:MIME-Version; 
bh=zcuKqis8GqEhVM+85u6GWnryNgXPPTu4+n1bBvKQiR8=; b=S6zihR0av8p+pe8JA7Uz0mcClBcFTswS+WBTsZ3DL+ZMviZoVIFIMLz+HDLXkToYOc4wrfXaBvHL3U3p6pvxvX8sKo+4X5XdlLMwlZLajC8FIgJngzgrE9/gZ/O3KugCzeQlewo2zbHXpU5BGW3xGWDrxsUPiUvzQaTdNHtK0Bk= Received: from localhost(mailfrom:hongzhen@linux.alibaba.com fp:SMTPD_---0WMypL8l_1736089932 cluster:ay36) by smtp.aliyun-inc.com; Sun, 05 Jan 2025 23:12:13 +0800 From: Hongzhen Luo To: linux-erofs@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org, Hongzhen Luo Subject: [RFC PATCH v5 3/4] erofs: apply the page cache share feature Date: Sun, 5 Jan 2025 23:12:07 +0800 Message-ID: <20250105151208.3797385-4-hongzhen@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250105151208.3797385-1-hongzhen@linux.alibaba.com> References: <20250105151208.3797385-1-hongzhen@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" This modifies relevant functions to apply the page cache share feature. 
Below is the memory usage for reading all files in two different minor versions of container images: +-------------------+------------------+-------------+---------------+ | Image | Page Cache Share | Memory (MB) | Memory | | | | | Reduction (%) | +-------------------+------------------+-------------+---------------+ | | No | 241 | - | | redis +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 163 | 33% | +-------------------+------------------+-------------+---------------+ | | No | 872 | - | | postgres +------------------+-------------+---------------+ | 16.1 & 16.2 | Yes | 630 | 28% | +-------------------+------------------+-------------+---------------+ | | No | 2771 | - | | tensorflow +------------------+-------------+---------------+ | 1.11.0 & 2.11.1 | Yes | 2340 | 16% | +-------------------+------------------+-------------+---------------+ | | No | 926 | - | | mysql +------------------+-------------+---------------+ | 8.0.11 & 8.0.12 | Yes | 735 | 21% | +-------------------+------------------+-------------+---------------+ | | No | 390 | - | | nginx +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 219 | 44% | +-------------------+------------------+-------------+---------------+ | tomcat | No | 924 | - | | 10.1.25 & 10.1.26 +------------------+-------------+---------------+ | | Yes | 474 | 49% | +-------------------+------------------+-------------+---------------+ Additionally, the table below shows the runtime memory usage of the container: +-------------------+------------------+-------------+---------------+ | Image | Page Cache Share | Memory (MB) | Memory | | | | | Reduction (%) | +-------------------+------------------+-------------+---------------+ | | No | 35 | - | | redis +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 28 | 20% | +-------------------+------------------+-------------+---------------+ | | No | 149 | - | | postgres 
+------------------+-------------+---------------+ | 16.1 & 16.2 | Yes | 95 | 37% | +-------------------+------------------+-------------+---------------+ | | No | 1028 | - | | tensorflow +------------------+-------------+---------------+ | 1.11.0 & 2.11.1 | Yes | 930 | 10% | +-------------------+------------------+-------------+---------------+ | | No | 155 | - | | mysql +------------------+-------------+---------------+ | 8.0.11 & 8.0.12 | Yes | 132 | 15% | +-------------------+------------------+-------------+---------------+ | | No | 25 | - | | nginx +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 20 | 20% | +-------------------+------------------+-------------+---------------+ | tomcat | No | 186 | - | | 10.1.25 & 10.1.26 +------------------+-------------+---------------+ | | Yes | 98 | 48% | +-------------------+------------------+-------------+---------------+ Signed-off-by: Hongzhen Luo --- fs/erofs/data.c | 14 +++++++-- fs/erofs/inode.c | 5 ++- fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++ fs/erofs/pagecache_share.h | 11 +++++++ fs/erofs/super.c | 7 +++++ fs/erofs/zdata.c | 9 ++++-- 6 files changed, 104 insertions(+), 5 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 0cd6b5c4df98..fb08acbeaab6 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -5,6 +5,7 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" +#include "pagecache_share.h" #include #include =20 @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_e= xtent_info *fieinfo, */ static int erofs_read_folio(struct file *file, struct folio *folio) { - return iomap_read_folio(folio, &erofs_iomap_ops); + int ret, pcshr; + + pcshr =3D erofs_pcshr_read_begin(file, folio); + ret =3D iomap_read_folio(folio, &erofs_iomap_ops); + erofs_pcshr_read_end(file, folio, pcshr); + return ret; } =20 static void erofs_readahead(struct readahead_control *rac) { - return iomap_readahead(rac, &erofs_iomap_ops); + int 
pcshr; + + pcshr =3D erofs_pcshr_readahead_begin(rac); + iomap_readahead(rac, &erofs_iomap_ops); + erofs_pcshr_readahead_end(rac, pcshr); } =20 static sector_t erofs_bmap(struct address_space *mapping, sector_t block) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d4b89407822a..0b070f4b46b8 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -5,6 +5,7 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" +#include "pagecache_share.h" #include =20 static int erofs_fill_symlink(struct inode *inode, void *kaddr, @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op =3D &erofs_generic_iops; - if (erofs_inode_is_data_compressed(vi->datalayout)) + if (erofs_pcshr_fill_inode(inode) =3D=3D 0) + inode->i_fop =3D &erofs_pcshr_fops; + else if (erofs_inode_is_data_compressed(vi->datalayout)) inode->i_fop =3D &generic_ro_fops; else inode->i_fop =3D &erofs_file_fops; diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c index 703fd17c002c..22172b5e21c7 100644 --- a/fs/erofs/pagecache_share.c +++ b/fs/erofs/pagecache_share.c @@ -22,6 +22,7 @@ struct erofs_pcshr_counter { =20 struct erofs_pcshr_private { char fprt[PCSHR_FPRT_MAXLEN]; + struct mutex mutex; }; =20 static struct erofs_pcshr_counter mnt_counter =3D { @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data) if (!ano_private) return -ENOMEM; memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data); + mutex_init(&ano_private->mutex); inode->i_private =3D ano_private; return 0; } @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops =3D { .get_unmapped_area =3D thp_get_unmapped_area, .splice_read =3D filemap_splice_read, }; + +int erofs_pcshr_read_begin(struct file *file, struct folio *folio) +{ + struct erofs_inode *vi; + struct erofs_pcshr_private *ano_private; + + if (!(file && file->private_data)) + return 0; + + vi =3D file->private_data; + if (vi->ano_inode !=3D 
file_inode(file)) + return 0; + + ano_private =3D vi->ano_inode->i_private; + mutex_lock(&ano_private->mutex); + folio->mapping->host =3D &vi->vfs_inode; + return 1; +} + +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcsh= r) +{ + struct erofs_pcshr_private *ano_private; + + if (pcshr =3D=3D 0) + return; + + ano_private =3D file_inode(file)->i_private; + folio->mapping->host =3D file_inode(file); + mutex_unlock(&ano_private->mutex); +} + +int erofs_pcshr_readahead_begin(struct readahead_control *rac) +{ + struct erofs_inode *vi; + struct file *file =3D rac->file; + struct erofs_pcshr_private *ano_private; + + if (!(file && file->private_data)) + return 0; + + vi =3D file->private_data; + if (vi->ano_inode !=3D file_inode(file)) + return 0; + + ano_private =3D file_inode(file)->i_private; + mutex_lock(&ano_private->mutex); + rac->mapping->host =3D &vi->vfs_inode; + return 1; +} + +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) +{ + struct erofs_pcshr_private *ano_private; + + if (pcshr =3D=3D 0) + return; + + ano_private =3D file_inode(rac->file)->i_private; + rac->mapping->host =3D file_inode(rac->file); + mutex_unlock(&ano_private->mutex); +} diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h index f3889d6889e5..abda2a60278b 100644 --- a/fs/erofs/pagecache_share.h +++ b/fs/erofs/pagecache_share.h @@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void); int erofs_pcshr_fill_inode(struct inode *inode); void erofs_pcshr_free_inode(struct inode *inode); =20 +/* switch between the anonymous inode and the real inode */ +int erofs_pcshr_read_begin(struct file *file, struct folio *folio); +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcsh= r); +int erofs_pcshr_readahead_begin(struct readahead_control *rac); +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr); + #else =20 static inline int erofs_pcshr_init_mnt(void) { return 0; } @@ -21,6 +27,11 @@ static inline 
void erofs_pcshr_free_mnt(void) {} static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1;= } static inline void erofs_pcshr_free_inode(struct inode *inode) {} =20 +static inline int erofs_pcshr_read_begin(struct file *file, struct folio *= folio) { return 0; } +static inline void erofs_pcshr_read_end(struct file *file, struct folio *f= olio, int pcshr) {} +static inline int erofs_pcshr_readahead_begin(struct readahead_control *ra= c) { return 0; } +static inline void erofs_pcshr_readahead_end(struct readahead_control *rac= , int pcshr) {} + #endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE =20 #endif diff --git a/fs/erofs/super.c b/fs/erofs/super.c index b4ce07dc931c..1b690eb6c1f1 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -13,6 +13,7 @@ #include #include #include "xattr.h" +#include "pagecache_share.h" =20 #define CREATE_TRACE_POINTS #include @@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi =3D EROFS_I(inode); =20 + erofs_pcshr_free_inode(inode); if (inode->i_op =3D=3D &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); @@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct super_block *sb,= struct fs_context *fc) if (err) return err; =20 + err =3D erofs_pcshr_init_mnt(); + if (err) + return err; + erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid); return 0; } @@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb) kill_anon_super(sb); else kill_block_super(sb); + erofs_pcshr_free_mnt(); fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 19ef4ff2a134..fc2ed01eaabe 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -5,6 +5,7 @@ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" +#include "pagecache_share.h" #include #include #include @@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file *file, str= uct folio *folio) 
{ struct inode *const inode =3D folio->mapping->host; struct z_erofs_decompress_frontend f =3D DECOMPRESS_FRONTEND_INIT(inode); - int err; + int err, pcshr; =20 trace_erofs_read_folio(folio, false); + pcshr =3D erofs_pcshr_read_begin(file, folio); f.headoffset =3D (erofs_off_t)folio->index << PAGE_SHIFT; =20 z_erofs_pcluster_readmore(&f, NULL, true); @@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file *file, stru= ct folio *folio) =20 erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); + erofs_pcshr_read_end(file, folio, pcshr); return err; } =20 @@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct readahead_contro= l *rac) struct z_erofs_decompress_frontend f =3D DECOMPRESS_FRONTEND_INIT(inode); struct folio *head =3D NULL, *folio; unsigned int nr_folios; - int err; + int err, pcshr; =20 + pcshr =3D erofs_pcshr_readahead_begin(rac); f.headoffset =3D readahead_pos(rac); =20 z_erofs_pcluster_readmore(&f, rac, true); @@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct readahead_contro= l *rac) (void)z_erofs_runqueue(&f, nr_folios); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); + erofs_pcshr_readahead_end(rac, pcshr); } =20 const struct address_space_operations z_erofs_aops =3D { --=20 2.43.5 From nobody Thu Feb 12 20:14:56 2026 Received: from out30-131.freemail.mail.aliyun.com (out30-131.freemail.mail.aliyun.com [115.124.30.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AC3BE14A4F3 for ; Sun, 5 Jan 2025 15:12:22 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.131 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1736089945; cv=none; b=hkqelE9GPjldcNGPtZZub7pv7/D2G2O8r1hN6HeO2VcNhPAJZRBCC4w148ZjvoaFdJeCcLIZakdZwInD3zsQPn8oZUODAIbxUbXChq8MilSRbIEPnD6nlQDGfd8GH1Z4ClXEDUgKcKJbhMZOH25c4oIcoh0MV3vzMfzieiivImw= ARC-Message-Signature: 
i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1736089945; c=relaxed/simple; bh=6t9XUMI8Kh7Xai4WAJSdAxjPM/lApNzzyaGGHZDDZbE=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=bK46vzkjfWxdzp/FOh12aa45izjYvOqiG5elNwPoMyXMtPkupstRDv6IZeP3WA7aWnv9WL5WR2YyZi+Yx1HJ2jQg8e9eQ2A0T3yb/zudtPtiIEm/jxzayeRRn3oq9vaZHZWCZd1PicalYk1DHj7dWy1CXtZ6DRm2ry3pJsnuiJs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=xtnvVUeu; arc=none smtp.client-ip=115.124.30.131 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="xtnvVUeu" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1736089935; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=irKsjukeB8am87/9XUlU8GqkONSxlLOSB/FtU9kbpzU=; b=xtnvVUeui0a/CivSx4kw/2r/aY61TS65u8Nfp6R4NIkf4sH+oribkQm+UC3YdBfM0jOIcySrXhBCFJZeVjs5mEqmZ6BjuX7IfN04ksUIWnCYemmyhXno4jTxcc1bGstqmO4u6mFtl8P6nzXC6XI9XPA1Nc+8ZL7df2BT0spOYKc= Received: from localhost(mailfrom:hongzhen@linux.alibaba.com fp:SMTPD_---0WMyjv2C_1736089934 cluster:ay36) by smtp.aliyun-inc.com; Sun, 05 Jan 2025 23:12:14 +0800 From: Hongzhen Luo To: linux-erofs@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org, Hongzhen Luo Subject: [RFC PATCH v5 4/4] erofs: introduce .fadvise for page cache share Date: Sun, 5 Jan 2025 23:12:08 +0800 Message-ID: <20250105151208.3797385-5-hongzhen@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20250105151208.3797385-1-hongzhen@linux.alibaba.com> References: 
<20250105151208.3797385-1-hongzhen@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" When .fadvise is used to release a file's page cache, it frees only those pages that were first read through this file. To achieve this, an interval tree is added to the inode of that file to track the segments that were first read through that inode. Signed-off-by: Hongzhen Luo --- fs/erofs/data.c | 5 +- fs/erofs/internal.h | 3 + fs/erofs/pagecache_share.c | 151 +++++++++++++++++++++++++++++++++++-- fs/erofs/pagecache_share.h | 10 ++- fs/erofs/zdata.c | 5 +- 5 files changed, 160 insertions(+), 14 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fb08acbeaab6..ebb9a79e5f0e 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -382,10 +382,11 @@ static int erofs_read_folio(struct file *file, struct= folio *folio) static void erofs_readahead(struct readahead_control *rac) { int pcshr; + unsigned long start; =20 - pcshr =3D erofs_pcshr_readahead_begin(rac); + pcshr =3D erofs_pcshr_readahead_begin(rac, &start); iomap_readahead(rac, &erofs_iomap_ops); - erofs_pcshr_readahead_end(rac, pcshr); + erofs_pcshr_readahead_end(rac, pcshr, start); } =20 static sector_t erofs_bmap(struct address_space *mapping, sector_t block) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 6c87621d86ba..593c79abfb79 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -282,6 +282,9 @@ struct erofs_inode { }; #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE struct inode *ano_inode; + /* first-read segments */ + struct rb_root_cached segs; + struct mutex segs_mutex; #endif /* the corresponding vfs inode */ struct inode vfs_inode; diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c index 22172b5e21c7..46b022de5f17 100644 --- a/fs/erofs/pagecache_share.c +++ b/fs/erofs/pagecache_share.c @@ -6,6 +6,9 @@ #include 
#include #include +#include +#include +#include #include "pagecache_share.h" #include "internal.h" #include "xattr.h" @@ -18,6 +21,8 @@ struct erofs_pcshr_counter { struct mutex mutex; struct kref ref; struct vfsmount *mnt; + /* kmem cache for each inode's first-read segments */ + struct kmem_cache *segsp; }; =20 struct erofs_pcshr_private { @@ -38,6 +43,8 @@ static void erofs_pcshr_counter_release(struct kref *ref) DBG_BUGON(!counter->mnt); kern_unmount(counter->mnt); counter->mnt =3D NULL; + kmem_cache_destroy(counter->segsp); + counter->segsp =3D NULL; } =20 int erofs_pcshr_init_mnt(void) @@ -54,6 +61,14 @@ int erofs_pcshr_init_mnt(void) } mnt_counter.mnt =3D tmp; kref_init(&mnt_counter.ref); + + mnt_counter.segsp =3D kmem_cache_create("erofs_segs", + sizeof(struct interval_tree_node), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, NULL); + if (!mnt_counter.segsp) { + ret =3D -ENOMEM; + goto out; + } } else kref_get(&mnt_counter.ref); ret =3D 0; @@ -69,6 +84,16 @@ void erofs_pcshr_free_mnt(void) mutex_unlock(&mnt_counter.mutex); } =20 +static struct interval_tree_node *erofs_pcshr_alloc_seg(void) +{ + return kmem_cache_alloc(mnt_counter.segsp, GFP_KERNEL); +} + +static void erofs_pcshr_free_seg(struct interval_tree_node *seg) +{ + kmem_cache_free(mnt_counter.segsp, seg); +} + static int erofs_fprt_eq(struct inode *inode, void *data) { struct erofs_pcshr_private *ano_private =3D inode->i_private; @@ -111,6 +136,8 @@ int erofs_pcshr_fill_inode(struct inode *inode) erofs_fprt_eq, erofs_fprt_set, fprt); DBG_BUGON(!ano_inode); vi->ano_inode =3D ano_inode; + vi->segs =3D RB_ROOT_CACHED; + mutex_init(&vi->segs_mutex); if (ano_inode->i_state & I_NEW) { if (erofs_inode_is_data_compressed(vi->datalayout)) ano_inode->i_mapping->a_ops =3D &z_erofs_aops; @@ -126,12 +153,20 @@ int erofs_pcshr_fill_inode(struct inode *inode) =20 void erofs_pcshr_free_inode(struct inode *inode) { + struct interval_tree_node *seg, *next_seg; struct erofs_inode *vi =3D EROFS_I(inode); =20 if 
(S_ISREG(inode->i_mode) && vi->ano_inode) { iput(vi->ano_inode); vi->ano_inode =3D NULL; } + seg =3D interval_tree_iter_first(&vi->segs, 0, LLONG_MAX); + while (seg) { + next_seg =3D interval_tree_iter_next(seg, 0, LLONG_MAX); + interval_tree_remove(seg, &vi->segs); + erofs_pcshr_free_seg(seg); + seg =3D next_seg; + } } =20 static struct file *erofs_pcshr_alloc_file(struct file *file, @@ -219,6 +254,65 @@ static int erofs_pcshr_mmap(struct file *file, struct = vm_area_struct *vma) return 0; } =20 +static int erofs_pcshr_fadvise(struct file *file, loff_t offset, loff_t le= n, int advice) +{ + struct erofs_inode *vi =3D EROFS_I(file_inode(file)); + struct interval_tree_node *seg, *next_seg, *new_seg; + struct file *ano_file =3D file->private_data; + struct erofs_pcshr_private *ano_private; + erofs_off_t start, end, l, r; + int err =3D 0; + + if (advice !=3D POSIX_FADV_DONTNEED) + return generic_fadvise(ano_file, offset, len, advice); + + ano_private =3D file_inode(ano_file)->i_private; + + start =3D offset >> PAGE_SHIFT; + /* len =3D 0 means EOF */ + end =3D ((!len ? 
LLONG_MAX : offset + len) >> PAGE_SHIFT) + 1; + + mutex_lock(&vi->segs_mutex); + seg =3D interval_tree_iter_first(&vi->segs, start, end); + while (seg) { + next_seg =3D interval_tree_iter_next(seg, start, end); + /* + * calculate the overlap between [start, end) + * and [seg->start, seg->last) + */ + l =3D max_t(u64, seg->start | 0ULL, start); + r =3D min_t(u64, seg->last | 0ULL, end); + if (l >=3D r) + continue; + + /* a new smaller interval on the left side */ + if (seg->start < l) { + new_seg =3D erofs_pcshr_alloc_seg(); + new_seg->start =3D seg->start; + new_seg->last =3D l; + interval_tree_insert(new_seg, &vi->segs); + } + + /* a new smaller interval on the right side */ + if (r < seg->last) { + new_seg =3D erofs_pcshr_alloc_seg(); + new_seg->start =3D r; + new_seg->last =3D seg->last; + interval_tree_insert(new_seg, &vi->segs); + } + mutex_lock(&ano_private->mutex); + truncate_inode_pages_range(file_inode(ano_file)->i_mapping, + l << PAGE_SHIFT, + (r - 1) << PAGE_SHIFT); + mutex_unlock(&ano_private->mutex); + interval_tree_remove(seg, &vi->segs); + erofs_pcshr_free_seg(seg); + seg =3D next_seg; + } + mutex_unlock(&vi->segs_mutex); + return err; +} + const struct file_operations erofs_pcshr_fops =3D { .open =3D erofs_pcshr_file_open, .llseek =3D generic_file_llseek, @@ -227,6 +321,7 @@ const struct file_operations erofs_pcshr_fops =3D { .release =3D erofs_pcshr_file_release, .get_unmapped_area =3D thp_get_unmapped_area, .splice_read =3D filemap_splice_read, + .fadvise =3D erofs_pcshr_fadvise, }; =20 int erofs_pcshr_read_begin(struct file *file, struct folio *folio) @@ -240,9 +335,11 @@ int erofs_pcshr_read_begin(struct file *file, struct f= olio *folio) vi =3D file->private_data; if (vi->ano_inode !=3D file_inode(file)) return 0; - ano_private =3D vi->ano_inode->i_private; + + mutex_lock(&vi->segs_mutex); mutex_lock(&ano_private->mutex); + folio->mapping->host =3D &vi->vfs_inode; return 1; } @@ -250,16 +347,36 @@ int erofs_pcshr_read_begin(struct file *file, 
struct = folio *folio) void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcsh= r) { struct erofs_pcshr_private *ano_private; + struct interval_tree_node *seg; + struct erofs_inode *vi; =20 if (pcshr =3D=3D 0) return; - + vi =3D file->private_data; ano_private =3D file_inode(file)->i_private; + + /* switch host inode */ folio->mapping->host =3D file_inode(file); + + /* record first-read segment */ + seg =3D erofs_pcshr_alloc_seg(); + if (!seg) { + DBG_BUGON(1); + goto unlock; + } + seg->start =3D folio_index(folio); + seg->last =3D seg->start + (folio_size(folio) >> PAGE_SHIFT); + if (seg->last > (vi->vfs_inode.i_size >> PAGE_SHIFT)) + seg->last =3D vi->vfs_inode.i_size >> PAGE_SHIFT; + DBG_BUGON(seg->last < seg->start); + interval_tree_insert(seg, &vi->segs); +unlock: mutex_unlock(&ano_private->mutex); + mutex_unlock(&vi->segs_mutex); } =20 -int erofs_pcshr_readahead_begin(struct readahead_control *rac) +int erofs_pcshr_readahead_begin(struct readahead_control *rac, + unsigned long *start) { struct erofs_inode *vi; struct file *file =3D rac->file; @@ -271,21 +388,43 @@ int erofs_pcshr_readahead_begin(struct readahead_cont= rol *rac) vi =3D file->private_data; if (vi->ano_inode !=3D file_inode(file)) return 0; - ano_private =3D file_inode(file)->i_private; + + mutex_lock(&vi->segs_mutex); mutex_lock(&ano_private->mutex); + rac->mapping->host =3D &vi->vfs_inode; + *start =3D readahead_pos(rac) >> PAGE_SHIFT; return 1; } =20 -void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr, + unsigned long start) { struct erofs_pcshr_private *ano_private; + struct interval_tree_node *seg; + struct erofs_inode *vi; =20 if (pcshr =3D=3D 0) return; - + vi =3D rac->file->private_data; ano_private =3D file_inode(rac->file)->i_private; + + /* switch host inode */ rac->mapping->host =3D file_inode(rac->file); + + /* record first-read segments */ + seg =3D 
erofs_pcshr_alloc_seg(); + if (!seg) { + DBG_BUGON(1); + goto unlock; + } + seg->start =3D start; + seg->last =3D readahead_pos(rac) >> PAGE_SHIFT; + if (seg->last > (vi->vfs_inode.i_size >> PAGE_SHIFT)) + seg->last =3D vi->vfs_inode.i_size >> PAGE_SHIFT; + interval_tree_insert(seg, &vi->segs); +unlock: mutex_unlock(&ano_private->mutex); + mutex_unlock(&vi->segs_mutex); } diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h index abda2a60278b..2c4ac7e45227 100644 --- a/fs/erofs/pagecache_share.h +++ b/fs/erofs/pagecache_share.h @@ -17,8 +17,10 @@ void erofs_pcshr_free_inode(struct inode *inode); /* switch between the anonymous inode and the real inode */ int erofs_pcshr_read_begin(struct file *file, struct folio *folio); void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcsh= r); -int erofs_pcshr_readahead_begin(struct readahead_control *rac); -void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr); +int erofs_pcshr_readahead_begin(struct readahead_control *rac, + unsigned long *start); +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr, + unsigned long start); =20 #else =20 @@ -29,8 +31,8 @@ static inline void erofs_pcshr_free_inode(struct inode *i= node) {} =20 static inline int erofs_pcshr_read_begin(struct file *file, struct folio *= folio) { return 0; } static inline void erofs_pcshr_read_end(struct file *file, struct folio *f= olio, int pcshr) {} -static inline int erofs_pcshr_readahead_begin(struct readahead_control *ra= c) { return 0; } -static inline void erofs_pcshr_readahead_end(struct readahead_control *rac= , int pcshr) {} +static inline int erofs_pcshr_readahead_begin(struct readahead_control *ra= c, unsigned long *start) { return 0; } +static inline void erofs_pcshr_readahead_end(struct readahead_control *rac= , int pcshr, unsigned long start) {} =20 #endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE =20 diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 
fc2ed01eaabe..f646ec70cd7a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1921,9 +1921,10 @@ static void z_erofs_readahead(struct readahead_contr= ol *rac) struct z_erofs_decompress_frontend f =3D DECOMPRESS_FRONTEND_INIT(inode); struct folio *head =3D NULL, *folio; unsigned int nr_folios; + unsigned long start; int err, pcshr; =20 - pcshr =3D erofs_pcshr_readahead_begin(rac); + pcshr =3D erofs_pcshr_readahead_begin(rac, &start); f.headoffset =3D readahead_pos(rac); =20 z_erofs_pcluster_readmore(&f, rac, true); @@ -1951,7 +1952,7 @@ static void z_erofs_readahead(struct readahead_contro= l *rac) (void)z_erofs_runqueue(&f, nr_folios); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); - erofs_pcshr_readahead_end(rac, pcshr); + erofs_pcshr_readahead_end(rac, pcshr, start); } =20 const struct address_space_operations z_erofs_aops =3D { --=20 2.43.5