From nobody Fri Dec 19 08:58:51 2025 Received: from out30-98.freemail.mail.aliyun.com (out30-98.freemail.mail.aliyun.com [115.124.30.98]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5EB391D45FF for ; Mon, 2 Sep 2024 11:06:36 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.98 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725275199; cv=none; b=S8t5c0Yb76EZxjz6MwXlMoSpYQMf71zHgwVyj3Lj98wS+wnXFjhYgrEkm23IuOS2UnrfvMSCO6KB3uEE3Ip1EPk1wDJ2gEegdo1qJigQfm/Ed9TPkIOnjauvts6XB1Lv/GVg6T0ahz6BAqfmdQXBxioY+dK9aJTJjZTC2PTlC6M= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725275199; c=relaxed/simple; bh=/D19tGyMqi94IO07FaZ2mz3Apm/t5eXAuyucNfsvoxc=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=QS1g7zbA7H8tqsvXTR+u7102o5RKYIpuhCBYOXQMnqEMEQjHMDNbfLkuTqOqflc7uBMx4SuWAjWCLvume6o5+H3x+MpvcQ1IMjI1oDYkol4D5hw8pf2xqQRGD9AVjUlFk6uDaDen8pz3ooUkkaHH3ZR6oQLF/QP1+zQSU0x9cRE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=pTV9IUXu; arc=none smtp.client-ip=115.124.30.98 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="pTV9IUXu" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1725275195; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=3lx/WieEaOEVggL0VOeyWGBi+pM61qJto+aBTEIGQYo=; 
b=pTV9IUXuTM/3BBpAVK/dFiFt8iwpzgvlH3jRYTVKovGzm4L+TtYIJ+CGdALVoRv+stWb123MohQkZhWfx9KQ7y2R5MICyAvVY6g5jnN2Qrv180rFe854pjWhOPl0v/eHwUnn4jIhSgwMztfc8vMK9ViLEVvh5wjArA2om8u+j7E= Received: from localhost(mailfrom:hongzhen@linux.alibaba.com fp:SMTPD_---0WE7YYpT_1725275194) by smtp.aliyun-inc.com; Mon, 02 Sep 2024 19:06:34 +0800 From: Hongzhen Luo To: linux-erofs@lists.ozlabs.org Cc: lihongbo22@huawei.com, linux-kernel@vger.kernel.org, Hongzhen Luo Subject: [PATCH RFC v4 1/4] erofs: move `struct erofs_anon_fs_type` to super.c Date: Mon, 2 Sep 2024 19:06:17 +0800 Message-ID: <20240902110620.2202586-2-hongzhen@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20240902110620.2202586-1-hongzhen@linux.alibaba.com> References: <20240902110620.2202586-1-hongzhen@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Move the `struct erofs_anon_fs_type` to the super.c and expose it in preparation for the upcoming page cache share feature. Signed-off-by: Hongzhen Luo --- v4: There are no changes compared to v3. v3: https://lore.kernel.org/all/20240828111959.3677011-2-hongzhen@linux.alibaba.com/ v2: The patch set v2 does not move the `struct erofs_anon_fs_type` to super.c. v1: https://lore.kernel.org/all/20240722065355.1396365-2-hongzhen@linux.alibaba.com/ --- fs/erofs/fscache.c | 13 ------------- fs/erofs/internal.h | 2 ++ fs/erofs/super.c | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index fda16eedafb5..826b2893acb2 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -3,7 +3,6 @@ * Copyright (C) 2022, Alibaba Cloud * Copyright (C) 2022, Bytedance Inc. All rights reserved. 
*/ -#include #include #include "internal.h" =20 @@ -13,18 +12,6 @@ static LIST_HEAD(erofs_domain_list); static LIST_HEAD(erofs_domain_cookies_list); static struct vfsmount *erofs_pseudo_mnt; =20 -static int erofs_anon_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type erofs_anon_fs_type =3D { - .owner =3D THIS_MODULE, - .name =3D "pseudo_erofs", - .init_fs_context =3D erofs_anon_init_fs_context, - .kill_sb =3D kill_anon_super, -}; - struct erofs_fscache_io { struct netfs_cache_resources cres; struct iov_iter iter; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 45dc15ebd870..3d6bb1b36378 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -387,6 +387,8 @@ extern const struct file_operations erofs_dir_fops; =20 extern const struct iomap_ops z_erofs_iomap_report_ops; =20 +extern struct file_system_type erofs_anon_fs_type; + /* flags for erofs_fscache_register_cookie() */ #define EROFS_REG_COOKIE_SHARE 0x0001 #define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 6cb5c8916174..afca576144ca 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "xattr.h" =20 #define CREATE_TRACE_POINTS @@ -834,6 +835,26 @@ static struct file_system_type erofs_fs_type =3D { }; MODULE_ALIAS_FS("erofs"); =20 +static const struct super_operations erofs_anon_sops =3D { + .statfs =3D simple_statfs, +}; + +static int erofs_anon_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx =3D init_pseudo(fc, EROFS_SUPER_MAGIC); + + if (ctx) + ctx->ops =3D &erofs_anon_sops; + return ctx ? 
0 : -ENOMEM; +} + +struct file_system_type erofs_anon_fs_type =3D { + .owner =3D THIS_MODULE, + .name =3D "pseudo_erofs", + .init_fs_context =3D erofs_anon_init_fs_context, + .kill_sb =3D kill_anon_super, +}; + static int __init erofs_module_init(void) { int err; --=20 2.43.5 From nobody Fri Dec 19 08:58:51 2025 Received: from out30-130.freemail.mail.aliyun.com (out30-130.freemail.mail.aliyun.com [115.124.30.130]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 88E991D47C0 for ; Mon, 2 Sep 2024 11:06:41 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.130 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725275204; cv=none; b=cjYjD08dmQ5f8/Giv9cAXLj5Kr7UdYKXv1nXgI+ZheJ+dr8Y97/e1f0Z36h0kp7E1UIqdy35cocZAVDeAtuhPYpETU17woivMUUkS7hQbNbO48qKlfDl5iFsf1cgpOHrQ8BZTf0qCcqOhehD+NoSd+5qo2pDEqYBBPJ4UZ4JjHU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725275204; c=relaxed/simple; bh=YcmYmi49WcrgfePWcewDo7GbNXNoTX2U4V2NA2M1myk=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=JTmYq/EBUaYfnlsM+XEfXYvDkB1FPsF2+B2da4+rA/Y3HFHa5ccy9HiQ/OIY8FPjBNgSaDqgtU12ndYVJ8huBFGRFEQUldYeIzdGD3fxRwBySKajZzbdPbkvLLjXu3hACw5KPvjKPlVBrLLK5dAvMs2tZVgq9WnlcooK+vDc0U4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=Wjw/gC4e; arc=none smtp.client-ip=115.124.30.130 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com 
header.i=@linux.alibaba.com header.b="Wjw/gC4e" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1725275199; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=HmnuGVS/tsuwLc1eMgnltdcvukWN1aa5ke3kJ2BOAww=; b=Wjw/gC4eSgBP6oMwKKEO5kRNBMwha+u+nzxgbhm0XCavojeiZADS0W/+DvIzS+9GbRIhe13/CAq9cIhXmuI5tntAKMrQGY3jmBBSET1bfLlCeRflMwuOQAj33Qbqq4KceKvMrCV5JBp/2rn9UAiC+9IJKvg3yBtvRwIWAwbE7Ug= Received: from localhost(mailfrom:hongzhen@linux.alibaba.com fp:SMTPD_---0WE7X3AN_1725275197) by smtp.aliyun-inc.com; Mon, 02 Sep 2024 19:06:38 +0800 From: Hongzhen Luo To: linux-erofs@lists.ozlabs.org Cc: lihongbo22@huawei.com, linux-kernel@vger.kernel.org, Hongzhen Luo Subject: [PATCH RFC v4 2/4] erofs: introduce page cache share feature Date: Mon, 2 Sep 2024 19:06:18 +0800 Message-ID: <20240902110620.2202586-3-hongzhen@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20240902110620.2202586-1-hongzhen@linux.alibaba.com> References: <20240902110620.2202586-1-hongzhen@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Currently, reading files with different paths (or names) but the same content will consume multiple copies of the page cache, even if the content of these page caches is the same. For example, reading identical files (e.g., *.so files) from two different minor versions of container images will cost multiple copies of the same page cache, since different containers have different mount points. Therefore, sharing the page cache for files with the same content can save memory. This introduces the page cache share feature in erofs. During the mkfs phase, the file content is hashed and the hash value is stored in the `trusted.erofs.fingerprint` extended attribute. 
Inodes of files with the same `trusted.erofs.fingerprint` are mapped to the same anonymous inode (indicated by the `ano_inode` field). When a read request occurs, the anonymous inode serves as a "container" whose page cache is shared. The actual operations involving the iomap are carried out by the original inode which is mapped to the anonymous inode. Below is the memory usage for reading all files in two different minor versions of container images: +-------------------+------------------+-------------+---------------+ | Image | Page Cache Share | Memory (MB) | Memory | | | | | Reduction (%) | +-------------------+------------------+-------------+---------------+ | | No | 241 | - | | redis +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 163 | 33% | +-------------------+------------------+-------------+---------------+ | | No | 872 | - | | postgres +------------------+-------------+---------------+ | 16.1 & 16.2 | Yes | 630 | 28% | +-------------------+------------------+-------------+---------------+ | | No | 2771 | - | | tensorflow +------------------+-------------+---------------+ | 1.11.0 & 2.11.1 | Yes | 2340 | 16% | +-------------------+------------------+-------------+---------------+ | | No | 926 | - | | mysql +------------------+-------------+---------------+ | 8.0.11 & 8.0.12 | Yes | 735 | 21% | +-------------------+------------------+-------------+---------------+ | | No | 390 | - | | nginx +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 219 | 44% | +-------------------+------------------+-------------+---------------+ | tomcat | No | 924 | - | | 10.1.25 & 10.1.26 +------------------+-------------+---------------+ | | Yes | 474 | 49% | +-------------------+------------------+-------------+---------------+ Additionally, the table below shows the runtime memory usage of the container: +-------------------+------------------+-------------+---------------+ | Image | Page Cache Share | Memory (MB) | 
Memory | | | | | Reduction (%) | +-------------------+------------------+-------------+---------------+ | | No | 35 | - | | redis +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 28 | 20% | +-------------------+------------------+-------------+---------------+ | | No | 149 | - | | postgres +------------------+-------------+---------------+ | 16.1 & 16.2 | Yes | 95 | 37% | +-------------------+------------------+-------------+---------------+ | | No | 1028 | - | | tensorflow +------------------+-------------+---------------+ | 1.11.0 & 2.11.1 | Yes | 930 | 10% | +-------------------+------------------+-------------+---------------+ | | No | 155 | - | | mysql +------------------+-------------+---------------+ | 8.0.11 & 8.0.12 | Yes | 132 | 15% | +-------------------+------------------+-------------+---------------+ | | No | 25 | - | | nginx +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 20 | 20% | +-------------------+------------------+-------------+---------------+ | tomcat | No | 186 | - | | 10.1.25 & 10.1.26 +------------------+-------------+---------------+ | | Yes | 98 | 48% | +-------------------+------------------+-------------+---------------+ Signed-off-by: Hongzhen Luo --- v4: There are no changes compared to v3. 
v3: https://lore.kernel.org/all/20240828111959.3677011-3-hongzhen@linux.alibaba.com/ v2: https://lore.kernel.org/all/20240731080704.678259-2-hongzhen@linux.alibaba.com/ v1: https://lore.kernel.org/all/20240722065355.1396365-4-hongzhen@linux.alibaba.com/ --- fs/erofs/Kconfig | 10 +++ fs/erofs/Makefile | 1 + fs/erofs/internal.h | 4 + fs/erofs/pagecache_share.c | 171 +++++++++++++++++++++++++++++++++++++ fs/erofs/pagecache_share.h | 20 +++++ 5 files changed, 206 insertions(+) create mode 100644 fs/erofs/pagecache_share.c create mode 100644 fs/erofs/pagecache_share.h diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 7dcdce660cac..756a74de623c 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -158,3 +158,13 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI at higher priority. =20 If unsure, say N. + +config EROFS_FS_PAGE_CACHE_SHARE + bool "EROFS page cache share support" + depends on EROFS_FS + default n + help + This permits EROFS to share page cache for files with same + fingerprints. + + If unsure, say N. 
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 097d672e6b14..f14a2ac0e561 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -8,3 +8,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) +=3D decompressor_lzma.o erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) +=3D decompressor_deflate.o erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) +=3D decompressor_zstd.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) +=3D fscache.o +erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) +=3D pagecache_share.o diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 3d6bb1b36378..358377825927 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -288,6 +288,9 @@ struct erofs_inode { }; #endif /* CONFIG_EROFS_FS_ZIP */ }; +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + struct inode *ano_inode; +#endif /* the corresponding vfs inode */ struct inode vfs_inode; }; @@ -384,6 +387,7 @@ extern const struct inode_operations erofs_dir_iops; =20 extern const struct file_operations erofs_file_fops; extern const struct file_operations erofs_dir_fops; +extern const struct file_operations erofs_pcs_file_fops; =20 extern const struct iomap_ops z_erofs_iomap_report_ops; =20 diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c new file mode 100644 index 000000000000..2d2a74547b67 --- /dev/null +++ b/fs/erofs/pagecache_share.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#include +#include +#include "pagecache_share.h" +#include "internal.h" +#include "xattr.h" + +#define PCS_FPRT_IDX 4 +#define PCS_FPRT_NAME "erofs.fingerprint" +#define PCS_FPRT_MAXLEN (sizeof(size_t) + 1024) + +static DEFINE_MUTEX(pseudo_mnt_lock); +static refcount_t pseudo_mnt_count; +static struct vfsmount *erofs_pcs_mnt; + +int erofs_pcs_init_mnt(void) +{ + mutex_lock(&pseudo_mnt_lock); + if (!erofs_pcs_mnt) { + struct vfsmount *tmp =3D kern_mount(&erofs_anon_fs_type); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + erofs_pcs_mnt =3D tmp; + refcount_set(&pseudo_mnt_count, 1); + } else + 
refcount_add(1, &pseudo_mnt_count); + mutex_unlock(&pseudo_mnt_lock); + return 0; +} + +void erofs_pcs_free_mnt(void) +{ + mutex_lock(&pseudo_mnt_lock); + if (refcount_dec_and_test(&pseudo_mnt_count)) { + kern_unmount(erofs_pcs_mnt); + erofs_pcs_mnt =3D NULL; + } + mutex_unlock(&pseudo_mnt_lock); +} + +static int erofs_pcs_eq(struct inode *inode, void *data) +{ + return inode->i_private && memcmp(inode->i_private, data, + sizeof(size_t) + *(size_t *)data) =3D=3D 0 ? 1 : 0; +} + +static int erofs_pcs_set_fprt(struct inode *inode, void *data) +{ + /* fprt length and content */ + inode->i_private =3D kmalloc(*(size_t *)data + sizeof(size_t), + GFP_KERNEL); + memcpy(inode->i_private, data, sizeof(size_t) + *(size_t *)data); + return 0; +} + +void erofs_pcs_fill_inode(struct inode *inode) +{ + struct erofs_inode *vi =3D EROFS_I(inode); + char fprt[PCS_FPRT_MAXLEN]; + struct inode *ano_inode; + unsigned long fprt_hash; + size_t fprt_len; + + vi->ano_inode =3D NULL; + fprt_len =3D erofs_getxattr(inode, PCS_FPRT_IDX, PCS_FPRT_NAME, + fprt + sizeof(size_t), PCS_FPRT_MAXLEN); + if (fprt_len > 0 && fprt_len <=3D PCS_FPRT_MAXLEN) { + *(size_t *)fprt =3D fprt_len; + fprt_hash =3D xxh32(fprt + sizeof(size_t), fprt_len, 0); + ano_inode =3D iget5_locked(erofs_pcs_mnt->mnt_sb, fprt_hash, + erofs_pcs_eq, erofs_pcs_set_fprt, fprt); + vi->ano_inode =3D ano_inode; + if (ano_inode->i_state & I_NEW) { + if (erofs_inode_is_data_compressed(vi->datalayout)) + ano_inode->i_mapping->a_ops =3D &z_erofs_aops; + else + ano_inode->i_mapping->a_ops =3D + &erofs_raw_access_aops; + ano_inode->i_size =3D inode->i_size; + unlock_new_inode(ano_inode); + } + } +} + +static struct file *erofs_pcs_alloc_file(struct file *file, + struct inode *ano_inode) +{ + struct file *ano_file; + + ano_file =3D alloc_file_pseudo(ano_inode, erofs_pcs_mnt, "[erofs_pcs_f]", + O_RDONLY, &erofs_file_fops); + file_ra_state_init(&ano_file->f_ra, file->f_mapping); + ano_file->private_data =3D EROFS_I(file_inode(file)); + 
return ano_file; +} + +static int erofs_pcs_file_open(struct inode *inode, struct file *file) +{ + struct file *ano_file; + struct inode *ano_inode; + struct erofs_inode *vi =3D EROFS_I(inode); + + ano_inode =3D vi->ano_inode; + if (!ano_inode) + return -EINVAL; + ano_file =3D erofs_pcs_alloc_file(file, ano_inode); + ihold(ano_inode); + file->private_data =3D (void *)ano_file; + return 0; +} + +static int erofs_pcs_file_release(struct inode *inode, struct file *file) +{ + if (!file->private_data) + return -EINVAL; + fput((struct file *)file->private_data); + file->private_data =3D NULL; + return 0; +} + +static ssize_t erofs_pcs_file_read_iter(struct kiocb *iocb, + struct iov_iter *to) +{ + struct inode *inode =3D file_inode(iocb->ki_filp); + struct file *file, *ano_file; + struct kiocb ano_iocb; + ssize_t res; + + if (!iov_iter_count(to)) + return 0; +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return erofs_file_fops.read_iter(iocb, to); +#endif + if (iocb->ki_flags & IOCB_DIRECT) + return erofs_file_fops.read_iter(iocb, to); + + memcpy(&ano_iocb, iocb, sizeof(struct kiocb)); + file =3D iocb->ki_filp; + ano_file =3D file->private_data; + if (!ano_file) + return -EINVAL; + ano_iocb.ki_filp =3D ano_file; + res =3D filemap_read(&ano_iocb, to, 0); + memcpy(iocb, &ano_iocb, sizeof(struct kiocb)); + iocb->ki_filp =3D file; + file_accessed(file); + return res; +} + +static int erofs_pcs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *ano_file =3D file->private_data; + + vma_set_file(vma, ano_file); + vma->vm_ops =3D &generic_file_vm_ops; + return 0; +} + +const struct file_operations erofs_pcs_file_fops =3D { + .open =3D erofs_pcs_file_open, + .llseek =3D generic_file_llseek, + .read_iter =3D erofs_pcs_file_read_iter, + .mmap =3D erofs_pcs_mmap, + .release =3D erofs_pcs_file_release, + .get_unmapped_area =3D thp_get_unmapped_area, + .splice_read =3D filemap_splice_read, +}; diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h new 
file mode 100644 index 000000000000..b8111291cf79 --- /dev/null +++ b/fs/erofs/pagecache_share.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#ifndef __EROFS_PAGECACHE_SHARE_H +#define __EROFS_PAGECACHE_SHARE_H + +#include +#include +#include +#include +#include "internal.h" + +int erofs_pcs_init_mnt(void); +void erofs_pcs_free_mnt(void); +void erofs_pcs_fill_inode(struct inode *inode); + +extern const struct vm_operations_struct generic_file_vm_ops; + +#endif --=20 2.43.5 From nobody Fri Dec 19 08:58:51 2025 Received: from out30-124.freemail.mail.aliyun.com (out30-124.freemail.mail.aliyun.com [115.124.30.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 558DD1D47D8 for ; Mon, 2 Sep 2024 11:06:45 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725275208; cv=none; b=KOUIz2yc4a2th/BEHMA6hrwE/Bp1i5Q+pYFO9CssHorFEs+79kK9wG1UiWjV2vuhINcqVcu5UlwjvlwJhddn+XvoGnoOvOCwEeN3CTgNO/rs/hv2ekNta+oWMFINoNPftfbVjj3PhWIQw/HOa0JkqfyZxIVHZ2cjrKPIgle/8Bo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725275208; c=relaxed/simple; bh=V0hQ1TV8PGl6D0j6r5STtcvN72jVIzadXFI9qco4OcI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=mY/EsDkYddn1jmDAOxNX9hFqtFCsN7HIzQJjl4jm4vgwwxgfuszcse68XpXGkdzv1bE0PKBnzw1QWGwSFm/o0Y3AOPqILaQfMWNFmof5mSwCk1K97+0SixpGasbmLmPE4cKxzRcR4n44shnAQA22vT97JHa32Bd1G9VyXZADAyQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=nn2MGH1P; arc=none smtp.client-ip=115.124.30.124 Authentication-Results: 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="nn2MGH1P" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1725275204; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=wILK9eGhOkbD2R7HF62DI0YCU9LxevSyER3gqj7sMMA=; b=nn2MGH1PxGTRElOlDlg/wwAyyqyuqmB4/w3nw/+JOl3OVw/jMQQm+I7uN2NKj4iwHhgtcM/jgrvbxZLTo+DWgTE2qQFP24wKZY0NeHfJVu6ycIiX6mUb+tIgytKPJzNYFnYm2tXYflnvrBeO7fy7hPgL3xaQDr3XMtdmYBFOoPs= Received: from localhost(mailfrom:hongzhen@linux.alibaba.com fp:SMTPD_---0WE8BYKE_1725275202) by smtp.aliyun-inc.com; Mon, 02 Sep 2024 19:06:43 +0800 From: Hongzhen Luo To: linux-erofs@lists.ozlabs.org Cc: lihongbo22@huawei.com, linux-kernel@vger.kernel.org, Hongzhen Luo Subject: [PATCH RFC v4 3/4] erofs: apply the page cache share feature Date: Mon, 2 Sep 2024 19:06:19 +0800 Message-ID: <20240902110620.2202586-4-hongzhen@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20240902110620.2202586-1-hongzhen@linux.alibaba.com> References: <20240902110620.2202586-1-hongzhen@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" This modifies relevant functions to apply the page cache share feature. Signed-off-by: Hongzhen Luo --- v4: There are no changes compared to v3. 
v3: https://lore.kernel.org/all/20240828111959.3677011-4-hongzhen@linux.ali= baba.com/ v2: https://lore.kernel.org/all/20240731080704.678259-3-hongzhen@linux.alib= aba.com/ v1: https://lore.kernel.org/all/20240722065355.1396365-5-hongzhen@linux.ali= baba.com/ --- fs/erofs/data.c | 34 +++++++++++++++++++++++++++++++++- fs/erofs/inode.c | 12 ++++++++++++ fs/erofs/super.c | 29 +++++++++++++++++++++++++++++ fs/erofs/zdata.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 1 deletion(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1b7eba38ba1e..ef27b934115f 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -347,12 +347,44 @@ int erofs_fiemap(struct inode *inode, struct fiemap_e= xtent_info *fieinfo, */ static int erofs_read_folio(struct file *file, struct folio *folio) { +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + struct erofs_inode *vi =3D NULL; + int ret; + + if (file && file->private_data) { + vi =3D file->private_data; + if (vi->ano_inode =3D=3D file_inode(file)) + folio->mapping->host =3D &vi->vfs_inode; + else + vi =3D NULL; + } + ret =3D iomap_read_folio(folio, &erofs_iomap_ops); + if (vi) + folio->mapping->host =3D file_inode(file); + return ret; +#else return iomap_read_folio(folio, &erofs_iomap_ops); +#endif } - static void erofs_readahead(struct readahead_control *rac) { +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + struct erofs_inode *vi =3D NULL; + struct file *file =3D rac->file; + + if (file && file->private_data) { + vi =3D file->private_data; + if (vi->ano_inode =3D=3D file_inode(file)) + rac->mapping->host =3D &vi->vfs_inode; + else + vi =3D NULL; + } + iomap_readahead(rac, &erofs_iomap_ops); + if (vi) + rac->mapping->host =3D file_inode(file); +#else return iomap_readahead(rac, &erofs_iomap_ops); +#endif } =20 static sector_t erofs_bmap(struct address_space *mapping, sector_t block) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 419432be3223..3f2db0ad7959 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ 
-5,6 +5,7 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" +#include "pagecache_share.h" =20 #include =20 @@ -229,10 +230,21 @@ static int erofs_fill_inode(struct inode *inode) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op =3D &erofs_generic_iops; +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + erofs_pcs_fill_inode(inode); + if (vi->ano_inode) + inode->i_fop =3D &erofs_pcs_file_fops; + else if (erofs_inode_is_data_compressed(vi->datalayout)) + inode->i_fop =3D &generic_ro_fops; + else + inode->i_fop =3D &erofs_file_fops; +#else if (erofs_inode_is_data_compressed(vi->datalayout)) inode->i_fop =3D &generic_ro_fops; else inode->i_fop =3D &erofs_file_fops; +#endif + break; case S_IFDIR: inode->i_op =3D &erofs_dir_iops; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index afca576144ca..113e305080fa 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -12,6 +12,7 @@ #include #include #include "xattr.h" +#include "pagecache_share.h" =20 #define CREATE_TRACE_POINTS #include @@ -103,6 +104,12 @@ static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi =3D EROFS_I(inode); =20 +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + if (S_ISREG(inode->i_mode) && vi->ano_inode) { + iput(vi->ano_inode); + vi->ano_inode =3D NULL; + } +#endif if (inode->i_op =3D=3D &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); @@ -687,6 +694,12 @@ static int erofs_fc_fill_super(struct super_block *sb,= struct fs_context *fc) if (err) return err; =20 +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + err =3D erofs_pcs_init_mnt(); + if (err) + return err; +#endif + erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid); return 0; } @@ -797,6 +810,9 @@ static void erofs_kill_sb(struct super_block *sb) else kill_block_super(sb); =20 +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + erofs_pcs_free_mnt(); +#endif erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_fs(sb); @@ -835,8 +851,21 @@ static 
struct file_system_type erofs_fs_type =3D { }; MODULE_ALIAS_FS("erofs"); =20 +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE +static void erofs_free_anon_inode(struct inode *inode) +{ + if (inode->i_private) { + kfree(inode->i_private); + inode->i_private =3D NULL; + } +} +#endif + static const struct super_operations erofs_anon_sops =3D { .statfs =3D simple_statfs, +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + .free_inode =3D erofs_free_anon_inode, +#endif }; =20 static int erofs_anon_init_fs_context(struct fs_context *fc) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 424f656cd765..cd3cabfef462 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1802,6 +1802,17 @@ static void z_erofs_pcluster_readmore(struct z_erofs= _decompress_frontend *f, =20 static int z_erofs_read_folio(struct file *file, struct folio *folio) { +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + struct erofs_inode *vi =3D NULL; + + if (file && file->private_data) { + vi =3D file->private_data; + if (vi->ano_inode =3D=3D file_inode(file)) + folio->mapping->host =3D &vi->vfs_inode; + else + vi =3D NULL; + } +#endif struct inode *const inode =3D folio->mapping->host; struct erofs_sb_info *const sbi =3D EROFS_I_SB(inode); struct z_erofs_decompress_frontend f =3D DECOMPRESS_FRONTEND_INIT(inode); @@ -1824,11 +1835,27 @@ static int z_erofs_read_folio(struct file *file, st= ruct folio *folio) =20 erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); + +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + if (vi) + folio->mapping->host =3D file_inode(file); +#endif return err; } =20 static void z_erofs_readahead(struct readahead_control *rac) { +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + struct erofs_inode *vi =3D NULL; + + if (rac->file && rac->file->private_data) { + vi =3D rac->file->private_data; + if (vi->ano_inode =3D=3D file_inode(rac->file)) + rac->mapping->host =3D &vi->vfs_inode; + else + vi =3D NULL; + } +#endif struct inode *const inode =3D rac->mapping->host; struct erofs_sb_info *const sbi =3D 
EROFS_I_SB(inode); struct z_erofs_decompress_frontend f =3D DECOMPRESS_FRONTEND_INIT(inode); @@ -1863,6 +1890,11 @@ static void z_erofs_readahead(struct readahead_contr= ol *rac) z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); + +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + if (vi) + rac->mapping->host =3D file_inode(rac->file); +#endif } =20 const struct address_space_operations z_erofs_aops =3D { --=20 2.43.5 From nobody Fri Dec 19 08:58:51 2025 Received: from out30-99.freemail.mail.aliyun.com (out30-99.freemail.mail.aliyun.com [115.124.30.99]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id DF9FD1D54D2 for ; Mon, 2 Sep 2024 11:06:49 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=115.124.30.99 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725275212; cv=none; b=QfFHqyxSYODCNPNsP1fmhCNHsDKVoOtHOeUE78R/DYaUFWHeoC02/hH7jtUmhA8Fc7n6SR97NG4Oh067lwp2dpA+2OInpiYkdJiIsg8+QxebWmNQsQNk+cbX7DnJDJC9bO5uikNnjALsGhKF8koic4+OfJzDDYeJEyfakN4fnJg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725275212; c=relaxed/simple; bh=LM1TNY8Lojkj9WlEEyFgXKiALoAeXliyUuknPJjuiSs=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=iNvhAFqw+GVPxecY5ToOUHg+XWlf6e7euCAGVfZPCgxzsFnJ+7u+Rhv40VhIlYhWaw9MFg/qhIWq7HflDK2frQcha4ESTUPjGQWlZMUBScHnRYOwxVEFhdh4VeHDK3w5jdT9DvbgiaQFuls59vWZEXHkbih4KjjpMRPteJj3CqY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.alibaba.com; spf=pass smtp.mailfrom=linux.alibaba.com; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b=i8IBfu6I; arc=none smtp.client-ip=115.124.30.99 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) 
header.from=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.alibaba.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=linux.alibaba.com header.i=@linux.alibaba.com header.b="i8IBfu6I" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.alibaba.com; s=default; t=1725275207; h=From:To:Subject:Date:Message-ID:MIME-Version; bh=8F/K6gXuWQR15VccFVhsO1O0QRvNL2x4hMbfePUdpfk=; b=i8IBfu6IE2Cf8ntTD2pBTCBnfwvEJN9R9MSAkp349WnxEetG0pjIEXrZ6dubaOfFNG8pDDGs+HI+yUJPC1dMgpV5eUyG5K7oweMtBDr/NLX/cJ7QICijRLNiE8jV2fIvgxc3zXQppbMgdX2P0sg5dkZ9iG1HtvWDq4eane+dUvM= Received: from localhost(mailfrom:hongzhen@linux.alibaba.com fp:SMTPD_---0WE8BYL5_1725275206) by smtp.aliyun-inc.com; Mon, 02 Sep 2024 19:06:47 +0800 From: Hongzhen Luo To: linux-erofs@lists.ozlabs.org Cc: lihongbo22@huawei.com, linux-kernel@vger.kernel.org, Hongzhen Luo Subject: [PATCH RFC v4 4/4] erofs: introduce .fadvise for page cache share Date: Mon, 2 Sep 2024 19:06:20 +0800 Message-ID: <20240902110620.2202586-5-hongzhen@linux.alibaba.com> X-Mailer: git-send-email 2.43.5 In-Reply-To: <20240902110620.2202586-1-hongzhen@linux.alibaba.com> References: <20240902110620.2202586-1-hongzhen@linux.alibaba.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" When using .fadvise to release a file's page cache, it frees those page caches that were first read by this file. To achieve this, an interval tree is added in the inode of that file to track the segments first read by that inode.
Signed-off-by: Hongzhen Luo --- fs/erofs/data.c | 38 +++++++++++++++++++-- fs/erofs/internal.h | 5 +++ fs/erofs/pagecache_share.c | 68 ++++++++++++++++++++++++++++++++++++++ fs/erofs/pagecache_share.h | 3 +- fs/erofs/super.c | 9 +++++ 5 files changed, 120 insertions(+), 3 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index ef27b934115f..e4fcc8a6ce6d 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -7,6 +7,7 @@ #include "internal.h" #include #include +#include "pagecache_share.h" =20 void erofs_unmap_metabuf(struct erofs_buf *buf) { @@ -349,6 +350,7 @@ static int erofs_read_folio(struct file *file, struct f= olio *folio) { #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE struct erofs_inode *vi =3D NULL; + struct interval_tree_node *seg; int ret; =20 if (file && file->private_data) { @@ -359,8 +361,22 @@ static int erofs_read_folio(struct file *file, struct = folio *folio) vi =3D NULL; } ret =3D iomap_read_folio(folio, &erofs_iomap_ops); - if (vi) + if (vi) { folio->mapping->host =3D file_inode(file); + seg =3D erofs_pcs_alloc_seg(); + if (!seg) + return -ENOMEM; + seg->start =3D folio_index(folio); + seg->last =3D seg->start + (folio_size(folio) >> PAGE_SHIFT); + if (seg->last > (vi->vfs_inode.i_size >> PAGE_SHIFT)) + seg->last =3D vi->vfs_inode.i_size >> PAGE_SHIFT; + if (seg->last >=3D seg->start) { + mutex_lock(&vi->segs_mutex); + interval_tree_insert(seg, &vi->segs); + mutex_unlock(&vi->segs_mutex); + } else + erofs_pcs_free_seg(seg); + } return ret; #else return iomap_read_folio(folio, &erofs_iomap_ops); @@ -371,6 +387,8 @@ static void erofs_readahead(struct readahead_control *r= ac) #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE struct erofs_inode *vi =3D NULL; struct file *file =3D rac->file; + struct interval_tree_node *seg; + erofs_off_t start, end; =20 if (file && file->private_data) { vi =3D file->private_data; @@ -378,10 +396,26 @@ static void erofs_readahead(struct readahead_control = *rac) rac->mapping->host =3D &vi->vfs_inode; else vi =3D NULL; + 
start =3D readahead_pos(rac); + end =3D start + readahead_length(rac); + if (end > vi->vfs_inode.i_size) + end =3D vi->vfs_inode.i_size; } iomap_readahead(rac, &erofs_iomap_ops); - if (vi) + if (vi) { rac->mapping->host =3D file_inode(file); + seg =3D erofs_pcs_alloc_seg(); + if (!seg) + return; + seg->start =3D start >> PAGE_SHIFT; + seg->last =3D end >> PAGE_SHIFT; + if (seg->last >=3D seg->start) { + mutex_lock(&vi->segs_mutex); + interval_tree_insert(seg, &vi->segs); + mutex_unlock(&vi->segs_mutex); + } else + erofs_pcs_free_seg(seg); + } #else return iomap_readahead(rac, &erofs_iomap_ops); #endif diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 358377825927..59af8768fab8 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include "erofs_fs.h" =20 /* redefine pr_fmt "erofs: " */ @@ -290,6 +292,9 @@ struct erofs_inode { }; #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE struct inode *ano_inode; + /* segments attributed by this inode */ + struct rb_root_cached segs; + struct mutex segs_mutex; #endif /* the corresponding vfs inode */ struct inode vfs_inode; diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c index 2d2a74547b67..a97024904019 100644 --- a/fs/erofs/pagecache_share.c +++ b/fs/erofs/pagecache_share.c @@ -4,6 +4,9 @@ */ #include #include +#include +#include +#include #include "pagecache_share.h" #include "internal.h" #include "xattr.h" @@ -15,6 +18,7 @@ static DEFINE_MUTEX(pseudo_mnt_lock); static refcount_t pseudo_mnt_count; static struct vfsmount *erofs_pcs_mnt; +struct kmem_cache *erofs_pcs_segsp; =20 int erofs_pcs_init_mnt(void) { @@ -25,6 +29,11 @@ int erofs_pcs_init_mnt(void) return PTR_ERR(tmp); erofs_pcs_mnt =3D tmp; refcount_set(&pseudo_mnt_count, 1); + erofs_pcs_segsp =3D kmem_cache_create("erofs_pcs_segs", + sizeof(struct interval_tree_node), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, NULL); + if (!erofs_pcs_segsp) + return -ENOMEM; } else 
refcount_add(1, &pseudo_mnt_count); mutex_unlock(&pseudo_mnt_lock); @@ -37,10 +46,22 @@ void erofs_pcs_free_mnt(void) if (refcount_dec_and_test(&pseudo_mnt_count)) { kern_unmount(erofs_pcs_mnt); erofs_pcs_mnt =3D NULL; + kmem_cache_destroy(erofs_pcs_segsp); + erofs_pcs_segsp =3D NULL; } mutex_unlock(&pseudo_mnt_lock); } =20 +struct interval_tree_node *erofs_pcs_alloc_seg(void) +{ + return kmem_cache_alloc(erofs_pcs_segsp, GFP_KERNEL); +} + +void erofs_pcs_free_seg(struct interval_tree_node *seg) +{ + kmem_cache_free(erofs_pcs_segsp, seg); +} + static int erofs_pcs_eq(struct inode *inode, void *data) { return inode->i_private && memcmp(inode->i_private, data, @@ -73,6 +94,8 @@ void erofs_pcs_fill_inode(struct inode *inode) ano_inode =3D iget5_locked(erofs_pcs_mnt->mnt_sb, fprt_hash, erofs_pcs_eq, erofs_pcs_set_fprt, fprt); vi->ano_inode =3D ano_inode; + vi->segs =3D RB_ROOT_CACHED; + mutex_init(&vi->segs_mutex); if (ano_inode->i_state & I_NEW) { if (erofs_inode_is_data_compressed(vi->datalayout)) ano_inode->i_mapping->a_ops =3D &z_erofs_aops; @@ -160,6 +183,50 @@ static int erofs_pcs_mmap(struct file *file, struct vm= _area_struct *vma) return 0; } =20 +static int erofs_pcs_fadvise(struct file *file, loff_t offset, loff_t len,= int advice) +{ + struct erofs_inode *vi =3D EROFS_I(file_inode(file)); + struct interval_tree_node *seg, *next_seg, *new_seg; + struct file *ano_file =3D file->private_data; + erofs_off_t start, end; + int err =3D 0; + u64 l, r; + + if (advice !=3D POSIX_FADV_DONTNEED) + return generic_fadvise(ano_file, offset, len, advice); + + start =3D offset >> PAGE_SHIFT; + /* len =3D 0 means EOF */ + end =3D (!len ? 
LLONG_MAX : offset + len) >> PAGE_SHIFT; + + mutex_lock(&vi->segs_mutex); + seg =3D interval_tree_iter_first(&vi->segs, start, end); + while (seg) { + next_seg =3D interval_tree_iter_next(seg, start, end); + l =3D max_t(u64, seg->start | 0ULL, start); + r =3D min_t(u64, seg->last | 0ULL, end); + if (l > r) + continue; + (void)invalidate_mapping_pages(ano_file->f_mapping, l, r); + if (seg->start < l) { + new_seg =3D erofs_pcs_alloc_seg(); + new_seg->start =3D seg->start; + new_seg->last =3D l; + interval_tree_insert(new_seg, &vi->segs); + } + if (r < seg->last) { + new_seg =3D erofs_pcs_alloc_seg(); + new_seg->start =3D r; + new_seg->last =3D seg->last; + interval_tree_insert(new_seg, &vi->segs); + } + interval_tree_remove(seg, &vi->segs); + seg =3D next_seg; + } + mutex_unlock(&vi->segs_mutex); + return err; +} + const struct file_operations erofs_pcs_file_fops =3D { .open =3D erofs_pcs_file_open, .llseek =3D generic_file_llseek, @@ -168,4 +235,5 @@ const struct file_operations erofs_pcs_file_fops =3D { .release =3D erofs_pcs_file_release, .get_unmapped_area =3D thp_get_unmapped_area, .splice_read =3D filemap_splice_read, + .fadvise =3D erofs_pcs_fadvise, }; diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h index b8111291cf79..e0aba20a6a0e 100644 --- a/fs/erofs/pagecache_share.h +++ b/fs/erofs/pagecache_share.h @@ -14,7 +14,8 @@ int erofs_pcs_init_mnt(void); void erofs_pcs_free_mnt(void); void erofs_pcs_fill_inode(struct inode *inode); +struct interval_tree_node *erofs_pcs_alloc_seg(void); +void erofs_pcs_free_seg(struct interval_tree_node *seg); =20 extern const struct vm_operations_struct generic_file_vm_ops; - #endif diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 113e305080fa..da595e608702 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -105,10 +105,19 @@ static void erofs_free_inode(struct inode *inode) struct erofs_inode *vi =3D EROFS_I(inode); =20 #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + struct interval_tree_node *seg, 
*next_seg; + if (S_ISREG(inode->i_mode) && vi->ano_inode) { iput(vi->ano_inode); vi->ano_inode =3D NULL; } + seg =3D interval_tree_iter_first(&vi->segs, 0, LLONG_MAX); + while (seg) { + next_seg =3D interval_tree_iter_next(seg, 0, LLONG_MAX); + interval_tree_remove(seg, &vi->segs); + erofs_pcs_free_seg(seg); + seg =3D next_seg; + } #endif if (inode->i_op =3D=3D &erofs_fast_symlink_iops) kfree(inode->i_link); --=20 2.43.5