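Hook the page cache sharing feature into the relevant paths: the
read_folio/readahead handlers (for both uncompressed and compressed
data), regular-file inode setup and teardown, and mount/unmount.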
Below is the memory usage for reading all files in two different minor
versions of container images:
+-------------------+------------------+-------------+---------------+
| Image | Page Cache Share | Memory (MB) | Memory |
| | | | Reduction (%) |
+-------------------+------------------+-------------+---------------+
| | No | 241 | - |
| redis +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 163 | 33% |
+-------------------+------------------+-------------+---------------+
| | No | 872 | - |
| postgres +------------------+-------------+---------------+
| 16.1 & 16.2 | Yes | 630 | 28% |
+-------------------+------------------+-------------+---------------+
| | No | 2771 | - |
| tensorflow +------------------+-------------+---------------+
| 1.11.0 & 2.11.1 | Yes | 2340 | 16% |
+-------------------+------------------+-------------+---------------+
| | No | 926 | - |
| mysql +------------------+-------------+---------------+
| 8.0.11 & 8.0.12 | Yes | 735 | 21% |
+-------------------+------------------+-------------+---------------+
| | No | 390 | - |
| nginx +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 219 | 44% |
+-------------------+------------------+-------------+---------------+
| tomcat | No | 924 | - |
| 10.1.25 & 10.1.26 +------------------+-------------+---------------+
| | Yes | 474 | 49% |
+-------------------+------------------+-------------+---------------+
Additionally, the table below shows the runtime memory usage of the
container:
+-------------------+------------------+-------------+---------------+
| Image | Page Cache Share | Memory (MB) | Memory |
| | | | Reduction (%) |
+-------------------+------------------+-------------+---------------+
| | No | 35 | - |
| redis +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 28 | 20% |
+-------------------+------------------+-------------+---------------+
| | No | 149 | - |
| postgres +------------------+-------------+---------------+
| 16.1 & 16.2 | Yes | 95 | 37% |
+-------------------+------------------+-------------+---------------+
| | No | 1028 | - |
| tensorflow +------------------+-------------+---------------+
| 1.11.0 & 2.11.1 | Yes | 930 | 10% |
+-------------------+------------------+-------------+---------------+
| | No | 155 | - |
| mysql +------------------+-------------+---------------+
| 8.0.11 & 8.0.12 | Yes | 132 | 15% |
+-------------------+------------------+-------------+---------------+
| | No | 25 | - |
| nginx +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 20 | 20% |
+-------------------+------------------+-------------+---------------+
| tomcat | No | 186 | - |
| 10.1.25 & 10.1.26 +------------------+-------------+---------------+
| | Yes | 98 | 48% |
+-------------------+------------------+-------------+---------------+
Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
---
fs/erofs/data.c | 14 +++++++--
fs/erofs/inode.c | 5 ++-
fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
fs/erofs/pagecache_share.h | 11 +++++++
fs/erofs/super.c | 7 +++++
fs/erofs/zdata.c | 9 ++++--
6 files changed, 104 insertions(+), 5 deletions(-)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0cd6b5c4df98..fb08acbeaab6 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
+#include "pagecache_share.h"
#include <linux/sched/mm.h>
#include <trace/events/erofs.h>
@@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
- return iomap_read_folio(folio, &erofs_iomap_ops);
+ int ret, pcshr;
+
+ pcshr = erofs_pcshr_read_begin(file, folio);
+ ret = iomap_read_folio(folio, &erofs_iomap_ops);
+ erofs_pcshr_read_end(file, folio, pcshr);
+ return ret;
}
static void erofs_readahead(struct readahead_control *rac)
{
- return iomap_readahead(rac, &erofs_iomap_ops);
+ int pcshr;
+
+ pcshr = erofs_pcshr_readahead_begin(rac);
+ iomap_readahead(rac, &erofs_iomap_ops);
+ erofs_pcshr_readahead_end(rac, pcshr);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index d4b89407822a..0b070f4b46b8 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
+#include "pagecache_share.h"
#include <trace/events/erofs.h>
static int erofs_fill_symlink(struct inode *inode, void *kaddr,
@@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
- if (erofs_inode_is_data_compressed(vi->datalayout))
+ if (erofs_pcshr_fill_inode(inode) == 0)
+ inode->i_fop = &erofs_pcshr_fops;
+ else if (erofs_inode_is_data_compressed(vi->datalayout))
inode->i_fop = &generic_ro_fops;
else
inode->i_fop = &erofs_file_fops;
diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
index 703fd17c002c..22172b5e21c7 100644
--- a/fs/erofs/pagecache_share.c
+++ b/fs/erofs/pagecache_share.c
@@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
struct erofs_pcshr_private {
char fprt[PCSHR_FPRT_MAXLEN];
+ struct mutex mutex;
};
static struct erofs_pcshr_counter mnt_counter = {
@@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
if (!ano_private)
return -ENOMEM;
memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
+ mutex_init(&ano_private->mutex);
inode->i_private = ano_private;
return 0;
}
@@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
};
+
+int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
+{
+ struct erofs_inode *vi;
+ struct erofs_pcshr_private *ano_private;
+
+ if (!(file && file->private_data))
+ return 0;
+
+ vi = file->private_data;
+ if (vi->ano_inode != file_inode(file))
+ return 0;
+
+ ano_private = vi->ano_inode->i_private;
+ mutex_lock(&ano_private->mutex);
+ folio->mapping->host = &vi->vfs_inode;
+ return 1;
+}
+
+void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
+{
+ struct erofs_pcshr_private *ano_private;
+
+ if (pcshr == 0)
+ return;
+
+ ano_private = file_inode(file)->i_private;
+ folio->mapping->host = file_inode(file);
+ mutex_unlock(&ano_private->mutex);
+}
+
+int erofs_pcshr_readahead_begin(struct readahead_control *rac)
+{
+ struct erofs_inode *vi;
+ struct file *file = rac->file;
+ struct erofs_pcshr_private *ano_private;
+
+ if (!(file && file->private_data))
+ return 0;
+
+ vi = file->private_data;
+ if (vi->ano_inode != file_inode(file))
+ return 0;
+
+ ano_private = file_inode(file)->i_private;
+ mutex_lock(&ano_private->mutex);
+ rac->mapping->host = &vi->vfs_inode;
+ return 1;
+}
+
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr)
+{
+ struct erofs_pcshr_private *ano_private;
+
+ if (pcshr == 0)
+ return;
+
+ ano_private = file_inode(rac->file)->i_private;
+ rac->mapping->host = file_inode(rac->file);
+ mutex_unlock(&ano_private->mutex);
+}
diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
index f3889d6889e5..abda2a60278b 100644
--- a/fs/erofs/pagecache_share.h
+++ b/fs/erofs/pagecache_share.h
@@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void);
int erofs_pcshr_fill_inode(struct inode *inode);
void erofs_pcshr_free_inode(struct inode *inode);
+/* switch between the anonymous inode and the real inode */
+int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
+void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr);
+int erofs_pcshr_readahead_begin(struct readahead_control *rac);
+void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr);
+
#else
static inline int erofs_pcshr_init_mnt(void) { return 0; }
@@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {}
static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1; }
static inline void erofs_pcshr_free_inode(struct inode *inode) {}
+static inline int erofs_pcshr_read_begin(struct file *file, struct folio *folio) { return 0; }
+static inline void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) {}
+static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac) { return 0; }
+static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) {}
+
#endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
#endif
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index b4ce07dc931c..1b690eb6c1f1 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -13,6 +13,7 @@
#include <linux/backing-dev.h>
#include <linux/pseudo_fs.h>
#include "xattr.h"
+#include "pagecache_share.h"
#define CREATE_TRACE_POINTS
#include <trace/events/erofs.h>
@@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
+ erofs_pcshr_free_inode(inode);
if (inode->i_op == &erofs_fast_symlink_iops)
kfree(inode->i_link);
kfree(vi->xattr_shared_xattrs);
@@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
+ err = erofs_pcshr_init_mnt();
+ if (err)
+ return err;
+
erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
return 0;
}
@@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb)
kill_anon_super(sb);
else
kill_block_super(sb);
+ erofs_pcshr_free_mnt();
fs_put_dax(sbi->dif0.dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
erofs_sb_free(sbi);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 19ef4ff2a134..fc2ed01eaabe 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -5,6 +5,7 @@
* Copyright (C) 2022 Alibaba Cloud
*/
#include "compress.h"
+#include "pagecache_share.h"
#include <linux/psi.h>
#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>
@@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
struct inode *const inode = folio->mapping->host;
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- int err;
+ int err, pcshr;
trace_erofs_read_folio(folio, false);
+ pcshr = erofs_pcshr_read_begin(file, folio);
f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
@@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
+ erofs_pcshr_read_end(file, folio, pcshr);
return err;
}
@@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct folio *head = NULL, *folio;
unsigned int nr_folios;
- int err;
+ int err, pcshr;
+ pcshr = erofs_pcshr_readahead_begin(rac);
f.headoffset = readahead_pos(rac);
z_erofs_pcluster_readmore(&f, rac, true);
@@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
(void)z_erofs_runqueue(&f, nr_folios);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
+ erofs_pcshr_readahead_end(rac, pcshr);
}
const struct address_space_operations z_erofs_aops = {
--
2.43.5
On 2025/1/5 23:12, Hongzhen Luo wrote:
> This modifies relevant functions to apply the page cache
> share feature.
>
> Below is the memory usage for reading all files in two different minor
> versions of container images:
>
> +-------------------+------------------+-------------+---------------+
> | Image | Page Cache Share | Memory (MB) | Memory |
> | | | | Reduction (%) |
> +-------------------+------------------+-------------+---------------+
> | | No | 241 | - |
> | redis +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 163 | 33% |
> +-------------------+------------------+-------------+---------------+
> | | No | 872 | - |
> | postgres +------------------+-------------+---------------+
> | 16.1 & 16.2 | Yes | 630 | 28% |
> +-------------------+------------------+-------------+---------------+
> | | No | 2771 | - |
> | tensorflow +------------------+-------------+---------------+
> | 1.11.0 & 2.11.1 | Yes | 2340 | 16% |
> +-------------------+------------------+-------------+---------------+
> | | No | 926 | - |
> | mysql +------------------+-------------+---------------+
> | 8.0.11 & 8.0.12 | Yes | 735 | 21% |
> +-------------------+------------------+-------------+---------------+
> | | No | 390 | - |
> | nginx +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 219 | 44% |
> +-------------------+------------------+-------------+---------------+
> | tomcat | No | 924 | - |
> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
> | | Yes | 474 | 49% |
> +-------------------+------------------+-------------+---------------+
>
> Additionally, the table below shows the runtime memory usage of the
> container:
>
> +-------------------+------------------+-------------+---------------+
> | Image | Page Cache Share | Memory (MB) | Memory |
> | | | | Reduction (%) |
> +-------------------+------------------+-------------+---------------+
> | | No | 35 | - |
> | redis +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 28 | 20% |
> +-------------------+------------------+-------------+---------------+
> | | No | 149 | - |
> | postgres +------------------+-------------+---------------+
> | 16.1 & 16.2 | Yes | 95 | 37% |
> +-------------------+------------------+-------------+---------------+
> | | No | 1028 | - |
> | tensorflow +------------------+-------------+---------------+
> | 1.11.0 & 2.11.1 | Yes | 930 | 10% |
> +-------------------+------------------+-------------+---------------+
> | | No | 155 | - |
> | mysql +------------------+-------------+---------------+
> | 8.0.11 & 8.0.12 | Yes | 132 | 15% |
> +-------------------+------------------+-------------+---------------+
> | | No | 25 | - |
> | nginx +------------------+-------------+---------------+
> | 7.2.4 & 7.2.5 | Yes | 20 | 20% |
> +-------------------+------------------+-------------+---------------+
> | tomcat | No | 186 | - |
> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
> | | Yes | 98 | 48% |
> +-------------------+------------------+-------------+---------------+
>
> Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
> ---
> fs/erofs/data.c | 14 +++++++--
> fs/erofs/inode.c | 5 ++-
> fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
> fs/erofs/pagecache_share.h | 11 +++++++
> fs/erofs/super.c | 7 +++++
> fs/erofs/zdata.c | 9 ++++--
> 6 files changed, 104 insertions(+), 5 deletions(-)
>
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 0cd6b5c4df98..fb08acbeaab6 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2021, Alibaba Cloud
> */
> #include "internal.h"
> +#include "pagecache_share.h"
> #include <linux/sched/mm.h>
> #include <trace/events/erofs.h>
>
> @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> */
> static int erofs_read_folio(struct file *file, struct folio *folio)
> {
> - return iomap_read_folio(folio, &erofs_iomap_ops);
> + int ret, pcshr;
> +
> + pcshr = erofs_pcshr_read_begin(file, folio);
> + ret = iomap_read_folio(folio, &erofs_iomap_ops);
> + erofs_pcshr_read_end(file, folio, pcshr);
> + return ret;
> }
>
> static void erofs_readahead(struct readahead_control *rac)
> {
> - return iomap_readahead(rac, &erofs_iomap_ops);
> + int pcshr;
> +
> + pcshr = erofs_pcshr_readahead_begin(rac);
> + iomap_readahead(rac, &erofs_iomap_ops);
> + erofs_pcshr_readahead_end(rac, pcshr);
> }
>
> static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
> index d4b89407822a..0b070f4b46b8 100644
> --- a/fs/erofs/inode.c
> +++ b/fs/erofs/inode.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2021, Alibaba Cloud
> */
> #include "xattr.h"
> +#include "pagecache_share.h"
> #include <trace/events/erofs.h>
>
> static int erofs_fill_symlink(struct inode *inode, void *kaddr,
> @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
> switch (inode->i_mode & S_IFMT) {
> case S_IFREG:
> inode->i_op = &erofs_generic_iops;
> - if (erofs_inode_is_data_compressed(vi->datalayout))
> + if (erofs_pcshr_fill_inode(inode) == 0)
> + inode->i_fop = &erofs_pcshr_fops;
> + else if (erofs_inode_is_data_compressed(vi->datalayout))
> inode->i_fop = &generic_ro_fops;
> else
> inode->i_fop = &erofs_file_fops;
> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
> index 703fd17c002c..22172b5e21c7 100644
> --- a/fs/erofs/pagecache_share.c
> +++ b/fs/erofs/pagecache_share.c
> @@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
>
> struct erofs_pcshr_private {
> char fprt[PCSHR_FPRT_MAXLEN];
> + struct mutex mutex;
> };
>
> static struct erofs_pcshr_counter mnt_counter = {
> @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
> if (!ano_private)
> return -ENOMEM;
> memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
> + mutex_init(&ano_private->mutex);
> inode->i_private = ano_private;
> return 0;
> }
> @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
> .get_unmapped_area = thp_get_unmapped_area,
> .splice_read = filemap_splice_read,
> };
> +
> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
> +{
> + struct erofs_inode *vi;
> + struct erofs_pcshr_private *ano_private;
> +
> + if (!(file && file->private_data))
> + return 0;
> +
> + vi = file->private_data;
> + if (vi->ano_inode != file_inode(file))
> + return 0;
> +
> + ano_private = vi->ano_inode->i_private;
> + mutex_lock(&ano_private->mutex);
Can we lock at folio granularity? The erofs_pcshr_private mutex may
limit read concurrency.
> + folio->mapping->host = &vi->vfs_inode;
> + return 1;
> +}
> +
> +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
> +{
> + struct erofs_pcshr_private *ano_private;
> +
> + if (pcshr == 0)
> + return;
> +
> + ano_private = file_inode(file)->i_private;
> + folio->mapping->host = file_inode(file);
> + mutex_unlock(&ano_private->mutex);
> +}
> +
> +int erofs_pcshr_readahead_begin(struct readahead_control *rac)
> +{
Maybe the begin/end helpers for read and readahead could share common
helpers, since they implement similar logic.
> + struct erofs_inode *vi;
> + struct file *file = rac->file;
> + struct erofs_pcshr_private *ano_private;
> +
> + if (!(file && file->private_data))
> + return 0;
> +
> + vi = file->private_data;
> + if (vi->ano_inode != file_inode(file))
> + return 0;
> +
> + ano_private = file_inode(file)->i_private;
> + mutex_lock(&ano_private->mutex);
> + rac->mapping->host = &vi->vfs_inode;
> + return 1;
> +}
> +
> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr)
> +{
> + struct erofs_pcshr_private *ano_private;
> +
> + if (pcshr == 0)
> + return;
> +
> + ano_private = file_inode(rac->file)->i_private;
> + rac->mapping->host = file_inode(rac->file);
> + mutex_unlock(&ano_private->mutex);
> +}
> diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
> index f3889d6889e5..abda2a60278b 100644
> --- a/fs/erofs/pagecache_share.h
> +++ b/fs/erofs/pagecache_share.h
> @@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void);
> int erofs_pcshr_fill_inode(struct inode *inode);
> void erofs_pcshr_free_inode(struct inode *inode);
>
> +/* switch between the anonymous inode and the real inode */
> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
> +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr);
> +int erofs_pcshr_readahead_begin(struct readahead_control *rac);
> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr);
> +
> #else
>
> static inline int erofs_pcshr_init_mnt(void) { return 0; }
> @@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {}
> static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1; }
> static inline void erofs_pcshr_free_inode(struct inode *inode) {}
>
> +static inline int erofs_pcshr_read_begin(struct file *file, struct folio *folio) { return 0; }
> +static inline void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr) {}
> +static inline int erofs_pcshr_readahead_begin(struct readahead_control *rac) { return 0; }
> +static inline void erofs_pcshr_readahead_end(struct readahead_control *rac, int pcshr) {}
> +
> #endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
>
> #endif
> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
> index b4ce07dc931c..1b690eb6c1f1 100644
> --- a/fs/erofs/super.c
> +++ b/fs/erofs/super.c
> @@ -13,6 +13,7 @@
> #include <linux/backing-dev.h>
> #include <linux/pseudo_fs.h>
> #include "xattr.h"
> +#include "pagecache_share.h"
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/erofs.h>
> @@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode)
> {
> struct erofs_inode *vi = EROFS_I(inode);
>
> + erofs_pcshr_free_inode(inode);
> if (inode->i_op == &erofs_fast_symlink_iops)
> kfree(inode->i_link);
> kfree(vi->xattr_shared_xattrs);
> @@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
> if (err)
> return err;
>
> + err = erofs_pcshr_init_mnt();
> + if (err)
> + return err;
> +
> erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
> return 0;
> }
> @@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb)
> kill_anon_super(sb);
> else
> kill_block_super(sb);
> + erofs_pcshr_free_mnt();
> fs_put_dax(sbi->dif0.dax_dev, NULL);
> erofs_fscache_unregister_fs(sb);
> erofs_sb_free(sbi);
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 19ef4ff2a134..fc2ed01eaabe 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2022 Alibaba Cloud
> */
> #include "compress.h"
> +#include "pagecache_share.h"
> #include <linux/psi.h>
> #include <linux/cpuhotplug.h>
> #include <trace/events/erofs.h>
> @@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
> {
> struct inode *const inode = folio->mapping->host;
> struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
> - int err;
> + int err, pcshr;
>
> trace_erofs_read_folio(folio, false);
> + pcshr = erofs_pcshr_read_begin(file, folio);
> f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
>
> z_erofs_pcluster_readmore(&f, NULL, true);
> @@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
>
> erofs_put_metabuf(&f.map.buf);
> erofs_release_pages(&f.pagepool);
> + erofs_pcshr_read_end(file, folio, pcshr);
> return err;
> }
>
> @@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct readahead_control *rac)
> struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
> struct folio *head = NULL, *folio;
> unsigned int nr_folios;
> - int err;
> + int err, pcshr;
>
> + pcshr = erofs_pcshr_readahead_begin(rac);
> f.headoffset = readahead_pos(rac);
>
> z_erofs_pcluster_readmore(&f, rac, true);
> @@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
> (void)z_erofs_runqueue(&f, nr_folios);
> erofs_put_metabuf(&f.map.buf);
> erofs_release_pages(&f.pagepool);
> + erofs_pcshr_readahead_end(rac, pcshr);
> }
>
> const struct address_space_operations z_erofs_aops = {
On 2025/1/21 19:59, Hongbo Li wrote:
>
>
> On 2025/1/5 23:12, Hongzhen Luo wrote:
>> This modifies relevant functions to apply the page cache
>> share feature.
>>
>> Below is the memory usage for reading all files in two different minor
>> versions of container images:
>>
>> +-------------------+------------------+-------------+---------------+
>> | Image | Page Cache Share | Memory (MB) | Memory |
>> | | | | Reduction (%) |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 241 | - |
>> | redis +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 163 | 33% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 872 | - |
>> | postgres +------------------+-------------+---------------+
>> | 16.1 & 16.2 | Yes | 630 | 28% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 2771 | - |
>> | tensorflow +------------------+-------------+---------------+
>> | 1.11.0 & 2.11.1 | Yes | 2340 | 16% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 926 | - |
>> | mysql +------------------+-------------+---------------+
>> | 8.0.11 & 8.0.12 | Yes | 735 | 21% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 390 | - |
>> | nginx +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 219 | 44% |
>> +-------------------+------------------+-------------+---------------+
>> | tomcat | No | 924 | - |
>> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
>> | | Yes | 474 | 49% |
>> +-------------------+------------------+-------------+---------------+
>>
>> Additionally, the table below shows the runtime memory usage of the
>> container:
>>
>> +-------------------+------------------+-------------+---------------+
>> | Image | Page Cache Share | Memory (MB) | Memory |
>> | | | | Reduction (%) |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 35 | - |
>> | redis +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 28 | 20% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 149 | - |
>> | postgres +------------------+-------------+---------------+
>> | 16.1 & 16.2 | Yes | 95 | 37% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 1028 | - |
>> | tensorflow +------------------+-------------+---------------+
>> | 1.11.0 & 2.11.1 | Yes | 930 | 10% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 155 | - |
>> | mysql +------------------+-------------+---------------+
>> | 8.0.11 & 8.0.12 | Yes | 132 | 15% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 25 | - |
>> | nginx +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 20 | 20% |
>> +-------------------+------------------+-------------+---------------+
>> | tomcat | No | 186 | - |
>> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
>> | | Yes | 98 | 48% |
>> +-------------------+------------------+-------------+---------------+
>>
>> Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
>> ---
>> fs/erofs/data.c | 14 +++++++--
>> fs/erofs/inode.c | 5 ++-
>> fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
>> fs/erofs/pagecache_share.h | 11 +++++++
>> fs/erofs/super.c | 7 +++++
>> fs/erofs/zdata.c | 9 ++++--
>> 6 files changed, 104 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
>> index 0cd6b5c4df98..fb08acbeaab6 100644
>> --- a/fs/erofs/data.c
>> +++ b/fs/erofs/data.c
>> @@ -5,6 +5,7 @@
>> * Copyright (C) 2021, Alibaba Cloud
>> */
>> #include "internal.h"
>> +#include "pagecache_share.h"
>> #include <linux/sched/mm.h>
>> #include <trace/events/erofs.h>
>> @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct
>> fiemap_extent_info *fieinfo,
>> */
>> static int erofs_read_folio(struct file *file, struct folio *folio)
>> {
>> - return iomap_read_folio(folio, &erofs_iomap_ops);
>> + int ret, pcshr;
>> +
>> + pcshr = erofs_pcshr_read_begin(file, folio);
>> + ret = iomap_read_folio(folio, &erofs_iomap_ops);
>> + erofs_pcshr_read_end(file, folio, pcshr);
>> + return ret;
>> }
>> static void erofs_readahead(struct readahead_control *rac)
>> {
>> - return iomap_readahead(rac, &erofs_iomap_ops);
>> + int pcshr;
>> +
>> + pcshr = erofs_pcshr_readahead_begin(rac);
>> + iomap_readahead(rac, &erofs_iomap_ops);
>> + erofs_pcshr_readahead_end(rac, pcshr);
>> }
>> static sector_t erofs_bmap(struct address_space *mapping,
>> sector_t block)
>> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
>> index d4b89407822a..0b070f4b46b8 100644
>> --- a/fs/erofs/inode.c
>> +++ b/fs/erofs/inode.c
>> @@ -5,6 +5,7 @@
>> * Copyright (C) 2021, Alibaba Cloud
>> */
>> #include "xattr.h"
>> +#include "pagecache_share.h"
>> #include <trace/events/erofs.h>
>> static int erofs_fill_symlink(struct inode *inode, void *kaddr,
>> @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
>> switch (inode->i_mode & S_IFMT) {
>> case S_IFREG:
>> inode->i_op = &erofs_generic_iops;
>> - if (erofs_inode_is_data_compressed(vi->datalayout))
>> + if (erofs_pcshr_fill_inode(inode) == 0)
>> + inode->i_fop = &erofs_pcshr_fops;
>> + else if (erofs_inode_is_data_compressed(vi->datalayout))
>> inode->i_fop = &generic_ro_fops;
>> else
>> inode->i_fop = &erofs_file_fops;
>> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
>> index 703fd17c002c..22172b5e21c7 100644
>> --- a/fs/erofs/pagecache_share.c
>> +++ b/fs/erofs/pagecache_share.c
>> @@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
>> struct erofs_pcshr_private {
>> char fprt[PCSHR_FPRT_MAXLEN];
>> + struct mutex mutex;
>> };
>> static struct erofs_pcshr_counter mnt_counter = {
>> @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void
>> *data)
>> if (!ano_private)
>> return -ENOMEM;
>> memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
>> + mutex_init(&ano_private->mutex);
>> inode->i_private = ano_private;
>> return 0;
>> }
>> @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
>> .get_unmapped_area = thp_get_unmapped_area,
>> .splice_read = filemap_splice_read,
>> };
>> +
>> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
>> +{
>> + struct erofs_inode *vi;
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + if (!(file && file->private_data))
>> + return 0;
>> +
>> + vi = file->private_data;
>> + if (vi->ano_inode != file_inode(file))
>> + return 0;
>> +
>> + ano_private = vi->ano_inode->i_private;
>> + mutex_lock(&ano_private->mutex);
> Can we lock in folio granularity? The erofs_pcshr_private mutex may
> limit the concurrent in reading.
I’m sorry for the delay in responding; I just saw this message. I will
send an improved version of the patch soon. Thanks for this suggestion.
>> + folio->mapping->host = &vi->vfs_inode;
>> + return 1;
>> +}
>> +
>> +void erofs_pcshr_read_end(struct file *file, struct folio *folio,
>> int pcshr)
>> +{
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + if (pcshr == 0)
>> + return;
>> +
>> + ano_private = file_inode(file)->i_private;
>> + folio->mapping->host = file_inode(file);
>> + mutex_unlock(&ano_private->mutex);
>> +}
>> +
>> +int erofs_pcshr_readahead_begin(struct readahead_control *rac)
>> +{
> May be the begin/end helpers for read and readahead can be used with
> the same helpers. They did the similar logic.
Okay, indeed! I will send an improved version later.
Best wishes,
Hongzhen Luo
>> + struct erofs_inode *vi;
>> + struct file *file = rac->file;
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + if (!(file && file->private_data))
>> + return 0;
>> +
>> + vi = file->private_data;
>> + if (vi->ano_inode != file_inode(file))
>> + return 0;
>> +
>> + ano_private = file_inode(file)->i_private;
>> + mutex_lock(&ano_private->mutex);
>> + rac->mapping->host = &vi->vfs_inode;
>> + return 1;
>> +}
>> +
>> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int
>> pcshr)
>> +{
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + if (pcshr == 0)
>> + return;
>> +
>> + ano_private = file_inode(rac->file)->i_private;
>> + rac->mapping->host = file_inode(rac->file);
>> + mutex_unlock(&ano_private->mutex);
>> +}
>> diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
>> index f3889d6889e5..abda2a60278b 100644
>> --- a/fs/erofs/pagecache_share.h
>> +++ b/fs/erofs/pagecache_share.h
>> @@ -14,6 +14,12 @@ void erofs_pcshr_free_mnt(void);
>> int erofs_pcshr_fill_inode(struct inode *inode);
>> void erofs_pcshr_free_inode(struct inode *inode);
>> +/* switch between the anonymous inode and the real inode */
>> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio);
>> +void erofs_pcshr_read_end(struct file *file, struct folio *folio,
>> int pcshr);
>> +int erofs_pcshr_readahead_begin(struct readahead_control *rac);
>> +void erofs_pcshr_readahead_end(struct readahead_control *rac, int
>> pcshr);
>> +
>> #else
>> static inline int erofs_pcshr_init_mnt(void) { return 0; }
>> @@ -21,6 +27,11 @@ static inline void erofs_pcshr_free_mnt(void) {}
>> static inline int erofs_pcshr_fill_inode(struct inode *inode) {
>> return -1; }
>> static inline void erofs_pcshr_free_inode(struct inode *inode) {}
>> +static inline int erofs_pcshr_read_begin(struct file *file, struct
>> folio *folio) { return 0; }
>> +static inline void erofs_pcshr_read_end(struct file *file, struct
>> folio *folio, int pcshr) {}
>> +static inline int erofs_pcshr_readahead_begin(struct
>> readahead_control *rac) { return 0; }
>> +static inline void erofs_pcshr_readahead_end(struct
>> readahead_control *rac, int pcshr) {}
>> +
>> #endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
>> #endif
>> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
>> index b4ce07dc931c..1b690eb6c1f1 100644
>> --- a/fs/erofs/super.c
>> +++ b/fs/erofs/super.c
>> @@ -13,6 +13,7 @@
>> #include <linux/backing-dev.h>
>> #include <linux/pseudo_fs.h>
>> #include "xattr.h"
>> +#include "pagecache_share.h"
>> #define CREATE_TRACE_POINTS
>> #include <trace/events/erofs.h>
>> @@ -81,6 +82,7 @@ static void erofs_free_inode(struct inode *inode)
>> {
>> struct erofs_inode *vi = EROFS_I(inode);
>> + erofs_pcshr_free_inode(inode);
>> if (inode->i_op == &erofs_fast_symlink_iops)
>> kfree(inode->i_link);
>> kfree(vi->xattr_shared_xattrs);
>> @@ -683,6 +685,10 @@ static int erofs_fc_fill_super(struct
>> super_block *sb, struct fs_context *fc)
>> if (err)
>> return err;
>> + err = erofs_pcshr_init_mnt();
>> + if (err)
>> + return err;
>> +
>> erofs_info(sb, "mounted with root inode @ nid %llu.",
>> sbi->root_nid);
>> return 0;
>> }
>> @@ -818,6 +824,7 @@ static void erofs_kill_sb(struct super_block *sb)
>> kill_anon_super(sb);
>> else
>> kill_block_super(sb);
>> + erofs_pcshr_free_mnt();
>> fs_put_dax(sbi->dif0.dax_dev, NULL);
>> erofs_fscache_unregister_fs(sb);
>> erofs_sb_free(sbi);
>> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
>> index 19ef4ff2a134..fc2ed01eaabe 100644
>> --- a/fs/erofs/zdata.c
>> +++ b/fs/erofs/zdata.c
>> @@ -5,6 +5,7 @@
>> * Copyright (C) 2022 Alibaba Cloud
>> */
>> #include "compress.h"
>> +#include "pagecache_share.h"
>> #include <linux/psi.h>
>> #include <linux/cpuhotplug.h>
>> #include <trace/events/erofs.h>
>> @@ -1891,9 +1892,10 @@ static int z_erofs_read_folio(struct file
>> *file, struct folio *folio)
>> {
>> struct inode *const inode = folio->mapping->host;
>> struct z_erofs_decompress_frontend f =
>> DECOMPRESS_FRONTEND_INIT(inode);
>> - int err;
>> + int err, pcshr;
>> trace_erofs_read_folio(folio, false);
>> + pcshr = erofs_pcshr_read_begin(file, folio);
>> f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
>> z_erofs_pcluster_readmore(&f, NULL, true);
>> @@ -1909,6 +1911,7 @@ static int z_erofs_read_folio(struct file
>> *file, struct folio *folio)
>> erofs_put_metabuf(&f.map.buf);
>> erofs_release_pages(&f.pagepool);
>> + erofs_pcshr_read_end(file, folio, pcshr);
>> return err;
>> }
>> @@ -1918,8 +1921,9 @@ static void z_erofs_readahead(struct
>> readahead_control *rac)
>> struct z_erofs_decompress_frontend f =
>> DECOMPRESS_FRONTEND_INIT(inode);
>> struct folio *head = NULL, *folio;
>> unsigned int nr_folios;
>> - int err;
>> + int err, pcshr;
>> + pcshr = erofs_pcshr_readahead_begin(rac);
>> f.headoffset = readahead_pos(rac);
>> z_erofs_pcluster_readmore(&f, rac, true);
>> @@ -1947,6 +1951,7 @@ static void z_erofs_readahead(struct
>> readahead_control *rac)
>> (void)z_erofs_runqueue(&f, nr_folios);
>> erofs_put_metabuf(&f.map.buf);
>> erofs_release_pages(&f.pagepool);
>> + erofs_pcshr_readahead_end(rac, pcshr);
>> }
>> const struct address_space_operations z_erofs_aops = {
On 2025/1/21 19:59, Hongbo Li via Linux-erofs wrote:
>
>
> On 2025/1/5 23:12, Hongzhen Luo wrote:
>> This modifies relevant functions to apply the page cache
>> share feature.
>>
>> Below is the memory usage for reading all files in two different minor
>> versions of container images:
>>
>> +-------------------+------------------+-------------+---------------+
>> | Image | Page Cache Share | Memory (MB) | Memory |
>> | | | | Reduction (%) |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 241 | - |
>> | redis +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 163 | 33% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 872 | - |
>> | postgres +------------------+-------------+---------------+
>> | 16.1 & 16.2 | Yes | 630 | 28% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 2771 | - |
>> | tensorflow +------------------+-------------+---------------+
>> | 1.11.0 & 2.11.1 | Yes | 2340 | 16% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 926 | - |
>> | mysql +------------------+-------------+---------------+
>> | 8.0.11 & 8.0.12 | Yes | 735 | 21% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 390 | - |
>> | nginx +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 219 | 44% |
>> +-------------------+------------------+-------------+---------------+
>> | tomcat | No | 924 | - |
>> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
>> | | Yes | 474 | 49% |
>> +-------------------+------------------+-------------+---------------+
>>
>> Additionally, the table below shows the runtime memory usage of the
>> container:
>>
>> +-------------------+------------------+-------------+---------------+
>> | Image | Page Cache Share | Memory (MB) | Memory |
>> | | | | Reduction (%) |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 35 | - |
>> | redis +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 28 | 20% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 149 | - |
>> | postgres +------------------+-------------+---------------+
>> | 16.1 & 16.2 | Yes | 95 | 37% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 1028 | - |
>> | tensorflow +------------------+-------------+---------------+
>> | 1.11.0 & 2.11.1 | Yes | 930 | 10% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 155 | - |
>> | mysql +------------------+-------------+---------------+
>> | 8.0.11 & 8.0.12 | Yes | 132 | 15% |
>> +-------------------+------------------+-------------+---------------+
>> | | No | 25 | - |
>> | nginx +------------------+-------------+---------------+
>> | 7.2.4 & 7.2.5 | Yes | 20 | 20% |
>> +-------------------+------------------+-------------+---------------+
>> | tomcat | No | 186 | - |
>> | 10.1.25 & 10.1.26 +------------------+-------------+---------------+
>> | | Yes | 98 | 48% |
>> +-------------------+------------------+-------------+---------------+
>>
>> Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com>
>> ---
>> fs/erofs/data.c | 14 +++++++--
>> fs/erofs/inode.c | 5 ++-
>> fs/erofs/pagecache_share.c | 63 ++++++++++++++++++++++++++++++++++++++
>> fs/erofs/pagecache_share.h | 11 +++++++
>> fs/erofs/super.c | 7 +++++
>> fs/erofs/zdata.c | 9 ++++--
>> 6 files changed, 104 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
>> index 0cd6b5c4df98..fb08acbeaab6 100644
>> --- a/fs/erofs/data.c
>> +++ b/fs/erofs/data.c
>> @@ -5,6 +5,7 @@
>> * Copyright (C) 2021, Alibaba Cloud
>> */
>> #include "internal.h"
>> +#include "pagecache_share.h"
>> #include <linux/sched/mm.h>
>> #include <trace/events/erofs.h>
>> @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>> */
>> static int erofs_read_folio(struct file *file, struct folio *folio)
>> {
>> - return iomap_read_folio(folio, &erofs_iomap_ops);
>> + int ret, pcshr;
>> +
>> + pcshr = erofs_pcshr_read_begin(file, folio);
>> + ret = iomap_read_folio(folio, &erofs_iomap_ops);
>> + erofs_pcshr_read_end(file, folio, pcshr);
>> + return ret;
>> }
>> static void erofs_readahead(struct readahead_control *rac)
>> {
>> - return iomap_readahead(rac, &erofs_iomap_ops);
>> + int pcshr;
>> +
>> + pcshr = erofs_pcshr_readahead_begin(rac);
>> + iomap_readahead(rac, &erofs_iomap_ops);
>> + erofs_pcshr_readahead_end(rac, pcshr);
>> }
>> static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
>> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
>> index d4b89407822a..0b070f4b46b8 100644
>> --- a/fs/erofs/inode.c
>> +++ b/fs/erofs/inode.c
>> @@ -5,6 +5,7 @@
>> * Copyright (C) 2021, Alibaba Cloud
>> */
>> #include "xattr.h"
>> +#include "pagecache_share.h"
>> #include <trace/events/erofs.h>
>> static int erofs_fill_symlink(struct inode *inode, void *kaddr,
>> @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
>> switch (inode->i_mode & S_IFMT) {
>> case S_IFREG:
>> inode->i_op = &erofs_generic_iops;
>> - if (erofs_inode_is_data_compressed(vi->datalayout))
>> + if (erofs_pcshr_fill_inode(inode) == 0)
>> + inode->i_fop = &erofs_pcshr_fops;
>> + else if (erofs_inode_is_data_compressed(vi->datalayout))
>> inode->i_fop = &generic_ro_fops;
>> else
>> inode->i_fop = &erofs_file_fops;
>> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
>> index 703fd17c002c..22172b5e21c7 100644
>> --- a/fs/erofs/pagecache_share.c
>> +++ b/fs/erofs/pagecache_share.c
>> @@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
>> struct erofs_pcshr_private {
>> char fprt[PCSHR_FPRT_MAXLEN];
>> + struct mutex mutex;
>> };
>> static struct erofs_pcshr_counter mnt_counter = {
>> @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
>> if (!ano_private)
>> return -ENOMEM;
>> memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
>> + mutex_init(&ano_private->mutex);
>> inode->i_private = ano_private;
>> return 0;
>> }
>> @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
>> .get_unmapped_area = thp_get_unmapped_area,
>> .splice_read = filemap_splice_read,
>> };
>> +
>> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
>> +{
>> + struct erofs_inode *vi;
>> + struct erofs_pcshr_private *ano_private;
>> +
>> + if (!(file && file->private_data))
>> + return 0;
>> +
>> + vi = file->private_data;
>> + if (vi->ano_inode != file_inode(file))
>> + return 0;
>> +
>> + ano_private = vi->ano_inode->i_private;
>> + mutex_lock(&ano_private->mutex);
> Can we lock at folio granularity? The erofs_pcshr_private mutex may limit read concurrency.
I've asked Hongzhen to prepare a new, more reasonable version.
In that version there shouldn't be such a mutex locking the
whole submit process; it should just keep all inodes stable.
Please just ignore this whole series.
Thanks,
Gao Xiang
On 2025/1/5 23:12, Hongzhen Luo wrote:
...
>
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 0cd6b5c4df98..fb08acbeaab6 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2021, Alibaba Cloud
> */
> #include "internal.h"
> +#include "pagecache_share.h"
> #include <linux/sched/mm.h>
> #include <trace/events/erofs.h>
>
> @@ -370,12 +371,21 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> */
> static int erofs_read_folio(struct file *file, struct folio *folio)
> {
> - return iomap_read_folio(folio, &erofs_iomap_ops);
> + int ret, pcshr;
> +
> + pcshr = erofs_pcshr_read_begin(file, folio);
> + ret = iomap_read_folio(folio, &erofs_iomap_ops);
> + erofs_pcshr_read_end(file, folio, pcshr);
> + return ret;
> }
>
> static void erofs_readahead(struct readahead_control *rac)
> {
> - return iomap_readahead(rac, &erofs_iomap_ops);
> + int pcshr;
> +
> + pcshr = erofs_pcshr_readahead_begin(rac);
> + iomap_readahead(rac, &erofs_iomap_ops);
> + erofs_pcshr_readahead_end(rac, pcshr);
> }
>
> static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
> index d4b89407822a..0b070f4b46b8 100644
> --- a/fs/erofs/inode.c
> +++ b/fs/erofs/inode.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2021, Alibaba Cloud
> */
> #include "xattr.h"
> +#include "pagecache_share.h"
> #include <trace/events/erofs.h>
>
> static int erofs_fill_symlink(struct inode *inode, void *kaddr,
> @@ -212,7 +213,9 @@ static int erofs_fill_inode(struct inode *inode)
> switch (inode->i_mode & S_IFMT) {
> case S_IFREG:
> inode->i_op = &erofs_generic_iops;
> - if (erofs_inode_is_data_compressed(vi->datalayout))
> + if (erofs_pcshr_fill_inode(inode) == 0)
> + inode->i_fop = &erofs_pcshr_fops;
> + else if (erofs_inode_is_data_compressed(vi->datalayout))
> inode->i_fop = &generic_ro_fops;
> else
> inode->i_fop = &erofs_file_fops;
> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
> index 703fd17c002c..22172b5e21c7 100644
> --- a/fs/erofs/pagecache_share.c
> +++ b/fs/erofs/pagecache_share.c
> @@ -22,6 +22,7 @@ struct erofs_pcshr_counter {
>
> struct erofs_pcshr_private {
> char fprt[PCSHR_FPRT_MAXLEN];
> + struct mutex mutex;
> };
>
> static struct erofs_pcshr_counter mnt_counter = {
> @@ -84,6 +85,7 @@ static int erofs_fprt_set(struct inode *inode, void *data)
> if (!ano_private)
> return -ENOMEM;
> memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
> + mutex_init(&ano_private->mutex);
> inode->i_private = ano_private;
> return 0;
> }
> @@ -226,3 +228,64 @@ const struct file_operations erofs_pcshr_fops = {
> .get_unmapped_area = thp_get_unmapped_area,
> .splice_read = filemap_splice_read,
> };
> +
> +int erofs_pcshr_read_begin(struct file *file, struct folio *folio)
> +{
> + struct erofs_inode *vi;
> + struct erofs_pcshr_private *ano_private;
> +
> + if (!(file && file->private_data))
> + return 0;
> +
> + vi = file->private_data;
> + if (vi->ano_inode != file_inode(file))
> + return 0;
> +
> + ano_private = vi->ano_inode->i_private;
> + mutex_lock(&ano_private->mutex);
> + folio->mapping->host = &vi->vfs_inode;
You shouldn't change `folio->mapping->host` directly.
> + return 1;
> +}
> +
> +void erofs_pcshr_read_end(struct file *file, struct folio *folio, int pcshr)
> +{
> + struct erofs_pcshr_private *ano_private;
> +
> + if (pcshr == 0)
> + return;
> +
> + ano_private = file_inode(file)->i_private;
> + folio->mapping->host = file_inode(file);
You shouldn't change `folio->mapping->host` directly
and then switch it back. It's too hacky.
> + mutex_unlock(&ano_private->mutex);
> +}
> +
> +int erofs_pcshr_readahead_begin(struct readahead_control *rac)
> +{
> + struct erofs_inode *vi;
> + struct file *file = rac->file;
> + struct erofs_pcshr_private *ano_private;
> +
> + if (!(file && file->private_data))
> + return 0;
> +
> + vi = file->private_data;
> + if (vi->ano_inode != file_inode(file))
> + return 0;
> +
> + ano_private = file_inode(file)->i_private;
> + mutex_lock(&ano_private->mutex);
> + rac->mapping->host = &vi->vfs_inode;
Same here.
Thanks,
Gao Xiang
© 2016 - 2026 Red Hat, Inc.