[PATCH] erofs: free pcluster right after decompression if possible
Posted by Chunhai Guo 1 month, 4 weeks ago
Once a pcluster is fully decompressed and there are no attached cached
pages, its corresponding struct z_erofs_pcluster will be freed. This
will significantly reduce the frequency of calls to erofs_shrink_scan()
and the memory allocated for struct z_erofs_pcluster.

The tables below show approximately a 95% reduction in the calls to
erofs_shrink_scan() and in the memory allocated for struct
z_erofs_pcluster after applying this patch. The results were obtained by
performing a test to copy a 2.1 GB partition on ARM64 Android devices
running the 5.15 kernel with an 8-core CPU and 8GB of memory.

1. The reduction in calls to erofs_shrink_scan():
+-----------------+-----------+----------+---------+
|                 | w/o patch | w/ patch |  diff   |
+-----------------+-----------+----------+---------+
| Average (times) |   3152    |   160    | -94.92% |
+-----------------+-----------+----------+---------+

2. The reduction in memory released by erofs_shrink_scan():
+-----------------+-----------+----------+---------+
|                 | w/o patch | w/ patch |  diff   |
+-----------------+-----------+----------+---------+
| Average (bytes) | 44503200  | 2293760  | -94.84% |
+-----------------+-----------+----------+---------+

Signed-off-by: Chunhai Guo <guochunhai@vivo.com>
---
 fs/erofs/internal.h |  3 ++-
 fs/erofs/zdata.c    | 14 ++++++++---
 fs/erofs/zutil.c    | 58 +++++++++++++++++++++++++++++----------------
 3 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4efd578d7c62..17b04bfd743f 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -456,7 +456,8 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
 void erofs_release_pages(struct page **pagepool);
 
 #ifdef CONFIG_EROFS_FS_ZIP
-void erofs_workgroup_put(struct erofs_workgroup *grp);
+void erofs_workgroup_put(struct erofs_sb_info *sbi, struct erofs_workgroup *grp,
+		bool try_free);
 struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
 					     pgoff_t index);
 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 8936790618c6..656fd65aec33 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -888,7 +888,7 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
 	 * any longer if the pcluster isn't hosted by ourselves.
 	 */
 	if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
-		erofs_workgroup_put(&pcl->obj);
+		erofs_workgroup_put(EROFS_I_SB(fe->inode), &pcl->obj, false);
 
 	fe->pcl = NULL;
 }
@@ -1046,6 +1046,9 @@ struct z_erofs_decompress_backend {
 	struct list_head decompressed_secondary_bvecs;
 	struct page **pagepool;
 	unsigned int onstack_used, nr_pages;
+
+	/* whether the pcluster can be released after its decompression */
+	bool try_free;
 };
 
 struct z_erofs_bvec_item {
@@ -1244,12 +1247,15 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 		WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
 		put_page(page);
 	} else {
+		be->try_free = true;
 		/* managed folios are still left in compressed_bvecs[] */
 		for (i = 0; i < pclusterpages; ++i) {
 			page = be->compressed_pages[i];
 			if (!page ||
-			    erofs_folio_is_managed(sbi, page_folio(page)))
+			    erofs_folio_is_managed(sbi, page_folio(page))) {
+				be->try_free = false;
 				continue;
+			}
 			(void)z_erofs_put_shortlivedpage(be->pagepool, page);
 			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
 		}
@@ -1285,6 +1291,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 	if (be->decompressed_pages != be->onstack_pages)
 		kvfree(be->decompressed_pages);
 
+	be->try_free = be->try_free && !pcl->partial;
 	pcl->length = 0;
 	pcl->partial = true;
 	pcl->multibases = false;
@@ -1320,7 +1327,8 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
 		if (z_erofs_is_inline_pcluster(be.pcl))
 			z_erofs_free_pcluster(be.pcl);
 		else
-			erofs_workgroup_put(&be.pcl->obj);
+			erofs_workgroup_put(EROFS_SB(io->sb), &be.pcl->obj,
+					be.try_free);
 	}
 	return err;
 }
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index 37afe2024840..cf59ba6a2322 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -285,26 +285,11 @@ static void  __erofs_workgroup_free(struct erofs_workgroup *grp)
 	erofs_workgroup_free_rcu(grp);
 }
 
-void erofs_workgroup_put(struct erofs_workgroup *grp)
-{
-	if (lockref_put_or_lock(&grp->lockref))
-		return;
-
-	DBG_BUGON(__lockref_is_dead(&grp->lockref));
-	if (grp->lockref.count == 1)
-		atomic_long_inc(&erofs_global_shrink_cnt);
-	--grp->lockref.count;
-	spin_unlock(&grp->lockref.lock);
-}
-
-static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+static bool erofs_prepare_to_release_workgroup(struct erofs_sb_info *sbi,
 					   struct erofs_workgroup *grp)
 {
-	int free = false;
-
-	spin_lock(&grp->lockref.lock);
 	if (grp->lockref.count)
-		goto out;
+		return false;
 
 	/*
 	 * Note that all cached pages should be detached before deleted from
@@ -312,7 +297,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
 	 * the orphan old workgroup when the new one is available in the tree.
 	 */
 	if (erofs_try_to_free_all_cached_folios(sbi, grp))
-		goto out;
+		return false;
 
 	/*
 	 * It's impossible to fail after the workgroup is freezed,
@@ -322,14 +307,47 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
 	DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
 
 	lockref_mark_dead(&grp->lockref);
-	free = true;
-out:
+	return true;
+}
+
+static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+					   struct erofs_workgroup *grp)
+{
+	bool free = false;
+
+	/* Using trylock to avoid deadlock with erofs_workgroup_put() */
+	if (!spin_trylock(&grp->lockref.lock))
+		return free;
+	free = erofs_prepare_to_release_workgroup(sbi, grp);
 	spin_unlock(&grp->lockref.lock);
 	if (free)
 		__erofs_workgroup_free(grp);
 	return free;
 }
 
+void erofs_workgroup_put(struct erofs_sb_info *sbi, struct erofs_workgroup *grp,
+		bool try_free)
+{
+	bool free = false;
+
+	if (lockref_put_or_lock(&grp->lockref))
+		return;
+
+	DBG_BUGON(__lockref_is_dead(&grp->lockref));
+	if (--grp->lockref.count == 0) {
+		atomic_long_inc(&erofs_global_shrink_cnt);
+
+		if (try_free) {
+			xa_lock(&sbi->managed_pslots);
+			free = erofs_prepare_to_release_workgroup(sbi, grp);
+			xa_unlock(&sbi->managed_pslots);
+		}
+	}
+	spin_unlock(&grp->lockref.lock);
+	if (free)
+		__erofs_workgroup_free(grp);
+}
+
 static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 					      unsigned long nr_shrink)
 {
-- 
2.25.1
Re: [PATCH] erofs: free pcluster right after decompression if possible
Posted by Gao Xiang 1 month, 1 week ago
Hi Chunhai,

Thanks for the work!  Please rebase it on my
`sunset z_erofs_workgroup` series.

On 2024/9/30 22:04, Chunhai Guo wrote:
> Once a pcluster is fully decompressed and there are no attached cached
> pages, its corresponding struct z_erofs_pcluster will be freed. This

Subject: free pclusters if no cached folio attached

cached folios, its corresponding `struct z_erofs_pcluster`...

> will significantly reduce the frequency of calls to erofs_shrink_scan()
> and the memory allocated for struct z_erofs_pcluster.
> 
> The tables below show approximately a 95% reduction in the calls to
> erofs_shrink_scan() and in the memory allocated for struct
					for `struct z_erofs_pcluster`

> z_erofs_pcluster after applying this patch. The results were obtained by
> performing a test to copy a 2.1 GB partition on ARM64 Android devices
> running the 5.15 kernel with an 8-core CPU and 8GB of memory.

I guess you could try to use more recent kernels for testing instead?

> 
> 1. The reduction in calls to erofs_shrink_scan():
> +-----------------+-----------+----------+---------+
> |                 | w/o patch | w/ patch |  diff   |
> +-----------------+-----------+----------+---------+
> | Average (times) |   3152    |   160    | -94.92% |
> +-----------------+-----------+----------+---------+
> 
> 2. The reduction in memory released by erofs_shrink_scan():
> +-----------------+-----------+----------+---------+
> |                 | w/o patch | w/ patch |  diff   |
> +-----------------+-----------+----------+---------+
> | Average (bytes) | 44503200  | 2293760  | -94.84% |
> +-----------------+-----------+----------+---------+
> 
> Signed-off-by: Chunhai Guo <guochunhai@vivo.com>
> ---
>   fs/erofs/internal.h |  3 ++-
>   fs/erofs/zdata.c    | 14 ++++++++---
>   fs/erofs/zutil.c    | 58 +++++++++++++++++++++++++++++----------------
>   3 files changed, 51 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
> index 4efd578d7c62..17b04bfd743f 100644
> --- a/fs/erofs/internal.h
> +++ b/fs/erofs/internal.h
> @@ -456,7 +456,8 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
>   void erofs_release_pages(struct page **pagepool);
>   
>   #ifdef CONFIG_EROFS_FS_ZIP
> -void erofs_workgroup_put(struct erofs_workgroup *grp);
> +void erofs_workgroup_put(struct erofs_sb_info *sbi, struct erofs_workgroup *grp,
> +		bool try_free);
>   struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
>   					     pgoff_t index);
>   struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 8936790618c6..656fd65aec33 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -888,7 +888,7 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
>   	 * any longer if the pcluster isn't hosted by ourselves.
>   	 */
>   	if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
> -		erofs_workgroup_put(&pcl->obj);
> +		erofs_workgroup_put(EROFS_I_SB(fe->inode), &pcl->obj, false);
>   
>   	fe->pcl = NULL;
>   }
> @@ -1046,6 +1046,9 @@ struct z_erofs_decompress_backend {
>   	struct list_head decompressed_secondary_bvecs;
>   	struct page **pagepool;
>   	unsigned int onstack_used, nr_pages;
> +
> +	/* whether the pcluster can be released after its decompression */
> +	bool try_free;
>   };
>   
>   struct z_erofs_bvec_item {
> @@ -1244,12 +1247,15 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
>   		WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
>   		put_page(page);
>   	} else {
> +		be->try_free = true;
>   		/* managed folios are still left in compressed_bvecs[] */
>   		for (i = 0; i < pclusterpages; ++i) {
>   			page = be->compressed_pages[i];
>   			if (!page ||
> -			    erofs_folio_is_managed(sbi, page_folio(page)))
> +			    erofs_folio_is_managed(sbi, page_folio(page))) {
> +				be->try_free = false;
>   				continue;
> +			}
>   			(void)z_erofs_put_shortlivedpage(be->pagepool, page);
>   			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
>   		}
> @@ -1285,6 +1291,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
>   	if (be->decompressed_pages != be->onstack_pages)
>   		kvfree(be->decompressed_pages);
>   
> +	be->try_free = be->try_free && !pcl->partial;

I think no need to check `pcl->partial`.

>   	pcl->length = 0;
>   	pcl->partial = true;
>   	pcl->multibases = false;
> @@ -1320,7 +1327,8 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
>   		if (z_erofs_is_inline_pcluster(be.pcl))
>   			z_erofs_free_pcluster(be.pcl);
>   		else
> -			erofs_workgroup_put(&be.pcl->obj);
> +			erofs_workgroup_put(EROFS_SB(io->sb), &be.pcl->obj,
> +					be.try_free);

We could just move

if (z_erofs_is_inline_pcluster(be.pcl))
	z_erofs_free_pcluster(be.pcl);
else
	z_erofs_put_pcluster(be.pcl);

into the end of z_erofs_decompress_pcluster() and
get rid of `be->try_free`.
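
For illustration, the tail of z_erofs_decompress_pcluster() might then
look roughly like the sketch below (z_erofs_put_pcluster() is assumed
from the pending "sunset z_erofs_workgroup" series, so the exact name
and behavior may differ; the field resets stay as in this patch):

	pcl->length = 0;
	pcl->partial = true;
	pcl->multibases = false;

	/* release the pcluster here instead of in z_erofs_decompress_queue() */
	if (z_erofs_is_inline_pcluster(pcl))
		z_erofs_free_pcluster(pcl);
	else
		z_erofs_put_pcluster(pcl);

Presumably the put path can check for attached cached folios itself at
this point, which is what would make the per-call `be->try_free` hint
unnecessary.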



>   	}
>   	return err;
>   }
> diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
> index 37afe2024840..cf59ba6a2322 100644
> --- a/fs/erofs/zutil.c
> +++ b/fs/erofs/zutil.c
> @@ -285,26 +285,11 @@ static void  __erofs_workgroup_free(struct erofs_workgroup *grp)
>   	erofs_workgroup_free_rcu(grp);
>   }
>   
> -void erofs_workgroup_put(struct erofs_workgroup *grp)
> -{
> -	if (lockref_put_or_lock(&grp->lockref))
> -		return;
> -
> -	DBG_BUGON(__lockref_is_dead(&grp->lockref));
> -	if (grp->lockref.count == 1)
> -		atomic_long_inc(&erofs_global_shrink_cnt);
> -	--grp->lockref.count;
> -	spin_unlock(&grp->lockref.lock);
> -}
> -
> -static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
> +static bool erofs_prepare_to_release_workgroup(struct erofs_sb_info *sbi,
>   					   struct erofs_workgroup *grp)
>   {
> -	int free = false;
> -
> -	spin_lock(&grp->lockref.lock);
>   	if (grp->lockref.count)
> -		goto out;
> +		return false;
>   
>   	/*
>   	 * Note that all cached pages should be detached before deleted from
> @@ -312,7 +297,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
>   	 * the orphan old workgroup when the new one is available in the tree.
>   	 */
>   	if (erofs_try_to_free_all_cached_folios(sbi, grp))
> -		goto out;
> +		return false;
>   
>   	/*
>   	 * It's impossible to fail after the workgroup is freezed,
> @@ -322,14 +307,47 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
>   	DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
>   
>   	lockref_mark_dead(&grp->lockref);
> -	free = true;
> -out:
> +	return true;
> +}
> +
> +static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
> +					   struct erofs_workgroup *grp)
> +{
> +	bool free = false;
> +
> +	/* Using trylock to avoid deadlock with erofs_workgroup_put() */
> +	if (!spin_trylock(&grp->lockref.lock))
> +		return free;
> +	free = erofs_prepare_to_release_workgroup(sbi, grp);
>   	spin_unlock(&grp->lockref.lock);
>   	if (free)
>   		__erofs_workgroup_free(grp);
>   	return free;
>   }
>   
> +void erofs_workgroup_put(struct erofs_sb_info *sbi, struct erofs_workgroup *grp,
> +		bool try_free)
> +{
> +	bool free = false;
> +
> +	if (lockref_put_or_lock(&grp->lockref))
> +		return;
> +
> +	DBG_BUGON(__lockref_is_dead(&grp->lockref));
> +	if (--grp->lockref.count == 0) {
> +		atomic_long_inc(&erofs_global_shrink_cnt);
> +
> +		if (try_free) {
> +			xa_lock(&sbi->managed_pslots);
> +			free = erofs_prepare_to_release_workgroup(sbi, grp);
> +			xa_unlock(&sbi->managed_pslots);
> +		}
> +	}
> +	spin_unlock(&grp->lockref.lock);
> +	if (free)
> +		__erofs_workgroup_free(grp);

Need to wait for an RCU grace period here.

Thanks,
Gao Xiang