[PATCH v3 07/19] mm/shmem: never bypass the swap cache for SWP_SYNCHRONOUS_IO

From: Kairui Song <kasong@tencent.com>

Now that the overhead of the swap cache is trivial to none, bypassing
the swap cache is no longer a valid optimization.

We have already removed the cache-bypassing swapin path for anon memory;
now do the same for shmem. Many helpers and functions can be dropped as
a result.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/shmem.c    | 65 +++++++++++++++++------------------------------------------
 mm/swap.h     |  4 ----
 mm/swapfile.c | 35 +++++++++-----------------------
 3 files changed, 27 insertions(+), 77 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index ad18172ff831..d08248fd67ff 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2001,10 +2001,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
 		swp_entry_t entry, int order, gfp_t gfp)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct folio *new, *swapcache;
 	int nr_pages = 1 << order;
-	struct folio *new;
 	gfp_t alloc_gfp;
-	void *shadow;
 
 	/*
 	 * We have arrived here because our zones are constrained, so don't
@@ -2044,34 +2043,19 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
 		goto fallback;
 	}
 
-	/*
-	 * Prevent parallel swapin from proceeding with the swap cache flag.
-	 *
-	 * Of course there is another possible concurrent scenario as well,
-	 * that is to say, the swap cache flag of a large folio has already
-	 * been set by swapcache_prepare(), while another thread may have
-	 * already split the large swap entry stored in the shmem mapping.
-	 * In this case, shmem_add_to_page_cache() will help identify the
-	 * concurrent swapin and return -EEXIST.
-	 */
-	if (swapcache_prepare(entry, nr_pages)) {
+	swapcache = swapin_folio(entry, new);
+	if (swapcache != new) {
 		folio_put(new);
-		new = ERR_PTR(-EEXIST);
-		/* Try smaller folio to avoid cache conflict */
-		goto fallback;
+		if (!swapcache) {
+			/*
+			 * The new folio is charged already, swapin can
+			 * only fail due to another raced swapin.
+			 */
+			new = ERR_PTR(-EEXIST);
+			goto fallback;
+		}
 	}
-
-	__folio_set_locked(new);
-	__folio_set_swapbacked(new);
-	new->swap = entry;
-
-	memcg1_swapin(entry, nr_pages);
-	shadow = swap_cache_get_shadow(entry);
-	if (shadow)
-		workingset_refault(new, shadow);
-	folio_add_lru(new);
-	swap_read_folio(new, NULL);
-	return new;
+	return swapcache;
 fallback:
 	/* Order 0 swapin failed, nothing to fallback to, abort */
 	if (!order)
@@ -2161,8 +2145,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
 }
 
 static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
-					 struct folio *folio, swp_entry_t swap,
-					 bool skip_swapcache)
+					 struct folio *folio, swp_entry_t swap)
 {
 	struct address_space *mapping = inode->i_mapping;
 	swp_entry_t swapin_error;
@@ -2178,8 +2161,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
 
 	nr_pages = folio_nr_pages(folio);
 	folio_wait_writeback(folio);
-	if (!skip_swapcache)
-		swap_cache_del_folio(folio);
+	swap_cache_del_folio(folio);
 	/*
 	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
 	 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
@@ -2279,7 +2261,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	softleaf_t index_entry;
 	struct swap_info_struct *si;
 	struct folio *folio = NULL;
-	bool skip_swapcache = false;
 	int error, nr_pages, order;
 	pgoff_t offset;
 
@@ -2322,7 +2303,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 				folio = NULL;
 				goto failed;
 			}
-			skip_swapcache = true;
 		} else {
 			/* Cached swapin only supports order 0 folio */
 			folio = shmem_swapin_cluster(swap, gfp, info, index);
@@ -2378,9 +2358,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	 * and swap cache folios are never partially freed.
 	 */
 	folio_lock(folio);
-	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
-	    shmem_confirm_swap(mapping, index, swap) < 0 ||
-	    folio->swap.val != swap.val) {
+	if (!folio_matches_swap_entry(folio, swap) ||
+	    shmem_confirm_swap(mapping, index, swap) < 0) {
 		error = -EEXIST;
 		goto unlock;
 	}
@@ -2412,12 +2391,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (sgp == SGP_WRITE)
 		folio_mark_accessed(folio);
 
-	if (skip_swapcache) {
-		folio->swap.val = 0;
-		swapcache_clear(si, swap, nr_pages);
-	} else {
-		swap_cache_del_folio(folio);
-	}
+	swap_cache_del_folio(folio);
 	folio_mark_dirty(folio);
 	swap_free_nr(swap, nr_pages);
 	put_swap_device(si);
@@ -2428,14 +2402,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (shmem_confirm_swap(mapping, index, swap) < 0)
 		error = -EEXIST;
 	if (error == -EIO)
-		shmem_set_folio_swapin_error(inode, index, folio, swap,
-					     skip_swapcache);
+		shmem_set_folio_swapin_error(inode, index, folio, swap);
 unlock:
 	if (folio)
 		folio_unlock(folio);
 failed_nolock:
-	if (skip_swapcache)
-		swapcache_clear(si, folio->swap, folio_nr_pages(folio));
 	if (folio)
 		folio_put(folio);
 	put_swap_device(si);
diff --git a/mm/swap.h b/mm/swap.h
index 214e7d041030..e0f05babe13a 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -403,10 +403,6 @@ static inline int swap_writeout(struct folio *folio,
 	return 0;
 }
 
-static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
-{
-}
-
 static inline struct folio *swap_cache_get_folio(swp_entry_t entry)
 {
 	return NULL;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ee6bb37ab174..5853db044031 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1610,22 +1610,6 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	return NULL;
 }
 
-static void swap_entries_put_cache(struct swap_info_struct *si,
-				   swp_entry_t entry, int nr)
-{
-	unsigned long offset = swp_offset(entry);
-	struct swap_cluster_info *ci;
-
-	ci = swap_cluster_lock(si, offset);
-	if (swap_only_has_cache(si, offset, nr)) {
-		swap_entries_free(si, ci, entry, nr);
-	} else {
-		for (int i = 0; i < nr; i++, entry.val++)
-			swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
-	}
-	swap_cluster_unlock(ci);
-}
-
 static bool swap_entries_put_map(struct swap_info_struct *si,
 				 swp_entry_t entry, int nr)
 {
@@ -1761,13 +1745,21 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
 void put_swap_folio(struct folio *folio, swp_entry_t entry)
 {
 	struct swap_info_struct *si;
+	struct swap_cluster_info *ci;
+	unsigned long offset = swp_offset(entry);
 	int size = 1 << swap_entry_order(folio_order(folio));
 
 	si = _swap_info_get(entry);
 	if (!si)
 		return;
 
-	swap_entries_put_cache(si, entry, size);
+	ci = swap_cluster_lock(si, offset);
+	if (swap_only_has_cache(si, offset, size))
+		swap_entries_free(si, ci, entry, size);
+	else
+		for (int i = 0; i < size; i++, entry.val++)
+			swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+	swap_cluster_unlock(ci);
 }
 
 int __swap_count(swp_entry_t entry)
@@ -3780,15 +3772,6 @@ int swapcache_prepare(swp_entry_t entry, int nr)
 	return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
 }
 
-/*
- * Caller should ensure entries belong to the same folio so
- * the entries won't span cross cluster boundary.
- */
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
-{
-	swap_entries_put_cache(si, entry, nr);
-}
-
 /*
  * add_swap_count_continuation - called when a swap count is duplicated
  * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's

-- 
2.52.0
Re: [PATCH v3 07/19] mm/shmem: never bypass the swap cache for SWP_SYNCHRONOUS_IO
Posted by Baolin Wang 2 months, 1 week ago
Hi Kairui,

On 2025/11/25 03:13, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
> 
> Now that the overhead of the swap cache is trivial to none, bypassing
> the swap cache is no longer a valid optimization.
> 
> We have already removed the cache-bypassing swapin path for anon memory;
> now do the same for shmem. Many helpers and functions can be dropped as
> a result.
> 
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---

I'm glad to see we can remove the skip swapcache logic. I did a quick 
test, testing 1G shmem sequential swap-in with 64K mTHP and 2M mTHP, and 
I observed a slight drop, which could also be fluctuation. Can you also 
perform some measurements?

64K shmem mTHP:
W/ patchset	W/o patchset
154 ms		148 ms

2M shmem mTHP
W/ patchset	W/o patchset
117 ms		115 ms
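
For illustration, below is a minimal sketch of how a 1G sequential shmem
swap-in timing test like this could be driven from userspace. It is an
assumed reconstruction, not the exact harness used for the numbers above:
it relies on memfd_create() plus MADV_PAGEOUT, assumes a swap device with
at least 1G free, and assumes shmem mTHP has already been enabled through
the transparent_hugepage sysfs knobs.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>

#define SIZE (1UL << 30)	/* 1G of shmem */

int main(void)
{
	struct timespec t0, t1;
	volatile unsigned char sink = 0;
	unsigned char *buf;
	size_t i;
	int fd;

	fd = memfd_create("shmem-swapin-test", 0);
	if (fd < 0 || ftruncate(fd, SIZE))
		return 1;
	buf = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* Populate the whole file so every page is allocated. */
	memset(buf, 0x5a, SIZE);

	/* Push it all out to swap (MADV_PAGEOUT, kernel >= 5.4). */
	if (madvise(buf, SIZE, MADV_PAGEOUT))
		perror("MADV_PAGEOUT");

	/* Fault everything back in sequentially and time it. */
	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < SIZE; i += 4096)
		sink += buf[i];
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("sequential swap-in: %ld ms\n",
	       (t1.tv_sec - t0.tv_sec) * 1000 +
	       (t1.tv_nsec - t0.tv_nsec) / 1000000);
	munmap(buf, SIZE);
	close(fd);
	return 0;
}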

Anyway I still hope we can remove the skip swapcache logic. The changes 
look good to me with one nit as below. Thanks for your work.

> @@ -2378,9 +2358,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>   	 * and swap cache folios are never partially freed.
>   	 */
>   	folio_lock(folio);
> -	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
> -	    shmem_confirm_swap(mapping, index, swap) < 0 ||
> -	    folio->swap.val != swap.val) {
> +	if (!folio_matches_swap_entry(folio, swap) ||
> +	    shmem_confirm_swap(mapping, index, swap) < 0) {

We should still keep the '!folio_test_swapcache(folio)' check here?
Re: [PATCH v3 07/19] mm/shmem: never bypass the swap cache for SWP_SYNCHRONOUS_IO
Posted by Kairui Song 2 months, 1 week ago
On Tue, Dec 2, 2025 at 3:34 PM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
> Hi Kairui,
>
> On 2025/11/25 03:13, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> >
> > Now that the overhead of the swap cache is trivial to none, bypassing
> > the swap cache is no longer a valid optimization.
> >
> > We have already removed the cache-bypassing swapin path for anon memory;
> > now do the same for shmem. Many helpers and functions can be dropped as
> > a result.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
>
> I'm glad to see we can remove the skip swapcache logic. I did a quick
> test, testing 1G shmem sequential swap-in with 64K mTHP and 2M mTHP, and
> I observed a slight drop, which could also be fluctuation. Can you also
> perform some measurements?
>
> 64K shmem mTHP:
> W/ patchset     W/o patchset
> 154 ms          148 ms
>
> 2M shmem mTHP
> W/ patchset     W/o patchset
> 117 ms          115 ms

Hi Baolin,

Thanks for testing! This patch (7/19) is still an intermediate step,
so we are still updating both swap_map and swap table with higher
overhead. And even with that, the performance change looks small
(~1-4% in the result you posted), close to noise level.

And after this whole series, the double update is *partially* dropped,
so the performance is almost identical to before:

tmpfs with transparent_hugepage_tmpfs=within_size, 3 test runs on my machine:
Before       [PATCH 7/19]         [PATCH 19/19]
5.99s          6.29s            6.08s (~1%)

Note we are still using swap_map, so there are double lookups
everywhere in this series, and I added more WARN_ON checks. Swap is
complex, so I think being cautious is better. I've also mentioned
another slight performance drop with valkey in the cover letter due to
this; it is also tiny and will be improved a lot in phase 3 by
removing swap_map and the double lookup, as demonstrated before:
https://lore.kernel.org/linux-mm/20250514201729.48420-1-ryncsn@gmail.com/

Last time I tested that branch, it was a clear optimization for shmem.
Some of the optimizations in that series were split out or merged
separately, so the performance may appear to go up or down in some
intermediate steps, but the final result is good.

swap_cgroup_ctrl will be gone too, though maybe even later.

>
> Anyway I still hope we can remove the skip swapcache logic. The changes
> look good to me with one nit as below. Thanks for your work.
>
> > @@ -2378,9 +2358,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> >        * and swap cache folios are never partially freed.
> >        */
> >       folio_lock(folio);
> > -     if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
> > -         shmem_confirm_swap(mapping, index, swap) < 0 ||
> > -         folio->swap.val != swap.val) {
> > +     if (!folio_matches_swap_entry(folio, swap) ||
> > +         shmem_confirm_swap(mapping, index, swap) < 0) {
>
> We should still keep the '!folio_test_swapcache(folio)' check here?

Thanks for the review. This one is OK because folio_test_swapcache() is
already included in folio_matches_swap_entry().
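
For reference, a rough, paraphrased sketch of what folio_matches_swap_entry()
is assumed to check; the real helper lives in mm/swap.h and may differ in
details such as debug warnings and alignment checks:

static inline bool folio_matches_swap_entry(const struct folio *folio,
					    swp_entry_t entry)
{
	long nr_pages = folio_nr_pages(folio);

	/* A folio that is not in the swap cache cannot match any entry. */
	if (!folio_test_swapcache(folio))
		return false;

	/*
	 * folio->swap holds the first entry of the folio; a large folio
	 * covers nr_pages contiguous, naturally aligned entries.
	 */
	return folio->swap.val == round_down(entry.val, nr_pages);
}

So both the explicit !folio_test_swapcache() test and the folio->swap.val
comparison dropped by the patch are covered by the single helper call.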
Re: [PATCH v3 07/19] mm/shmem: never bypass the swap cache for SWP_SYNCHRONOUS_IO
Posted by Baolin Wang 2 months, 1 week ago

On 2025/12/3 13:33, Kairui Song wrote:
> On Tue, Dec 2, 2025 at 3:34 PM Baolin Wang
> <baolin.wang@linux.alibaba.com> wrote:
>>
>> Hi Kairui,
>>
>> On 2025/11/25 03:13, Kairui Song wrote:
>>> From: Kairui Song <kasong@tencent.com>
>>>
>>> Now that the overhead of the swap cache is trivial to none, bypassing
>>> the swap cache is no longer a valid optimization.
>>>
>>> We have already removed the cache-bypassing swapin path for anon memory;
>>> now do the same for shmem. Many helpers and functions can be dropped as
>>> a result.
>>>
>>> Signed-off-by: Kairui Song <kasong@tencent.com>
>>> ---
>>
>> I'm glad to see we can remove the skip swapcache logic. I did a quick
>> test, testing 1G shmem sequential swap-in with 64K mTHP and 2M mTHP, and
>> I observed a slight drop, which could also be fluctuation. Can you also
>> perform some measurements?
>>
>> 64K shmem mTHP:
>> W/ patchset     W/o patchset
>> 154 ms          148 ms
>>
>> 2M shmem mTHP
>> W/ patchset     W/o patchset
>> 117 ms          115 ms
> 
> Hi Baolin,
> 
> Thanks for testing! This patch (7/19) is still an intermediate step,
> so we are still updating both swap_map and swap table with higher
> overhead. And even with that, the performance change looks small
> (~1-4% in the result you posted), close to noise level.
> 
> And after this whole series, the double update is *partially* dropped,
> so the performance is almost identical to before:
> 
> tmpfs with transparent_hugepage_tmpfs=within_size, 3 test runs on my machine:
> Before       [PATCH 7/19]         [PATCH 19/19]
> 5.99s          6.29s            6.08s (~1%)
> 
> Note we are still using swap_map, so there are double lookups
> everywhere in this series, and I added more WARN_ON checks. Swap is
> complex, so I think being cautious is better. I've also mentioned
> another slight performance drop with valkey in the cover letter due to
> this; it is also tiny and will be improved a lot in phase 3 by
> removing swap_map and the double lookup, as demonstrated before:
> https://lore.kernel.org/linux-mm/20250514201729.48420-1-ryncsn@gmail.com/
> 
> Last time I tested that branch, it was a clear optimization for shmem.
> Some of the optimizations in that series were split out or merged
> separately, so the performance may appear to go up or down in some
> intermediate steps, but the final result is good.

Sounds good. Better to mention this (including your data) in the commit 
message. With that, please feel free to add:
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>

> swap_cgroup_ctrl will be gone too, though maybe even later.
> 
>>
>> Anyway I still hope we can remove the skip swapcache logic. The changes
>> look good to me with one nit as below. Thanks for your work.
>>
>>> @@ -2378,9 +2358,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>>>         * and swap cache folios are never partially freed.
>>>         */
>>>        folio_lock(folio);
>>> -     if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
>>> -         shmem_confirm_swap(mapping, index, swap) < 0 ||
>>> -         folio->swap.val != swap.val) {
>>> +     if (!folio_matches_swap_entry(folio, swap) ||
>>> +         shmem_confirm_swap(mapping, index, swap) < 0) {
>>
>> We should still keep the '!folio_test_swapcache(folio)' check here?
> 
> Thanks for the review. This one is OK because folio_test_swapcache() is
> already included in folio_matches_swap_entry().

OK.