mm, swap: never bypass swap cache and cleanup flags (swap table phase II)

[PATCH 13/19] mm, swap: remove workaround for unsynchronized swap map cache state

Posted by Kairui Song 3 months, 1 week ago

From: Kairui Song <kasong@tencent.com>

Remove the "skip if exists" check from commit a65b0e7607ccb ("zswap:
make shrinking memcg-aware"). It was needed because there is a tiny time
window between setting the SWAP_HAS_CACHE bit and actually adding the
folio to the swap cache. If one user tries to add a folio to the swap
cache while another user has been interrupted after setting
SWAP_HAS_CACHE but before adding its folio to the swap cache, it might
lead to a deadlock.

We have moved the bit setting to the same critical section as adding the
folio, so this is no longer needed. Remove it and clean it up.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/swap.h       |  2 +-
 mm/swap_state.c | 27 ++++++++++-----------------
 mm/zswap.c      |  2 +-
 3 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/mm/swap.h b/mm/swap.h
index 3cd99850bbaf..a3c5f2dca0d5 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -260,7 +260,7 @@ int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
 void swap_cache_del_folio(struct folio *folio);
 struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *alloced, bool skip_if_exists);
+				     bool *alloced);
 /* Below helpers require the caller to lock and pass in the swap cluster. */
 void __swap_cache_del_folio(struct swap_cluster_info *ci,
 			    struct folio *folio, swp_entry_t entry, void *shadow);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2d53e3b5e8e9..d2bcca92b6e0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -447,8 +447,6 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
  * @folio: folio to be added.
  * @gfp: memory allocation flags for charge, can be 0 if @charged if true.
  * @charged: if the folio is already charged.
- * @skip_if_exists: if the slot is in a cached state, return NULL.
- *                  This is an old workaround that will be removed shortly.
  *
  * Update the swap_map and add folio as swap cache, typically before swapin.
  * All swap slots covered by the folio must have a non-zero swap count.
@@ -459,8 +457,7 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
  */
 static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
 						  struct folio *folio,
-						  gfp_t gfp, bool charged,
-						  bool skip_if_exists)
+						  gfp_t gfp, bool charged)
 {
 	struct folio *swapcache = NULL;
 	void *shadow;
@@ -480,7 +477,7 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
 		 * might return a folio that is irrelevant to the faulting
 		 * entry because @entry is aligned down. Just return NULL.
 		 */
-		if (ret != -EEXIST || skip_if_exists || folio_test_large(folio))
+		if (ret != -EEXIST || folio_test_large(folio))
 			goto failed;
 
 		swapcache = swap_cache_get_folio(entry);
@@ -513,8 +510,6 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
  * @mpol: NUMA memory allocation policy to be applied
  * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  * @new_page_allocated: sets true if allocation happened, false otherwise
- * @skip_if_exists: if the slot is a partially cached state, return NULL.
- *                  This is a workaround that would be removed shortly.
  *
  * Allocate a folio in the swap cache for one swap slot, typically before
  * doing IO (swap in or swap out). The swap slot indicated by @entry must
@@ -526,8 +521,7 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
  */
 struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated,
-				     bool skip_if_exists)
+				     bool *new_page_allocated)
 {
 	struct swap_info_struct *si = __swap_entry_to_info(entry);
 	struct folio *folio;
@@ -548,8 +542,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
 	if (!folio)
 		return NULL;
 	/* Try add the new folio, returns existing folio or NULL on failure. */
-	result = __swap_cache_prepare_and_add(entry, folio, gfp_mask,
-					      false, skip_if_exists);
+	result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
 	if (result == folio)
 		*new_page_allocated = true;
 	else
@@ -578,7 +571,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
 	unsigned long nr_pages = folio_nr_pages(folio);
 
 	entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
-	swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
+	swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
 	if (swapcache == folio)
 		swap_read_folio(folio, NULL);
 	return swapcache;
@@ -606,7 +599,7 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 
 	mpol = get_vma_policy(vma, addr, 0, &ilx);
 	folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
-					&page_allocated, false);
+				       &page_allocated);
 	mpol_cond_put(mpol);
 
 	if (page_allocated)
@@ -725,7 +718,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		/* Ok, do the async read-ahead now */
 		folio = swap_cache_alloc_folio(
 			swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx,
-			&page_allocated, false);
+			&page_allocated);
 		if (!folio)
 			continue;
 		if (page_allocated) {
@@ -743,7 +736,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 skip:
 	/* The page was likely read above, so no need for plugging here */
 	folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
-					&page_allocated, false);
+				       &page_allocated);
 	if (unlikely(page_allocated))
 		swap_read_folio(folio, NULL);
 	return folio;
@@ -838,7 +831,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 		pte_unmap(pte);
 		pte = NULL;
 		folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
-						&page_allocated, false);
+					       &page_allocated);
 		if (!folio)
 			continue;
 		if (page_allocated) {
@@ -858,7 +851,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 skip:
 	/* The folio was likely read above, so no need for plugging here */
 	folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx,
-					&page_allocated, false);
+				       &page_allocated);
 	if (unlikely(page_allocated))
 		swap_read_folio(folio, NULL);
 	return folio;
diff --git a/mm/zswap.c b/mm/zswap.c
index a7a2443912f4..d8a33db9d3cc 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1015,7 +1015,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 
 	mpol = get_task_policy(current);
 	folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol,
-				       NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+				       NO_INTERLEAVE_INDEX, &folio_was_allocated);
 	put_swap_device(si);
 	if (!folio)
 		return -ENOMEM;

-- 
2.51.1

Re: [PATCH 13/19] mm, swap: remove workaround for unsynchronized swap map cache state

Posted by Barry Song 3 months ago

>  struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
>                                      struct mempolicy *mpol, pgoff_t ilx,
> -                                    bool *new_page_allocated,
> -                                    bool skip_if_exists)
> +                                    bool *new_page_allocated)
>  {
>         struct swap_info_struct *si = __swap_entry_to_info(entry);
>         struct folio *folio;
> @@ -548,8 +542,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
>         if (!folio)
>                 return NULL;
>         /* Try add the new folio, returns existing folio or NULL on failure. */
> -       result = __swap_cache_prepare_and_add(entry, folio, gfp_mask,
> -                                             false, skip_if_exists);
> +       result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
>         if (result == folio)
>                 *new_page_allocated = true;
>         else
> @@ -578,7 +571,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
>         unsigned long nr_pages = folio_nr_pages(folio);
>
>         entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
> -       swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
> +       swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
>         if (swapcache == folio)
>                 swap_read_folio(folio, NULL);
>         return swapcache;

I wonder if we could also drop the "charged" — it doesn’t seem
difficult to move the charging step before
__swap_cache_prepare_and_add(), even for swap_cache_alloc_folio()?

Thanks
Barry

Re: [PATCH 13/19] mm, swap: remove workaround for unsynchronized swap map cache state

Posted by Kairui Song 3 months ago

On Fri, Nov 7, 2025 at 11:07 AM Barry Song <21cnbao@gmail.com> wrote:
>
> >  struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> >                                      struct mempolicy *mpol, pgoff_t ilx,
> > -                                    bool *new_page_allocated,
> > -                                    bool skip_if_exists)
> > +                                    bool *new_page_allocated)
> >  {
> >         struct swap_info_struct *si = __swap_entry_to_info(entry);
> >         struct folio *folio;
> > @@ -548,8 +542,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> >         if (!folio)
> >                 return NULL;
> >         /* Try add the new folio, returns existing folio or NULL on failure. */
> > -       result = __swap_cache_prepare_and_add(entry, folio, gfp_mask,
> > -                                             false, skip_if_exists);
> > +       result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
> >         if (result == folio)
> >                 *new_page_allocated = true;
> >         else
> > @@ -578,7 +571,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
> >         unsigned long nr_pages = folio_nr_pages(folio);
> >
> >         entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
> > -       swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
> > +       swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
> >         if (swapcache == folio)
> >                 swap_read_folio(folio, NULL);
> >         return swapcache;
>
> I wonder if we could also drop the "charged" — it doesn’t seem
> difficult to move the charging step before
> __swap_cache_prepare_and_add(), even for swap_cache_alloc_folio()?

Hi Barry, thanks for the review and suggestion.

It may cause much more serious cgroup thrashing. Charging may trigger
reclaim, so racing swapins will have a much larger race window and cause
a lot of repeated folio alloc / charge.

This param exists because anon / shmem do their own charging for large
folio swapin, and then insert the folio into the swap cache, which is
already causing more memory pressure. I think ideally we want to unify
all alloc & charging for swapin folio allocation, and have a
swap_cache_alloc_folio that supports `orders`. For racing swapins, only
one will successfully insert a folio into the swap cache and charge
it, which should make the race window very tiny or maybe avoid
redundant folio allocation completely with further work. I did some
tests and they show that it will improve memory usage and avoid
some OOMs under pressure for (m)THP.

BTW with the current SWAP_HAS_CACHE design, we also have redundant folio
allocation for order 0 when under global pressure, as the folio alloc is
done before setting SWAP_HAS_CACHE. But setting SWAP_HAS_CACHE first and
then doing the folio alloc will increase the chance of hitting the
idle/busy loop on SWAP_HAS_CACHE, which is also kind of problematic. We
should be able to clean it up in later phases.

Re: [PATCH 13/19] mm, swap: remove workaround for unsynchronized swap map cache state

Posted by Barry Song 3 months ago

On Sun, Nov 9, 2025 at 10:18 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Fri, Nov 7, 2025 at 11:07 AM Barry Song <21cnbao@gmail.com> wrote:
> >
> > >  struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> > >                                      struct mempolicy *mpol, pgoff_t ilx,
> > > -                                    bool *new_page_allocated,
> > > -                                    bool skip_if_exists)
> > > +                                    bool *new_page_allocated)
> > >  {
> > >         struct swap_info_struct *si = __swap_entry_to_info(entry);
> > >         struct folio *folio;
> > > @@ -548,8 +542,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> > >         if (!folio)
> > >                 return NULL;
> > >         /* Try add the new folio, returns existing folio or NULL on failure. */
> > > -       result = __swap_cache_prepare_and_add(entry, folio, gfp_mask,
> > > -                                             false, skip_if_exists);
> > > +       result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
> > >         if (result == folio)
> > >                 *new_page_allocated = true;
> > >         else
> > > @@ -578,7 +571,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
> > >         unsigned long nr_pages = folio_nr_pages(folio);
> > >
> > >         entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
> > > -       swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
> > > +       swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
> > >         if (swapcache == folio)
> > >                 swap_read_folio(folio, NULL);
> > >         return swapcache;
> >
> > I wonder if we could also drop the "charged" — it doesn’t seem
> > difficult to move the charging step before
> > __swap_cache_prepare_and_add(), even for swap_cache_alloc_folio()?
>
> Hi Barry, thanks for the review and suggestion.
>
> It may cause much more serious cgroup thrashing. Charge may cause
> reclaim, so races swapin will have a much larger race window and cause
> a lot of repeated folio alloc / charge.
>
> This param exists because anon / shmem does their own charge for large
> folio swapin, and then inserts the folio into the swap cache, which is
> causing more memory pressure already. I think ideally we want to unify
> all alloc & charging for swap in folio allocation, and have a
> swap_cache_alloc_folio that supports `orders`. For raced swapin only
> one will insert a folio successfully into the swap cache and charge
> it, which should make the race window very tiny or maybe avoid
> redundant folio allocation completely with further work. I did some
> tests and it shows that it will improve the memory usage and avoid
> some OOM under pressure for (m)THP.

This is quite interesting. I wonder if the change below could help reduce
mTHP swap thrashing. The fallback order-0 path also charges after
swap_cache_add_folio(), as order-0 pages are typically the ones triggering
memcg reclamation.

diff --git a/mm/memory.c b/mm/memory.c
index 27d91ae3648a..d97f1a8a5ca3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4470,11 +4470,13 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
                return NULL;

        entry = pte_to_swp_entry(vmf->orig_pte);
+#if 0
        if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
                                           GFP_KERNEL, entry)) {
                folio_put(folio);
                return NULL;
        }
+#endif

        return folio;
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2bf72d58f6ee..9d0b55deacc6 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -605,7 +605,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
        unsigned long nr_pages = folio_nr_pages(folio);

        entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
-       swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
+       swapcache = __swap_cache_prepare_and_add(entry, folio, 0,
+                                                folio_order(folio));
        if (swapcache == folio)
                swap_read_folio(folio, NULL);
        return swapcache;

>
> BTW with current SWAP_HAS_CACHE design, we also have redundant folio
> alloc for order 0 when under global pressure, as folio alloc is done
> before setting SWAP_HAS_CACHE.  But having SWAP_HAS_CACHE set then do
> the folio alloc will increase the chance of hitting the idle/busy loop
> on SWAP_HAS_CACHE which is also kind of problematic. We should be able
> to clean it up in later phases.

Thanks
Barry

Re: [PATCH 13/19] mm, swap: remove workaround for unsynchronized swap map cache state

Posted by Kairui Song 2 months, 3 weeks ago

On Mon, Nov 10, 2025 at 3:21 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Sun, Nov 9, 2025 at 10:18 PM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > On Fri, Nov 7, 2025 at 11:07 AM Barry Song <21cnbao@gmail.com> wrote:
> > >
> > > >  struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> > > >                                      struct mempolicy *mpol, pgoff_t ilx,
> > > > -                                    bool *new_page_allocated,
> > > > -                                    bool skip_if_exists)
> > > > +                                    bool *new_page_allocated)
> > > >  {
> > > >         struct swap_info_struct *si = __swap_entry_to_info(entry);
> > > >         struct folio *folio;
> > > > @@ -548,8 +542,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> > > >         if (!folio)
> > > >                 return NULL;
> > > >         /* Try add the new folio, returns existing folio or NULL on failure. */
> > > > -       result = __swap_cache_prepare_and_add(entry, folio, gfp_mask,
> > > > -                                             false, skip_if_exists);
> > > > +       result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
> > > >         if (result == folio)
> > > >                 *new_page_allocated = true;
> > > >         else
> > > > @@ -578,7 +571,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
> > > >         unsigned long nr_pages = folio_nr_pages(folio);
> > > >
> > > >         entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
> > > > -       swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
> > > > +       swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
> > > >         if (swapcache == folio)
> > > >                 swap_read_folio(folio, NULL);
> > > >         return swapcache;
> > >
> > > I wonder if we could also drop the "charged" — it doesn’t seem
> > > difficult to move the charging step before
> > > __swap_cache_prepare_and_add(), even for swap_cache_alloc_folio()?
> >
> > Hi Barry, thanks for the review and suggestion.
> >
> > It may cause much more serious cgroup thrashing. Charge may cause
> > reclaim, so races swapin will have a much larger race window and cause
> > a lot of repeated folio alloc / charge.
> >
> > This param exists because anon / shmem does their own charge for large
> > folio swapin, and then inserts the folio into the swap cache, which is
> > causing more memory pressure already. I think ideally we want to unify
> > all alloc & charging for swap in folio allocation, and have a
> > swap_cache_alloc_folio that supports `orders`. For raced swapin only
> > one will insert a folio successfully into the swap cache and charge
> > it, which should make the race window very tiny or maybe avoid
> > redundant folio allocation completely with further work. I did some
> > tests and it shows that it will improve the memory usage and avoid
> > some OOM under pressure for (m)THP.
>
> This is quite interesting. I wonder if the change below could help reduce
> mTHP swap thrashing. The fallback order-0 path also changes after
> swap_cache_add_folio(), as order-0 pages are typically the ones triggering
> memcg reclamation.
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 27d91ae3648a..d97f1a8a5ca3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4470,11 +4470,13 @@ static struct folio *__alloc_swap_folio(struct
> vm_fault *vmf)
>                 return NULL;
>
>         entry = pte_to_swp_entry(vmf->orig_pte);
> +#if 0
>         if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
>                                            GFP_KERNEL, entry)) {
>                 folio_put(folio);
>                 return NULL;
>         }
> +#endif
>
>         return folio;
>  }
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 2bf72d58f6ee..9d0b55deacc6 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -605,7 +605,7 @@ struct folio *swapin_folio(swp_entry_t entry,
> struct folio *folio)
>         unsigned long nr_pages = folio_nr_pages(folio);
>
>         entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
> -       swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
> +       swapcache = __swap_cache_prepare_and_add(entry, folio, 0,
> folio_order(folio));
>         if (swapcache == folio)
>                 swap_read_folio(folio, NULL);
>         return swapcache;

Yeah, that will surely improve the thrashing issue. Having a
`folio_order` check as the charged parameter looks strange though.
Ideally we will have swap_cache_alloc_folio do all the folio
allocation so there won't be many different swapin folio charging
call sites (currently we have more than 3 call sites: anon THP, anon
order 0, shmem THP, and the common order 0 in swap_cache_alloc_folio).
That will also help remove a WARN_ON check in Patch 3.