From: Kairui Song <kasong@tencent.com>
Remove the "skip if exists" check from commit a65b0e7607ccb ("zswap:
make shrinking memcg-aware"). It was needed because there is a tiny time
window between setting the SWAP_HAS_CACHE bit and actually adding the
folio to the swap cache. If a user is trying to add the folio into the
swap cache but another user was interrupted after setting SWAP_HAS_CACHE
but hasn't added the folio to the swap cache yet, it might lead to a
deadlock.
We have moved the bit setting to the same critical section as adding the
folio, so this is no longer needed. Remove it and clean it up.
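
For illustration (a simplified sketch of the scenario described above,
not an exact trace), the old ordering allowed an interleaving like:

  CPU A (swapin)                      CPU B (e.g. zswap writeback)
  ---------------------------------   ---------------------------------
  set SWAP_HAS_CACHE for the entry
  <delayed before inserting the
   folio into the swap cache>         try to prepare the same entry,
                                      get -EEXIST, but find no folio in
                                      the swap cache to wait for

zswap writeback was the only caller passing skip_if_exists == true (see
the mm/zswap.c hunk below), so it could bail out instead of waiting in
that window. Now that the bit is set in the same critical section as
the folio insertion, a racer either wins the insertion itself or finds
the folio already present in the swap cache, so the escape hatch is no
longer needed.
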
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/swap.h | 2 +-
mm/swap_state.c | 27 ++++++++++-----------------
mm/zswap.c | 2 +-
3 files changed, 12 insertions(+), 19 deletions(-)
diff --git a/mm/swap.h b/mm/swap.h
index 3cd99850bbaf..a3c5f2dca0d5 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -260,7 +260,7 @@ int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
void swap_cache_del_folio(struct folio *folio);
struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags,
struct mempolicy *mpol, pgoff_t ilx,
- bool *alloced, bool skip_if_exists);
+ bool *alloced);
/* Below helpers require the caller to lock and pass in the swap cluster. */
void __swap_cache_del_folio(struct swap_cluster_info *ci,
struct folio *folio, swp_entry_t entry, void *shadow);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2d53e3b5e8e9..d2bcca92b6e0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -447,8 +447,6 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
* @folio: folio to be added.
* @gfp: memory allocation flags for charge, can be 0 if @charged if true.
* @charged: if the folio is already charged.
- * @skip_if_exists: if the slot is in a cached state, return NULL.
- * This is an old workaround that will be removed shortly.
*
* Update the swap_map and add folio as swap cache, typically before swapin.
* All swap slots covered by the folio must have a non-zero swap count.
@@ -459,8 +457,7 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
*/
static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
struct folio *folio,
- gfp_t gfp, bool charged,
- bool skip_if_exists)
+ gfp_t gfp, bool charged)
{
struct folio *swapcache = NULL;
void *shadow;
@@ -480,7 +477,7 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
* might return a folio that is irrelevant to the faulting
* entry because @entry is aligned down. Just return NULL.
*/
- if (ret != -EEXIST || skip_if_exists || folio_test_large(folio))
+ if (ret != -EEXIST || folio_test_large(folio))
goto failed;
swapcache = swap_cache_get_folio(entry);
@@ -513,8 +510,6 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
* @mpol: NUMA memory allocation policy to be applied
* @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
* @new_page_allocated: sets true if allocation happened, false otherwise
- * @skip_if_exists: if the slot is a partially cached state, return NULL.
- * This is a workaround that would be removed shortly.
*
* Allocate a folio in the swap cache for one swap slot, typically before
* doing IO (swap in or swap out). The swap slot indicated by @entry must
@@ -526,8 +521,7 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
*/
struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t ilx,
- bool *new_page_allocated,
- bool skip_if_exists)
+ bool *new_page_allocated)
{
struct swap_info_struct *si = __swap_entry_to_info(entry);
struct folio *folio;
@@ -548,8 +542,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
if (!folio)
return NULL;
/* Try add the new folio, returns existing folio or NULL on failure. */
- result = __swap_cache_prepare_and_add(entry, folio, gfp_mask,
- false, skip_if_exists);
+ result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
if (result == folio)
*new_page_allocated = true;
else
@@ -578,7 +571,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
unsigned long nr_pages = folio_nr_pages(folio);
entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
- swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
+ swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
if (swapcache == folio)
swap_read_folio(folio, NULL);
return swapcache;
@@ -606,7 +599,7 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
mpol = get_vma_policy(vma, addr, 0, &ilx);
folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
mpol_cond_put(mpol);
if (page_allocated)
@@ -725,7 +718,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
/* Ok, do the async read-ahead now */
folio = swap_cache_alloc_folio(
swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
if (!folio)
continue;
if (page_allocated) {
@@ -743,7 +736,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
skip:
/* The page was likely read above, so no need for plugging here */
folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
if (unlikely(page_allocated))
swap_read_folio(folio, NULL);
return folio;
@@ -838,7 +831,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
pte_unmap(pte);
pte = NULL;
folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
if (!folio)
continue;
if (page_allocated) {
@@ -858,7 +851,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
skip:
/* The folio was likely read above, so no need for plugging here */
folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx,
- &page_allocated, false);
+ &page_allocated);
if (unlikely(page_allocated))
swap_read_folio(folio, NULL);
return folio;
diff --git a/mm/zswap.c b/mm/zswap.c
index a7a2443912f4..d8a33db9d3cc 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1015,7 +1015,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
mpol = get_task_policy(current);
folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol,
- NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ NO_INTERLEAVE_INDEX, &folio_was_allocated);
put_swap_device(si);
if (!folio)
return -ENOMEM;
--
2.51.1
> struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> struct mempolicy *mpol, pgoff_t ilx,
> - bool *new_page_allocated,
> - bool skip_if_exists)
> + bool *new_page_allocated)
> {
> struct swap_info_struct *si = __swap_entry_to_info(entry);
> struct folio *folio;
> @@ -548,8 +542,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> if (!folio)
> return NULL;
> /* Try add the new folio, returns existing folio or NULL on failure. */
> - result = __swap_cache_prepare_and_add(entry, folio, gfp_mask,
> - false, skip_if_exists);
> + result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
> if (result == folio)
> *new_page_allocated = true;
> else
> @@ -578,7 +571,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
> unsigned long nr_pages = folio_nr_pages(folio);
>
> entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
> - swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
> + swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
> if (swapcache == folio)
> swap_read_folio(folio, NULL);
> return swapcache;
I wonder if we could also drop the "charged" — it doesn’t seem
difficult to move the charging step before
__swap_cache_prepare_and_add(), even for swap_cache_alloc_folio()?
Thanks
Barry
On Fri, Nov 7, 2025 at 11:07 AM Barry Song <21cnbao@gmail.com> wrote:
>
> > struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> > struct mempolicy *mpol, pgoff_t ilx,
> > - bool *new_page_allocated,
> > - bool skip_if_exists)
> > + bool *new_page_allocated)
> > {
> > struct swap_info_struct *si = __swap_entry_to_info(entry);
> > struct folio *folio;
> > @@ -548,8 +542,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> > if (!folio)
> > return NULL;
> > /* Try add the new folio, returns existing folio or NULL on failure. */
> > - result = __swap_cache_prepare_and_add(entry, folio, gfp_mask,
> > - false, skip_if_exists);
> > + result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
> > if (result == folio)
> > *new_page_allocated = true;
> > else
> > @@ -578,7 +571,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
> > unsigned long nr_pages = folio_nr_pages(folio);
> >
> > entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
> > - swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
> > + swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
> > if (swapcache == folio)
> > swap_read_folio(folio, NULL);
> > return swapcache;
>
> I wonder if we could also drop the "charged" — it doesn’t seem
> difficult to move the charging step before
> __swap_cache_prepare_and_add(), even for swap_cache_alloc_folio()?
Hi Barry, thanks for the review and suggestion.

It may cause much more serious cgroup thrashing. Charging may trigger
reclaim, so racing swapins would have a much larger race window and do
a lot of repeated folio allocation / charging.

This param exists because anon / shmem do their own charging for large
folio swapin and then insert the folio into the swap cache, which
already causes more memory pressure. Ideally I think we want to unify
all allocation & charging for swapin folios and have a
swap_cache_alloc_folio that supports `orders`. For racing swapins, only
one will successfully insert a folio into the swap cache and charge it,
which should make the race window very tiny, or with further work maybe
avoid redundant folio allocation completely. I did some tests and they
show this improves memory usage and avoids some OOMs under pressure for
(m)THP.

BTW, with the current SWAP_HAS_CACHE design we also have redundant
folio allocation for order 0 under global pressure, as the folio is
allocated before SWAP_HAS_CACHE is set. But setting SWAP_HAS_CACHE
first and then allocating the folio would increase the chance of
hitting the idle/busy loop on SWAP_HAS_CACHE, which is also kind of
problematic. We should be able to clean this up in later phases.
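
To make the thrashing concern concrete, here is a minimal userspace
sketch (plain C11 with pthreads, not kernel code; every name in it is
made up for this illustration; build with `gcc -pthread`). It models
racing swapins on one entry: if every racer pays for the charge before
the swap cache insertion, all losers waste a full charge (and possibly
a reclaim cycle), while inserting first means only the winner pays:

/*
 * Toy userspace model (NOT kernel code) of the race window discussed
 * above.  charge_first == 1 mimics charging the folio before the swap
 * cache insertion: every racing thread pays the slow "charge" before
 * it finds out it lost the race.  charge_first == 0 mimics the current
 * ordering: the cheap insertion picks the winner first, and only the
 * winner pays for the charge.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NR_RACERS 8

static _Atomic(void *) swap_cache_slot;	/* the contended swap cache entry */
static atomic_int wasted_charges;

static void fake_charge(void)
{
	usleep(1000);	/* stand-in for a memcg charge, possibly doing reclaim */
}

static void *racer(void *arg)
{
	int charge_first = *(int *)arg;
	void *folio = malloc(4096);	/* stand-in for folio allocation */
	void *expected = NULL;

	if (charge_first)
		fake_charge();

	/* Only one racer wins the "swap cache" insertion. */
	if (atomic_compare_exchange_strong(&swap_cache_slot, &expected, folio)) {
		if (!charge_first)
			fake_charge();	/* only the winner charges */
	} else {
		if (charge_first)
			atomic_fetch_add(&wasted_charges, 1);
		free(folio);		/* the loser throws its work away */
	}
	return NULL;
}

static int run(int charge_first)
{
	pthread_t threads[NR_RACERS];
	int i;

	atomic_store(&swap_cache_slot, NULL);
	atomic_store(&wasted_charges, 0);
	for (i = 0; i < NR_RACERS; i++)
		pthread_create(&threads[i], NULL, racer, &charge_first);
	for (i = 0; i < NR_RACERS; i++)
		pthread_join(threads[i], NULL);
	free(atomic_load(&swap_cache_slot));
	return atomic_load(&wasted_charges);
}

int main(void)
{
	printf("charge before insert: %d wasted charges\n", run(1));
	printf("insert before charge: %d wasted charges\n", run(0));
	return 0;
}

The charge-first run reports NR_RACERS - 1 wasted charges, the
insert-first run reports zero; that difference is the extra work and
pressure the current ordering keeps small.
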
On Sun, Nov 9, 2025 at 10:18 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Fri, Nov 7, 2025 at 11:07 AM Barry Song <21cnbao@gmail.com> wrote:
> >
> > > struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> > > struct mempolicy *mpol, pgoff_t ilx,
> > > - bool *new_page_allocated,
> > > - bool skip_if_exists)
> > > + bool *new_page_allocated)
> > > {
> > > struct swap_info_struct *si = __swap_entry_to_info(entry);
> > > struct folio *folio;
> > > @@ -548,8 +542,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> > > if (!folio)
> > > return NULL;
> > > /* Try add the new folio, returns existing folio or NULL on failure. */
> > > - result = __swap_cache_prepare_and_add(entry, folio, gfp_mask,
> > > - false, skip_if_exists);
> > > + result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
> > > if (result == folio)
> > > *new_page_allocated = true;
> > > else
> > > @@ -578,7 +571,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
> > > unsigned long nr_pages = folio_nr_pages(folio);
> > >
> > > entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
> > > - swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
> > > + swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
> > > if (swapcache == folio)
> > > swap_read_folio(folio, NULL);
> > > return swapcache;
> >
> > I wonder if we could also drop the "charged" — it doesn’t seem
> > difficult to move the charging step before
> > __swap_cache_prepare_and_add(), even for swap_cache_alloc_folio()?
>
> Hi Barry, thanks for the review and suggestion.
>
> It may cause much more serious cgroup thrashing. Charging may trigger
> reclaim, so racing swapins would have a much larger race window and do
> a lot of repeated folio allocation / charging.
>
> This param exists because anon / shmem do their own charging for large
> folio swapin and then insert the folio into the swap cache, which
> already causes more memory pressure. Ideally I think we want to unify
> all allocation & charging for swapin folios and have a
> swap_cache_alloc_folio that supports `orders`. For racing swapins, only
> one will successfully insert a folio into the swap cache and charge it,
> which should make the race window very tiny, or with further work maybe
> avoid redundant folio allocation completely. I did some tests and they
> show this improves memory usage and avoids some OOMs under pressure for
> (m)THP.
This is quite interesting. I wonder if the change below could help reduce
mTHP swap thrashing. With it, the fallback order-0 path also charges after
swap_cache_add_folio(), as order-0 pages are typically the ones triggering
memcg reclamation.

diff --git a/mm/memory.c b/mm/memory.c
index 27d91ae3648a..d97f1a8a5ca3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4470,11 +4470,13 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
return NULL;
entry = pte_to_swp_entry(vmf->orig_pte);
+#if 0
if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
GFP_KERNEL, entry)) {
folio_put(folio);
return NULL;
}
+#endif
return folio;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2bf72d58f6ee..9d0b55deacc6 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -605,7 +605,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
unsigned long nr_pages = folio_nr_pages(folio);
entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
- swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
+ swapcache = __swap_cache_prepare_and_add(entry, folio, 0, folio_order(folio));
if (swapcache == folio)
swap_read_folio(folio, NULL);
return swapcache;
>
> BTW, with the current SWAP_HAS_CACHE design we also have redundant
> folio allocation for order 0 under global pressure, as the folio is
> allocated before SWAP_HAS_CACHE is set. But setting SWAP_HAS_CACHE
> first and then allocating the folio would increase the chance of
> hitting the idle/busy loop on SWAP_HAS_CACHE, which is also kind of
> problematic. We should be able to clean this up in later phases.
Thanks
Barry
On Mon, Nov 10, 2025 at 3:21 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Sun, Nov 9, 2025 at 10:18 PM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > On Fri, Nov 7, 2025 at 11:07 AM Barry Song <21cnbao@gmail.com> wrote:
> > >
> > > > struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> > > > struct mempolicy *mpol, pgoff_t ilx,
> > > > - bool *new_page_allocated,
> > > > - bool skip_if_exists)
> > > > + bool *new_page_allocated)
> > > > {
> > > > struct swap_info_struct *si = __swap_entry_to_info(entry);
> > > > struct folio *folio;
> > > > @@ -548,8 +542,7 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
> > > > if (!folio)
> > > > return NULL;
> > > > /* Try add the new folio, returns existing folio or NULL on failure. */
> > > > - result = __swap_cache_prepare_and_add(entry, folio, gfp_mask,
> > > > - false, skip_if_exists);
> > > > + result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
> > > > if (result == folio)
> > > > *new_page_allocated = true;
> > > > else
> > > > @@ -578,7 +571,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
> > > > unsigned long nr_pages = folio_nr_pages(folio);
> > > >
> > > > entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
> > > > - swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
> > > > + swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
> > > > if (swapcache == folio)
> > > > swap_read_folio(folio, NULL);
> > > > return swapcache;
> > >
> > > I wonder if we could also drop the "charged" — it doesn’t seem
> > > difficult to move the charging step before
> > > __swap_cache_prepare_and_add(), even for swap_cache_alloc_folio()?
> >
> > Hi Barry, thanks for the review and suggestion.
> >
> > It may cause much more serious cgroup thrashing. Charging may trigger
> > reclaim, so racing swapins would have a much larger race window and do
> > a lot of repeated folio allocation / charging.
> >
> > This param exists because anon / shmem do their own charging for large
> > folio swapin and then insert the folio into the swap cache, which
> > already causes more memory pressure. Ideally I think we want to unify
> > all allocation & charging for swapin folios and have a
> > swap_cache_alloc_folio that supports `orders`. For racing swapins, only
> > one will successfully insert a folio into the swap cache and charge it,
> > which should make the race window very tiny, or with further work maybe
> > avoid redundant folio allocation completely. I did some tests and they
> > show this improves memory usage and avoids some OOMs under pressure for
> > (m)THP.
>
> This is quite interesting. I wonder if the change below could help reduce
> mTHP swap thrashing. With it, the fallback order-0 path also charges after
> swap_cache_add_folio(), as order-0 pages are typically the ones triggering
> memcg reclamation.
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 27d91ae3648a..d97f1a8a5ca3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4470,11 +4470,13 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
> return NULL;
>
> entry = pte_to_swp_entry(vmf->orig_pte);
> +#if 0
> if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> GFP_KERNEL, entry)) {
> folio_put(folio);
> return NULL;
> }
> +#endif
>
> return folio;
> }
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 2bf72d58f6ee..9d0b55deacc6 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -605,7 +605,7 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
> unsigned long nr_pages = folio_nr_pages(folio);
>
> entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
> - swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
> + swapcache = __swap_cache_prepare_and_add(entry, folio, 0, folio_order(folio));
> if (swapcache == folio)
> swap_read_folio(folio, NULL);
> return swapcache;
Yeah, that will surely improve the thrashing issue. Passing a
`folio_order` value as the `charged` parameter looks strange, though.

Ideally we will have swap_cache_alloc_folio do all the folio allocation,
so there won't be so many different swapin folio charging callsites
(currently we have more than three: anon THP, anon order 0, shmem THP,
and the common order-0 path in swap_cache_alloc_folio). That will also
help remove a WARN_ON check in Patch 3.
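
Just to sketch that direction (purely hypothetical, nothing in this
series defines this interface; the signature below only illustrates the
idea of a single allocation and charging site):

/*
 * Hypothetical future interface, sketched only to illustrate the
 * "one allocation and charging site" idea above; not from this series.
 */
struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
				     struct mempolicy *mpol, pgoff_t ilx,
				     int order, struct mm_struct *mm,
				     bool *new_folio_allocated);
/*
 * It would allocate a folio of the requested order (falling back as
 * needed), insert it into the swap cache so racing swapins see it
 * early, and charge it to @mm's memcg only for the race winner, so the
 * anon THP, anon order-0, shmem THP, and common order-0 paths no longer
 * need their own charging.
 */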