[PATCH 06/19] mm, swap: free the swap cache after folio is mapped

Kairui Song posted 19 patches 3 months, 1 week ago
Posted by Kairui Song 3 months, 1 week ago
From: Kairui Song <kasong@tencent.com>

To prevent repeated faults from parallel swapin of the same PTE, remove
the folio from the swap cache only after the folio is mapped. Any user
faulting on the swap PTE should then see the folio in the swap cache and
wait on it.
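
For illustration, the waiter side looks roughly like this (a simplified
sketch, not the exact do_swap_page() code; the swap cache lookup helper is
shown with a simplified signature):

        /* Parallel fault on the same swap PTE, racing with the mapper: */
        folio = swap_cache_get_folio(entry);    /* hit: the mapper's folio */
        if (folio) {
                /*
                 * Sleeps until the mapper has set the PTE and unlocked
                 * the folio.
                 */
                folio_lock(folio);
                /*
                 * By now the PTE no longer matches orig_pte, so this
                 * fault backs out without another read from the swap
                 * device.
                 */
        }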

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/memory.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 6c5cd86c4a66..589d6fc3d424 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4362,6 +4362,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
 static inline bool should_try_to_free_swap(struct swap_info_struct *si,
 					   struct folio *folio,
 					   struct vm_area_struct *vma,
+					   unsigned int extra_refs,
 					   unsigned int fault_flags)
 {
 	if (!folio_test_swapcache(folio))
@@ -4384,7 +4385,7 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
 	 * reference only in case it's likely that we'll be the exclusive user.
 	 */
 	return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
-		folio_ref_count(folio) == (1 + folio_nr_pages(folio));
+		folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
 }
 
 static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
@@ -4935,15 +4936,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 */
 	arch_swap_restore(folio_swap(entry, folio), folio);
 
-	/*
-	 * Remove the swap entry and conditionally try to free up the swapcache.
-	 * We're already holding a reference on the page but haven't mapped it
-	 * yet.
-	 */
-	swap_free_nr(entry, nr_pages);
-	if (should_try_to_free_swap(si, folio, vma, vmf->flags))
-		folio_free_swap(folio);
-
 	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
 	add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
 	pte = mk_pte(page, vma->vm_page_prot);
@@ -4997,6 +4989,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	arch_do_swap_page_nr(vma->vm_mm, vma, address,
 			pte, pte, nr_pages);
 
+	/*
+	 * Remove the swap entry and conditionally try to free up the
+	 * swapcache. Do it after mapping so any raced page fault will
+	 * see the folio in swap cache and wait for us.
+	 */
+	swap_free_nr(entry, nr_pages);
+	if (should_try_to_free_swap(si, folio, vma, nr_pages, vmf->flags))
+		folio_free_swap(folio);
+
 	folio_unlock(folio);
 	if (unlikely(folio != swapcache)) {
 		/*

-- 
2.51.1
Re: [PATCH 06/19] mm, swap: free the swap cache after folio is mapped
Posted by Barry Song 3 months, 1 week ago
On Wed, Oct 29, 2025 at 11:59 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> To prevent repeated faults of parallel swapin of the same PTE, remove
> the folio from the swap cache after the folio is mapped. So any user
> faulting from the swap PTE should see the folio in the swap cache and
> wait on it.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/memory.c | 21 +++++++++++----------
>  1 file changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 6c5cd86c4a66..589d6fc3d424 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4362,6 +4362,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
>  static inline bool should_try_to_free_swap(struct swap_info_struct *si,
>                                            struct folio *folio,
>                                            struct vm_area_struct *vma,
> +                                          unsigned int extra_refs,
>                                            unsigned int fault_flags)
>  {
>         if (!folio_test_swapcache(folio))
> @@ -4384,7 +4385,7 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
>          * reference only in case it's likely that we'll be the exclusive user.
>          */
>         return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
> -               folio_ref_count(folio) == (1 + folio_nr_pages(folio));
> +               folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
>  }
>
>  static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
> @@ -4935,15 +4936,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>          */
>         arch_swap_restore(folio_swap(entry, folio), folio);
>
> -       /*
> -        * Remove the swap entry and conditionally try to free up the swapcache.
> -        * We're already holding a reference on the page but haven't mapped it
> -        * yet.
> -        */
> -       swap_free_nr(entry, nr_pages);
> -       if (should_try_to_free_swap(si, folio, vma, vmf->flags))
> -               folio_free_swap(folio);
> -
>         add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>         add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
>         pte = mk_pte(page, vma->vm_page_prot);
> @@ -4997,6 +4989,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>         arch_do_swap_page_nr(vma->vm_mm, vma, address,
>                         pte, pte, nr_pages);
>
> +       /*
> +        * Remove the swap entry and conditionally try to free up the
> +        * swapcache. Do it after mapping so any raced page fault will
> +        * see the folio in swap cache and wait for us.

This seems like the right optimization: it reduces the race window where we
might allocate a folio, perform the read, and then attempt to map it, only to
find after taking the PTL that the PTE has already changed.

Although I am not entirely sure that “any raced page fault will see the folio in
swapcache,” it seems there could still be cases where a fault occurs after
folio_free_swap(), and thus can’t see the swapcache entry.

T1:
swap in PF, allocate and add swapcache, map PTE, delete swapcache

T2:
swap in PF before PTE is changed;
...........................................................;
check swapcache after T1 deletes swapcache -> no swapcache found.


> +        */
> +       swap_free_nr(entry, nr_pages);
> +       if (should_try_to_free_swap(si, folio, vma, nr_pages, vmf->flags))
> +               folio_free_swap(folio);
> +
>         folio_unlock(folio);
>         if (unlikely(folio != swapcache)) {
>                 /*
>
> --
> 2.51.1
>

Thanks
Barry
Re: [PATCH 06/19] mm, swap: free the swap cache after folio is mapped
Posted by Kairui Song 3 months, 1 week ago
On Tue, Nov 4, 2025 at 5:15 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, Oct 29, 2025 at 11:59 PM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > To prevent repeated faults of parallel swapin of the same PTE, remove
> > the folio from the swap cache after the folio is mapped. So any user
> > faulting from the swap PTE should see the folio in the swap cache and
> > wait on it.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  mm/memory.c | 21 +++++++++++----------
> >  1 file changed, 11 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 6c5cd86c4a66..589d6fc3d424 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4362,6 +4362,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
> >  static inline bool should_try_to_free_swap(struct swap_info_struct *si,
> >                                            struct folio *folio,
> >                                            struct vm_area_struct *vma,
> > +                                          unsigned int extra_refs,
> >                                            unsigned int fault_flags)
> >  {
> >         if (!folio_test_swapcache(folio))
> > @@ -4384,7 +4385,7 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
> >          * reference only in case it's likely that we'll be the exclusive user.
> >          */
> >         return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
> > -               folio_ref_count(folio) == (1 + folio_nr_pages(folio));
> > +               folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
> >  }
> >
> >  static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
> > @@ -4935,15 +4936,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >          */
> >         arch_swap_restore(folio_swap(entry, folio), folio);
> >
> > -       /*
> > -        * Remove the swap entry and conditionally try to free up the swapcache.
> > -        * We're already holding a reference on the page but haven't mapped it
> > -        * yet.
> > -        */
> > -       swap_free_nr(entry, nr_pages);
> > -       if (should_try_to_free_swap(si, folio, vma, vmf->flags))
> > -               folio_free_swap(folio);
> > -
> >         add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> >         add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
> >         pte = mk_pte(page, vma->vm_page_prot);
> > @@ -4997,6 +4989,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >         arch_do_swap_page_nr(vma->vm_mm, vma, address,
> >                         pte, pte, nr_pages);
> >
> > +       /*
> > +        * Remove the swap entry and conditionally try to free up the
> > +        * swapcache. Do it after mapping so any raced page fault will
> > +        * see the folio in swap cache and wait for us.
>
> This seems like the right optimization—it reduces the race window where we might
> allocate a folio, perform the read, and then attempt to map it, only
> to find after
> taking the PTL that the PTE has already changed.
>
> Although I am not entirely sure that “any raced page fault will see the folio in
> swapcache,” it seems there could still be cases where a fault occurs after
> folio_free_swap(), and thus can’t see the swapcache entry.
>
> T1:
> swap in PF, allocate and add swapcache, map PTE, delete swapcache
>
> T2:
> swap in PF before PTE is changed;
> ...........................................................;
> check swapcache after T1 deletes swapcache -> no swapcache found.

Right, that's true. But we will have at most one repeated fault, and the
time window is much smaller. T2 will see PTE != orig_pte and then return
just fine.
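
That is just the usual recheck under the PTL; roughly (a simplified sketch
of the existing do_swap_page() check, not new code):

        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                       vmf->address, &vmf->ptl);
        if (unlikely(!vmf->pte ||
                     !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                goto out_nomap; /* T1 already mapped it; back out and return */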

So this patch only reduces the race time window for potentially better
performance, and this race is basically harmless anyway. I think it's
good enough.
Re: [PATCH 06/19] mm, swap: free the swap cache after folio is mapped
Posted by Barry Song 3 months ago
On Tue, Nov 4, 2025 at 6:51 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Tue, Nov 4, 2025 at 5:15 PM Barry Song <21cnbao@gmail.com> wrote:
> >
> > On Wed, Oct 29, 2025 at 11:59 PM Kairui Song <ryncsn@gmail.com> wrote:
> > >
> > > From: Kairui Song <kasong@tencent.com>
> > >
> > > To prevent repeated faults of parallel swapin of the same PTE, remove
> > > the folio from the swap cache after the folio is mapped. So any user
> > > faulting from the swap PTE should see the folio in the swap cache and
> > > wait on it.
> > >
> > > Signed-off-by: Kairui Song <kasong@tencent.com>
> > > ---
> > >  mm/memory.c | 21 +++++++++++----------
> > >  1 file changed, 11 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index 6c5cd86c4a66..589d6fc3d424 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -4362,6 +4362,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
> > >  static inline bool should_try_to_free_swap(struct swap_info_struct *si,
> > >                                            struct folio *folio,
> > >                                            struct vm_area_struct *vma,
> > > +                                          unsigned int extra_refs,
> > >                                            unsigned int fault_flags)
> > >  {
> > >         if (!folio_test_swapcache(folio))
> > > @@ -4384,7 +4385,7 @@ static inline bool should_try_to_free_swap(struct swap_info_struct *si,
> > >          * reference only in case it's likely that we'll be the exclusive user.
> > >          */
> > >         return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
> > > -               folio_ref_count(folio) == (1 + folio_nr_pages(folio));
> > > +               folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
> > >  }
> > >
> > >  static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
> > > @@ -4935,15 +4936,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > >          */
> > >         arch_swap_restore(folio_swap(entry, folio), folio);
> > >
> > > -       /*
> > > -        * Remove the swap entry and conditionally try to free up the swapcache.
> > > -        * We're already holding a reference on the page but haven't mapped it
> > > -        * yet.
> > > -        */
> > > -       swap_free_nr(entry, nr_pages);
> > > -       if (should_try_to_free_swap(si, folio, vma, vmf->flags))
> > > -               folio_free_swap(folio);
> > > -
> > >         add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> > >         add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
> > >         pte = mk_pte(page, vma->vm_page_prot);
> > > @@ -4997,6 +4989,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > >         arch_do_swap_page_nr(vma->vm_mm, vma, address,
> > >                         pte, pte, nr_pages);
> > >
> > > +       /*
> > > +        * Remove the swap entry and conditionally try to free up the
> > > +        * swapcache. Do it after mapping so any raced page fault will
> > > +        * see the folio in swap cache and wait for us.
> >
> > This seems like the right optimization—it reduces the race window where we might
> > allocate a folio, perform the read, and then attempt to map it, only
> > to find after
> > taking the PTL that the PTE has already changed.
> >
> > Although I am not entirely sure that “any raced page fault will see the folio in
> > swapcache,” it seems there could still be cases where a fault occurs after
> > folio_free_swap(), and thus can’t see the swapcache entry.
> >
> > T1:
> > swap in PF, allocate and add swapcache, map PTE, delete swapcache
> >
> > T2:
> > swap in PF before PTE is changed;
> > ...........................................................;
> > check swapcache after T1 deletes swapcache -> no swapcache found.
>
> Right, that's true. But we will at most only have one repeated fault,
> and the time window is much smaller. T2 will PTE != orig_pte and then
> return just fine.
>
> So this patch is only reducing the race time window for a potentially
> better performance, and this race is basically harmless anyway. I
> think it's good enough.

Right. What I really disagree with is "Do it after mapping so any raced page
fault will see the folio in swap cache and wait for us". It sounds like it
guarantees no race at all, so I'd rather we change it to something like
"reduced race window".

Thanks
Barry