From: Kairui Song <kasong@tencent.com>

Swap cache lookup is lockless; it only increases the reference count
of the returned folio. That's not enough to ensure a folio is stable in
the swap cache, so the folio could be removed from the swap cache at any
time. The caller always has to lock and check the folio before use.

Document this as a comment, and introduce a helper for swap cache folio
verification with proper sanity checks.

Also, sanitize all current users to follow this convention, and use the
new helper where possible for easier debugging. Some existing callers
won't cause any major problem right now, only trivial issues like
incorrect readahead statistics (swapin) or a wasted loop (swapoff).
It's better to always follow this convention to make things robust.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/memory.c | 28 +++++++++++++---------------
mm/shmem.c | 4 ++--
mm/swap.h | 28 ++++++++++++++++++++++++++++
mm/swap_state.c | 13 +++++++++----
mm/swapfile.c | 10 ++++++++--
5 files changed, 60 insertions(+), 23 deletions(-)
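
As an illustration of the convention described above (this sketch is not
part of the patch: swap_cache_get_folio() and folio_contains_swap() are
the helpers this series provides, while swapin_lookup_stable() is just a
made-up name for the example), callers are expected to follow roughly
this pattern:

/*
 * Illustrative only: the lockless lookup merely takes a reference,
 * so the folio must be locked and re-checked before it is trusted.
 */
static struct folio *swapin_lookup_stable(swp_entry_t entry)
{
	struct folio *folio;

	/* Lockless lookup: the folio may leave the swap cache at any time. */
	folio = swap_cache_get_folio(entry);
	if (!folio)
		return NULL;

	folio_lock(folio);

	/*
	 * Between the lookup and the lock the folio may have been freed,
	 * reused, or moved out of the swap cache, so verify it still
	 * backs @entry before using any of its state.
	 */
	if (!folio_contains_swap(folio, entry)) {
		folio_unlock(folio);
		folio_put(folio);
		return NULL;
	}

	/* Locked and verified: stable until unlocked. */
	return folio;
}

On failure the caller falls back to its usual slow path (allocate a new
folio or retry), much like the callers updated in the diff below do when
their own checks fail.
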
diff --git a/mm/memory.c b/mm/memory.c
index 10ef528a5f44..9ca8e1873c6e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4661,12 +4661,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out;
folio = swap_cache_get_folio(entry);
- if (folio) {
- swap_update_readahead(folio, vma, vmf->address);
- page = folio_file_page(folio, swp_offset(entry));
- }
swapcache = folio;
-
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
@@ -4735,20 +4730,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
- page = folio_file_page(folio, swp_offset(entry));
- } else if (PageHWPoison(page)) {
- /*
- * hwpoisoned dirty swapcache pages are kept for killing
- * owner processes (which may be unknown at hwpoison time)
- */
- ret = VM_FAULT_HWPOISON;
- goto out_release;
}
ret |= folio_lock_or_retry(folio, vmf);
if (ret & VM_FAULT_RETRY)
goto out_release;
+ page = folio_file_page(folio, swp_offset(entry));
if (swapcache) {
/*
* Make sure folio_free_swap() or swapoff did not release the
@@ -4757,10 +4745,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* swapcache, we need to check that the page's swap has not
* changed.
*/
- if (unlikely(!folio_test_swapcache(folio) ||
- page_swap_entry(page).val != entry.val))
+ if (!folio_contains_swap(folio, entry))
goto out_page;
+ if (PageHWPoison(page)) {
+ /*
+ * hwpoisoned dirty swapcache pages are kept for killing
+ * owner processes (which may be unknown at hwpoison time)
+ */
+ ret = VM_FAULT_HWPOISON;
+ goto out_page;
+ }
+
+ swap_update_readahead(folio, vma, vmf->address);
+
/*
* KSM sometimes has to copy on read faults, for example, if
* folio->index of non-ksm folios would be nonlinear inside the
diff --git a/mm/shmem.c b/mm/shmem.c
index e9d0d2784cd5..b4d39f2a1e0a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2379,8 +2379,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
- } else {
- swap_update_readahead(folio, NULL, 0);
}
if (order > folio_order(folio)) {
@@ -2431,6 +2429,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
error = -EIO;
goto failed;
}
+ if (!skip_swapcache)
+ swap_update_readahead(folio, NULL, 0);
folio_wait_writeback(folio);
nr_pages = folio_nr_pages(folio);
diff --git a/mm/swap.h b/mm/swap.h
index efb6d7ff9f30..bb2adbfd64a9 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -52,6 +52,29 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry)
return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK;
}
+/**
+ * folio_contains_swap - Does this folio contain this swap entry?
+ * @folio: The folio.
+ * @entry: The swap entry to check against.
+ *
+ * Swap version of folio_contains()
+ *
+ * Context: The caller should have the folio locked to ensure
+ * nothing will move it out of the swap cache.
+ * Return: true or false.
+ */
+static inline bool folio_contains_swap(struct folio *folio, swp_entry_t entry)
+{
+ pgoff_t offset = swp_offset(entry);
+
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+ if (unlikely(!folio_test_swapcache(folio)))
+ return false;
+ if (unlikely(swp_type(entry) != swp_type(folio->swap)))
+ return false;
+ return offset - swp_offset(folio->swap) < folio_nr_pages(folio);
+}
+
void show_swap_cache_info(void);
void *get_shadow_from_swap_cache(swp_entry_t entry);
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
@@ -144,6 +167,11 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry)
return 0;
}
+static inline bool folio_contains_swap(struct folio *folio, swp_entry_t entry)
+{
+ return false;
+}
+
static inline void show_swap_cache_info(void)
{
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ff9eb761a103..be0d96494dc1 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -70,10 +70,12 @@ void show_swap_cache_info(void)
}
/*
- * Lookup a swap entry in the swap cache. A found folio will be returned
- * unlocked and with its refcount incremented.
+ * swap_cache_get_folio - Lookup a swap entry in the swap cache.
*
- * Caller must lock the swap device or hold a reference to keep it valid.
+ * A found folio will be returned unlocked and with its refcount increased.
+ *
+ * Context: Caller must ensure @entry is valid and pin the swap device, also
+ * check the returned folio after locking it (e.g. folio_contains_swap).
*/
struct folio *swap_cache_get_folio(swp_entry_t entry)
{
@@ -338,7 +340,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
for (;;) {
int err;
- /* Check the swap cache in case the folio is already there */
+ /*
+ * Check the swap cache first, if a cached folio is found,
+ * return it unlocked. The caller will lock and check it.
+ */
folio = swap_cache_get_folio(entry);
if (folio)
goto got_folio;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4b8ab2cb49ca..12f2580ebe8d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -240,12 +240,12 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
* Offset could point to the middle of a large folio, or folio
* may no longer point to the expected offset before it's locked.
*/
- entry = folio->swap;
- if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) {
+ if (!folio_contains_swap(folio, entry)) {
folio_unlock(folio);
folio_put(folio);
goto again;
}
+ entry = folio->swap;
offset = swp_offset(entry);
need_reclaim = ((flags & TTRS_ANYWAY) ||
@@ -2150,6 +2150,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
folio_lock(folio);
+ if (!folio_contains_swap(folio, entry)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
folio_wait_writeback(folio);
ret = unuse_pte(vma, pmd, addr, entry, folio);
if (ret < 0) {
--
2.51.0
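
As an aside on the new helper before the replies: the containment check
relies on pgoff_t being unsigned, so the single comparison
"offset - swp_offset(folio->swap) < folio_nr_pages(folio)" rejects
offsets both before and after the folio, because an offset below the
folio start wraps around to a huge value. The snippet below is an
illustration only, written as plain userspace C with made-up numbers
rather than kernel code, mimicking that arithmetic:

#include <stdio.h>

/*
 * "start" stands in for swp_offset(folio->swap) and "nr" for
 * folio_nr_pages(); both are unsigned, like pgoff_t in the kernel.
 */
static int contains(unsigned long start, unsigned long nr, unsigned long offset)
{
	/*
	 * One unsigned compare handles both out-of-range cases: if
	 * offset < start, the subtraction wraps to a huge value.
	 */
	return offset - start < nr;
}

int main(void)
{
	unsigned long start = 512, nr = 8;	/* an order-3 folio at offset 512 */

	printf("%d\n", contains(start, nr, 511));	/* 0: before the folio */
	printf("%d\n", contains(start, nr, 512));	/* 1: first subpage */
	printf("%d\n", contains(start, nr, 519));	/* 1: last subpage */
	printf("%d\n", contains(start, nr, 520));	/* 0: past the folio */
	return 0;
}
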
On 22.08.25 21:20, Kairui Song wrote: > From: Kairui Song <kasong@tencent.com> > > Swap cache lookup is lockless, it only increases the reference count > of the returned folio. That's not enough to ensure a folio is stable in > the swap cache, so the folio could be removed from the swap cache at any > time. The caller always has to lock and check the folio before use. > > Document this as a comment, and introduce a helper for swap cache folio > verification with proper sanity checks. > > Also, sanitize all current users to use this convention, and use the new > helper when possible for easier debugging. Some existing callers won't > cause any major problem right now, only trivial issues like incorrect > readahead statistic (swapin) or wasted loop (swapoff). It's better to > always follow this convention to make things robust. > > Signed-off-by: Kairui Song <kasong@tencent.com> > --- [...] > +/** > + * folio_contains_swap - Does this folio contain this swap entry? > + * @folio: The folio. > + * @entry: The swap entry to check against. > + * > + * Swap version of folio_contains() > + * > + * Context: The caller should have the folio locked to ensure > + * nothing will move it out of the swap cache. > + * Return: true or false. > + */ I appreciate the kerneldoc. Intuitively, this should be called "..._swap_entry". But I wonder if "contains" is really the right term to use here. It's more like that a swap entry "belongs to" (was assigned to) a folio, right? Sure, we store the information in the folio, but the "contains" is a bit weird. folio_matches_swp_entry() maybe? > +static inline bool folio_contains_swap(struct folio *folio, swp_entry_t entry) > +{ const struct folio * > + pgoff_t offset = swp_offset(entry); > + > + VM_WARN_ON_ONCE(!folio_test_locked(folio)); > + if (unlikely(!folio_test_swapcache(folio))) > + return false; > + if (unlikely(swp_type(entry) != swp_type(folio->swap))) > + return false; > + return offset - swp_offset(folio->swap) < folio_nr_pages(folio); > +} > + > void show_swap_cache_info(void); > void *get_shadow_from_swap_cache(swp_entry_t entry); > int add_to_swap_cache(struct folio *folio, swp_entry_t entry, > @@ -144,6 +167,11 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) > return 0; > } > -- Cheers David / dhildenb
On Tue, Sep 2, 2025 at 3:18 AM David Hildenbrand <david@redhat.com> wrote: > > On 22.08.25 21:20, Kairui Song wrote: > > From: Kairui Song <kasong@tencent.com> > > > > Swap cache lookup is lockless, it only increases the reference count > > of the returned folio. That's not enough to ensure a folio is stable in > > the swap cache, so the folio could be removed from the swap cache at any > > time. The caller always has to lock and check the folio before use. > > > > Document this as a comment, and introduce a helper for swap cache folio > > verification with proper sanity checks. > > > > Also, sanitize all current users to use this convention, and use the new > > helper when possible for easier debugging. Some existing callers won't > > cause any major problem right now, only trivial issues like incorrect > > readahead statistic (swapin) or wasted loop (swapoff). It's better to > > always follow this convention to make things robust. > > > > Signed-off-by: Kairui Song <kasong@tencent.com> > > --- > > [...] > > > +/** > > + * folio_contains_swap - Does this folio contain this swap entry? > > + * @folio: The folio. > > + * @entry: The swap entry to check against. > > + * > > + * Swap version of folio_contains() > > + * > > + * Context: The caller should have the folio locked to ensure > > + * nothing will move it out of the swap cache. > > + * Return: true or false. > > + */ > > I appreciate the kerneldoc. > > Intuitively, this should be called "..._swap_entry". > > But I wonder if "contains" is really the right term to use here. It's > more like that a swap entry "belongs to" (was assigned to) a folio, right? Right, in the other design doc I use the word "binding" for the relationship between folio and swap entry. As if it is a binding contract, your folio data goes and only goes here. There is no owning relationship. Other folios might want to compete and win over the binding contract as well (the race in swap in). > Sure, we store the information in the folio, but the "contains" is a bit > weird. > > folio_matches_swp_entry() maybe? Yes, I like the name folio_match_swap_entry() you suggested in the other email as well. Chris
On Tue, Sep 2, 2025 at 9:03 PM Chris Li <chrisl@kernel.org> wrote: > > On Tue, Sep 2, 2025 at 3:18 AM David Hildenbrand <david@redhat.com> wrote: > > > > On 22.08.25 21:20, Kairui Song wrote: > > > From: Kairui Song <kasong@tencent.com> > > > > > > Swap cache lookup is lockless, it only increases the reference count > > > of the returned folio. That's not enough to ensure a folio is stable in > > > the swap cache, so the folio could be removed from the swap cache at any > > > time. The caller always has to lock and check the folio before use. > > > > > > Document this as a comment, and introduce a helper for swap cache folio > > > verification with proper sanity checks. > > > > > > Also, sanitize all current users to use this convention, and use the new > > > helper when possible for easier debugging. Some existing callers won't > > > cause any major problem right now, only trivial issues like incorrect > > > readahead statistic (swapin) or wasted loop (swapoff). It's better to > > > always follow this convention to make things robust. > > > > > > Signed-off-by: Kairui Song <kasong@tencent.com> > > > --- > > > > [...] > > > > > +/** > > > + * folio_contains_swap - Does this folio contain this swap entry? > > > + * @folio: The folio. > > > + * @entry: The swap entry to check against. > > > + * > > > + * Swap version of folio_contains() > > > + * > > > + * Context: The caller should have the folio locked to ensure > > > + * nothing will move it out of the swap cache. > > > + * Return: true or false. > > > + */ > > > > I appreciate the kerneldoc. > > > > Intuitively, this should be called "..._swap_entry". > > > > But I wonder if "contains" is really the right term to use here. It's > > more like that a swap entry "belongs to" (was assigned to) a folio, right? > > Right, in the other design doc I use the word "binding" for the > relationship between folio and swap entry. As if it is a binding > contract, your folio data goes and only goes here. There is no owning > relationship. Other folios might want to compete and win over the > binding contract as well (the race in swap in). > > > Sure, we store the information in the folio, but the "contains" is a bit > > weird. > > > > folio_matches_swp_entry() maybe? > > Yes, I like the name folio_match_swap_entry() you suggested in the > other email as well. I like this name too. The `folio_contains_swap` name comes from `folio_contains` as it's just like a swap version of it. But I also found the name a bit strange as they are different things, but had no better idea. Thanks for the suggestion. > > Chris >
On 02.09.25 12:18, David Hildenbrand wrote: > On 22.08.25 21:20, Kairui Song wrote: >> From: Kairui Song <kasong@tencent.com> >> >> Swap cache lookup is lockless, it only increases the reference count >> of the returned folio. That's not enough to ensure a folio is stable in >> the swap cache, so the folio could be removed from the swap cache at any >> time. The caller always has to lock and check the folio before use. >> >> Document this as a comment, and introduce a helper for swap cache folio >> verification with proper sanity checks. >> >> Also, sanitize all current users to use this convention, and use the new >> helper when possible for easier debugging. Some existing callers won't >> cause any major problem right now, only trivial issues like incorrect >> readahead statistic (swapin) or wasted loop (swapoff). It's better to >> always follow this convention to make things robust. >> >> Signed-off-by: Kairui Song <kasong@tencent.com> >> --- > > [...] > >> +/** >> + * folio_contains_swap - Does this folio contain this swap entry? >> + * @folio: The folio. >> + * @entry: The swap entry to check against. >> + * >> + * Swap version of folio_contains() >> + * >> + * Context: The caller should have the folio locked to ensure >> + * nothing will move it out of the swap cache. >> + * Return: true or false. >> + */ > > I appreciate the kerneldoc. > > Intuitively, this should be called "..._swap_entry". > > But I wonder if "contains" is really the right term to use here. It's > more like that a swap entry "belongs to" (was assigned to) a folio, right? > > Sure, we store the information in the folio, but the "contains" is a bit > weird. > > folio_matches_swp_entry() maybe? folio_matches_swap_entry() is what I wanted to say :) -- Cheers David / dhildenb
> +/**
> + * folio_contains_swap - Does this folio contain this swap entry?
> + * @folio: The folio.
> + * @entry: The swap entry to check against.
> + *
> + * Swap version of folio_contains()
> + *
> + * Context: The caller should have the folio locked to ensure
> + * nothing will move it out of the swap cache.
> + * Return: true or false.
> + */
> +static inline bool folio_contains_swap(struct folio *folio, swp_entry_t entry)
> +{
> + pgoff_t offset = swp_offset(entry);
> +
> + VM_WARN_ON_ONCE(!folio_test_locked(folio));

VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);

?

Thanks
Barry
On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote: > diff --git a/mm/shmem.c b/mm/shmem.c > index e9d0d2784cd5..b4d39f2a1e0a 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -2379,8 +2379,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, > count_vm_event(PGMAJFAULT); > count_memcg_event_mm(fault_mm, PGMAJFAULT); > } > - } else { > - swap_update_readahead(folio, NULL, 0); Also this update readahead move to later might have a similar problem. All the bail out in the move will lose the readahead status update. The readahead deed is already done. Missing the status update seems incorrect. > } > > if (order > folio_order(folio)) { > @@ -2431,6 +2429,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, > error = -EIO; > goto failed; > } > + if (!skip_swapcache) > + swap_update_readahead(folio, NULL, 0); > folio_wait_writeback(folio); > nr_pages = folio_nr_pages(folio); > > diff --git a/mm/swap.h b/mm/swap.h > index efb6d7ff9f30..bb2adbfd64a9 100644 > --- a/mm/swap.h > +++ b/mm/swap.h > @@ -52,6 +52,29 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) > return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; > } > > +/** > + * folio_contains_swap - Does this folio contain this swap entry? > + * @folio: The folio. > + * @entry: The swap entry to check against. > + * > + * Swap version of folio_contains() > + * > + * Context: The caller should have the folio locked to ensure > + * nothing will move it out of the swap cache. > + * Return: true or false. > + */ > +static inline bool folio_contains_swap(struct folio *folio, swp_entry_t entry) > +{ > + pgoff_t offset = swp_offset(entry); > + > + VM_WARN_ON_ONCE(!folio_test_locked(folio)); > + if (unlikely(!folio_test_swapcache(folio))) > + return false; > + if (unlikely(swp_type(entry) != swp_type(folio->swap))) > + return false; > + return offset - swp_offset(folio->swap) < folio_nr_pages(folio); > +} > + > void show_swap_cache_info(void); > void *get_shadow_from_swap_cache(swp_entry_t entry); > int add_to_swap_cache(struct folio *folio, swp_entry_t entry, > @@ -144,6 +167,11 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) > return 0; > } > > +static inline bool folio_contains_swap(struct folio *folio, swp_entry_t entry) > +{ > + return false; > +} > + > static inline void show_swap_cache_info(void) > { > } > diff --git a/mm/swap_state.c b/mm/swap_state.c > index ff9eb761a103..be0d96494dc1 100644 > --- a/mm/swap_state.c > +++ b/mm/swap_state.c > @@ -70,10 +70,12 @@ void show_swap_cache_info(void) > } > > /* > - * Lookup a swap entry in the swap cache. A found folio will be returned > - * unlocked and with its refcount incremented. > + * swap_cache_get_folio - Lookup a swap entry in the swap cache. > * > - * Caller must lock the swap device or hold a reference to keep it valid. > + * A found folio will be returned unlocked and with its refcount increased. > + * > + * Context: Caller must ensure @entry is valid and pin the swap device, also Is the "pin" the same as "lock the swap device or hold a reference"? Not sure why you changed that comment to "pin". It seems to me that you want to add the comment for the return value check. Is that it? > + * check the returned folio after locking it (e.g. folio_swap_contains). 
> */ > struct folio *swap_cache_get_folio(swp_entry_t entry) > { > @@ -338,7 +340,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > for (;;) { > int err; > > - /* Check the swap cache in case the folio is already there */ > + /* > + * Check the swap cache first, if a cached folio is found, > + * return it unlocked. The caller will lock and check it. > + */ > folio = swap_cache_get_folio(entry); > if (folio) > goto got_folio; > diff --git a/mm/swapfile.c b/mm/swapfile.c > index 4b8ab2cb49ca..12f2580ebe8d 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -240,12 +240,12 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > * Offset could point to the middle of a large folio, or folio > * may no longer point to the expected offset before it's locked. > */ > - entry = folio->swap; > - if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) { > + if (!folio_contains_swap(folio, entry)) { > folio_unlock(folio); > folio_put(folio); > goto again; > } > + entry = folio->swap; Can you also check this as well? The "goto again" will have entries not assigned compared to previously. Too late for me to think straight now if that will cause a problem. To be continued. Chris
On Wed, Aug 27, 2025 at 4:21 PM Chris Li <chrisl@kernel.org> wrote: > > On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote: > > > diff --git a/mm/shmem.c b/mm/shmem.c > > index e9d0d2784cd5..b4d39f2a1e0a 100644 > > --- a/mm/shmem.c > > +++ b/mm/shmem.c > > @@ -2379,8 +2379,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, > > count_vm_event(PGMAJFAULT); > > count_memcg_event_mm(fault_mm, PGMAJFAULT); > > } > > - } else { > > - swap_update_readahead(folio, NULL, 0); > > Also this update readahead move to later might have a similar problem. > All the bail out in the move will lose the readahead status update. > > The readahead deed is already done. Missing the status update seems > incorrect. Thanks for the detailed review. The only change I wanted here is that swap readahead update should be done after checking the folio still corresponds to the swap entry triggering the swapin. That should have slight to none effect compared to before considering the extremely tiny time window. We are only following the convention more strictly. In theory it might even help to reduce false updates: if the folio no longer corresponds to the swap entry, we are hitting an unrelated folio, doing a readahead update will either mislead vma readahead's address hint, or could clean up the readahead flag of an unrelated folio without actually using it. If the folio does get hit in the future, due to the missing readahead flag, the statistic will go wrong. > > > > } > > > > if (order > folio_order(folio)) { > > @@ -2431,6 +2429,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, > > error = -EIO; > > goto failed; > > } > > + if (!skip_swapcache) > > + swap_update_readahead(folio, NULL, 0); > > folio_wait_writeback(folio); > > nr_pages = folio_nr_pages(folio); > > > > > > diff --git a/mm/swap.h b/mm/swap.h > > index efb6d7ff9f30..bb2adbfd64a9 100644 > > --- a/mm/swap.h > > +++ b/mm/swap.h > > @@ -52,6 +52,29 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) > > return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; > > } > > > > +/** > > + * folio_contains_swap - Does this folio contain this swap entry? > > + * @folio: The folio. > > + * @entry: The swap entry to check against. > > + * > > + * Swap version of folio_contains() > > + * > > + * Context: The caller should have the folio locked to ensure > > + * nothing will move it out of the swap cache. > > + * Return: true or false. 
> > + */ > > +static inline bool folio_contains_swap(struct folio *folio, swp_entry_t entry) > > +{ > > + pgoff_t offset = swp_offset(entry); > > + > > + VM_WARN_ON_ONCE(!folio_test_locked(folio)); > > + if (unlikely(!folio_test_swapcache(folio))) > > + return false; > > + if (unlikely(swp_type(entry) != swp_type(folio->swap))) > > + return false; > > + return offset - swp_offset(folio->swap) < folio_nr_pages(folio); > > +} > > + > > void show_swap_cache_info(void); > > void *get_shadow_from_swap_cache(swp_entry_t entry); > > int add_to_swap_cache(struct folio *folio, swp_entry_t entry, > > @@ -144,6 +167,11 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) > > return 0; > > } > > > > +static inline bool folio_contains_swap(struct folio *folio, swp_entry_t entry) > > +{ > > + return false; > > +} > > + > > static inline void show_swap_cache_info(void) > > { > > } > > diff --git a/mm/swap_state.c b/mm/swap_state.c > > index ff9eb761a103..be0d96494dc1 100644 > > --- a/mm/swap_state.c > > +++ b/mm/swap_state.c > > @@ -70,10 +70,12 @@ void show_swap_cache_info(void) > > } > > > > /* > > - * Lookup a swap entry in the swap cache. A found folio will be returned > > - * unlocked and with its refcount incremented. > > + * swap_cache_get_folio - Lookup a swap entry in the swap cache. > > * > > - * Caller must lock the swap device or hold a reference to keep it valid. > > + * A found folio will be returned unlocked and with its refcount increased. > > + * > > + * Context: Caller must ensure @entry is valid and pin the swap device, also > Is the "pin" the same as "lock the swap device or hold a reference"? > Not sure why you changed that comment to "pin". Yes it's the same thing. We don't lock the device though, the device can be pinned by the refcounf (get_swap_device) or locking anything that is referencing the device (locking PTL the a PTE that contains an swap entry pointing to the device, or locking a swap cache folio of a swap entry that points to the device). So I juse used the word "pin". I added some comments in mm/swap.h in later commits about what the "pin" means. > > It seems to me that you want to add the comment for the return value check. > Is that it? Right, the caller has to check the folio before use, so I'm trying to document this convention. > > + * check the returned folio after locking it (e.g. folio_swap_contains). > > */ > > struct folio *swap_cache_get_folio(swp_entry_t entry) > > { > > @@ -338,7 +340,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > for (;;) { > > int err; > > > > - /* Check the swap cache in case the folio is already there */ > > + /* > > + * Check the swap cache first, if a cached folio is found, > > + * return it unlocked. The caller will lock and check it. > > + */ > > folio = swap_cache_get_folio(entry); > > if (folio) > > goto got_folio; > > diff --git a/mm/swapfile.c b/mm/swapfile.c > > index 4b8ab2cb49ca..12f2580ebe8d 100644 > > --- a/mm/swapfile.c > > +++ b/mm/swapfile.c > > @@ -240,12 +240,12 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > > * Offset could point to the middle of a large folio, or folio > > * may no longer point to the expected offset before it's locked. > > */ > > - entry = folio->swap; > > - if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) { > > + if (!folio_contains_swap(folio, entry)) { > > folio_unlock(folio); > > folio_put(folio); > > goto again; > > } > > + entry = folio->swap; > > Can you also check this as well? 
The "goto again" will have entries > not assigned compared to previously. > Too late for me to think straight now if that will cause a problem. Oh, thanks for pointing this part out. This patch is correct, it's the original behaviour that is not correct. If the folio is no longer valid (the if check here failed), changing the `entry` value before could lead to a wrong look in the next attempt with `goto again`. That could lead to reclaim of an unrelated folio. It's a trivial issue though, only might marginally slow down the performance. Maybe I should make a seperate patch to fix this issue first in case anyone wants to backport it. > > To be continued. > > Chris >
On Wed, Aug 27, 2025 at 7:36 AM Kairui Song <ryncsn@gmail.com> wrote: > > On Wed, Aug 27, 2025 at 4:21 PM Chris Li <chrisl@kernel.org> wrote: > > > > On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote: > > > > > diff --git a/mm/shmem.c b/mm/shmem.c > > > index e9d0d2784cd5..b4d39f2a1e0a 100644 > > > --- a/mm/shmem.c > > > +++ b/mm/shmem.c > > > @@ -2379,8 +2379,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, > > > count_vm_event(PGMAJFAULT); > > > count_memcg_event_mm(fault_mm, PGMAJFAULT); > > > } > > > - } else { > > > - swap_update_readahead(folio, NULL, 0); > > > > Also this update readahead move to later might have a similar problem. > > All the bail out in the move will lose the readahead status update. > > > > The readahead deed is already done. Missing the status update seems > > incorrect. > > Thanks for the detailed review. > > The only change I wanted here is that swap readahead update should be > done after checking the folio still corresponds to the swap entry > triggering the swapin. That should have slight to none effect compared > to before considering the extremely tiny time window. We are only > following the convention more strictly. > > In theory it might even help to reduce false updates: if the folio no > longer corresponds to the swap entry, we are hitting an unrelated > folio, doing a readahead update will either mislead vma readahead's > address hint, or could clean up the readahead flag of an unrelated > folio without actually using it. If the folio does get hit in the > future, due to the missing readahead flag, the statistic will go > wrong. So the missing readahead stats update behavior is the correct and better behavior. I suggest you spit that out as a separate patch with appropriate comments about it too. It is also easier to bisect the commit if that kind of the subtle change which is considered safe turns out causing a problem. Causing problem not happen very often but it does happen before. > > > /* > > > - * Lookup a swap entry in the swap cache. A found folio will be returned > > > - * unlocked and with its refcount incremented. > > > + * swap_cache_get_folio - Lookup a swap entry in the swap cache. > > > * > > > - * Caller must lock the swap device or hold a reference to keep it valid. > > > + * A found folio will be returned unlocked and with its refcount increased. > > > + * > > > + * Context: Caller must ensure @entry is valid and pin the swap device, also > > Is the "pin" the same as "lock the swap device or hold a reference"? > > Not sure why you changed that comment to "pin". > > Yes it's the same thing. We don't lock the device though, the device > can be pinned by the refcounf (get_swap_device) or locking anything > that is referencing the device (locking PTL the a PTE that contains an > swap entry pointing to the device, or locking a swap cache folio of a > swap entry that points to the device). So I juse used the word "pin". > I added some comments in mm/swap.h in later commits about what the > "pin" means. In that case why not reuse the previous comment keeping "lock the swap device or hold a reference" instead of "pin"? > > It seems to me that you want to add the comment for the return value check. > > Is that it? > > Right, the caller has to check the folio before use, so I'm trying to > document this convention. Again, I recommend reducing the unnecessary impact to the code, make it more obvious what you did actually change. 
I spend quite some time there trying to figure out what you are trying to accomplish with the comments. > > > + * check the returned folio after locking it (e.g. folio_swap_contains). > > > */ > > > struct folio *swap_cache_get_folio(swp_entry_t entry) > > > { > > > @@ -338,7 +340,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > > for (;;) { > > > int err; > > > > > > - /* Check the swap cache in case the folio is already there */ > > > + /* > > > + * Check the swap cache first, if a cached folio is found, > > > + * return it unlocked. The caller will lock and check it. > > > + */ > > > folio = swap_cache_get_folio(entry); > > > if (folio) > > > goto got_folio; > > > diff --git a/mm/swapfile.c b/mm/swapfile.c > > > index 4b8ab2cb49ca..12f2580ebe8d 100644 > > > --- a/mm/swapfile.c > > > +++ b/mm/swapfile.c > > > @@ -240,12 +240,12 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > > > * Offset could point to the middle of a large folio, or folio > > > * may no longer point to the expected offset before it's locked. > > > */ > > > - entry = folio->swap; > > > - if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) { > > > + if (!folio_contains_swap(folio, entry)) { > > > folio_unlock(folio); > > > folio_put(folio); > > > goto again; > > > } > > > + entry = folio->swap; > > > > Can you also check this as well? The "goto again" will have entries > > not assigned compared to previously. > > Too late for me to think straight now if that will cause a problem. > > Oh, thanks for pointing this part out. This patch is correct, it's the > original behaviour that is not correct. If the folio is no longer > valid (the if check here failed), changing the `entry` value before > could lead to a wrong look in the next attempt with `goto again`. That > could lead to reclaim of an unrelated folio. It's a trivial issue > though, only might marginally slow down the performance. Maybe I > should make a seperate patch to fix this issue first in case anyone > wants to backport it. Thanks for the explanation, please do split this subtle behavior change out with appropriate commit messages documenting your change, why it is safe and the better behavior. Thanks Chris
On Sat, Aug 30, 2025 at 9:54 AM Chris Li <chrisl@kernel.org> wrote: > > On Wed, Aug 27, 2025 at 7:36 AM Kairui Song <ryncsn@gmail.com> wrote: > > > > On Wed, Aug 27, 2025 at 4:21 PM Chris Li <chrisl@kernel.org> wrote: > > > > > > On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote: > > > > > > > diff --git a/mm/shmem.c b/mm/shmem.c > > > > index e9d0d2784cd5..b4d39f2a1e0a 100644 > > > > --- a/mm/shmem.c > > > > +++ b/mm/shmem.c > > > > @@ -2379,8 +2379,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, > > > > count_vm_event(PGMAJFAULT); > > > > count_memcg_event_mm(fault_mm, PGMAJFAULT); > > > > } > > > > - } else { > > > > - swap_update_readahead(folio, NULL, 0); > > > > > > Also this update readahead move to later might have a similar problem. > > > All the bail out in the move will lose the readahead status update. > > > > > > The readahead deed is already done. Missing the status update seems > > > incorrect. > > > > Thanks for the detailed review. > > > > The only change I wanted here is that swap readahead update should be > > done after checking the folio still corresponds to the swap entry > > triggering the swapin. That should have slight to none effect compared > > to before considering the extremely tiny time window. We are only > > following the convention more strictly. > > > > In theory it might even help to reduce false updates: if the folio no > > longer corresponds to the swap entry, we are hitting an unrelated > > folio, doing a readahead update will either mislead vma readahead's > > address hint, or could clean up the readahead flag of an unrelated > > folio without actually using it. If the folio does get hit in the > > future, due to the missing readahead flag, the statistic will go > > wrong. > > So the missing readahead stats update behavior is the correct and > better behavior. I suggest you spit that out as a separate patch with > appropriate comments about it too. It is also easier to bisect the > commit if that kind of the subtle change which is considered safe > turns out causing a problem. Causing problem not happen very often but > it does happen before. > Hmm, after a second thought, maybe we should keep it as it is for now. I just realized moving the swap_update_readahead after folio_lock is more than just ensuring the folio is still valid. It will also cause every swapin to do a readahead update. Previously, only cache hit swapin will do a swap readahead update. I did some tests, and didn't see any measurable performance difference between putting it before / after the folio_lock. But changing it for no good reason seems not a good idea after all. So I think I'll keep it before the folio_lock. There is no evidence of which strategy is better, just keep the current behaviour. Calling swap_update_readahead even if the swap cache folio is already invalidated is not really harmful, the only thing it does that may effect the folio is the folio_test_clear_readahead call in it, and we have been doing for years with no problem. Calling swap_update_readahead for every folio might not be a good idea.
On Mon, Sep 1, 2025 at 11:18 AM Kairui Song <ryncsn@gmail.com> wrote: > > So the missing readahead stats update behavior is the correct and > > better behavior. I suggest you spit that out as a separate patch with > > appropriate comments about it too. It is also easier to bisect the > > commit if that kind of the subtle change which is considered safe > > turns out causing a problem. Causing problem not happen very often but > > it does happen before. > > > > Hmm, after a second thought, maybe we should keep it as it is for now. > > I just realized moving the swap_update_readahead after folio_lock is > more than just ensuring the folio is still valid. It will also cause > every swapin to do a readahead update. Previously, only cache hit > swapin will do a swap readahead update. > > I did some tests, and didn't see any measurable performance difference > between putting it before / after the folio_lock. But changing it for > no good reason seems not a good idea after all. > > So I think I'll keep it before the folio_lock. There is no evidence of > which strategy is better, just keep the current behaviour. > > Calling swap_update_readahead even if the swap cache folio > is already invalidated is not really harmful, the only thing it does > that may effect the folio is the folio_test_clear_readahead call in > it, and we have been doing for years with no problem. Calling > swap_update_readahead for every folio might not be a good idea. Thanks for the update. That is what I originally felt as well. It caught me by surprise by this detour code path. I need to spend extra time for the detour. I don't see it necessary to contribute to the phase I goal of merging the swap table. We can still add it later as a separate patch if we want. Chris
On Sat, Aug 30, 2025 at 9:54 AM Chris Li <chrisl@kernel.org> wrote: > > On Wed, Aug 27, 2025 at 7:36 AM Kairui Song <ryncsn@gmail.com> wrote: > > > > On Wed, Aug 27, 2025 at 4:21 PM Chris Li <chrisl@kernel.org> wrote: > > > > > > On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote: > > > > > > > diff --git a/mm/shmem.c b/mm/shmem.c > > > > index e9d0d2784cd5..b4d39f2a1e0a 100644 > > > > --- a/mm/shmem.c > > > > +++ b/mm/shmem.c > > > > @@ -2379,8 +2379,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, > > > > count_vm_event(PGMAJFAULT); > > > > count_memcg_event_mm(fault_mm, PGMAJFAULT); > > > > } > > > > - } else { > > > > - swap_update_readahead(folio, NULL, 0); > > > > > > Also this update readahead move to later might have a similar problem. > > > All the bail out in the move will lose the readahead status update. > > > > > > The readahead deed is already done. Missing the status update seems > > > incorrect. > > > > Thanks for the detailed review. > > > > The only change I wanted here is that swap readahead update should be > > done after checking the folio still corresponds to the swap entry > > triggering the swapin. That should have slight to none effect compared > > to before considering the extremely tiny time window. We are only > > following the convention more strictly. > > > > In theory it might even help to reduce false updates: if the folio no > > longer corresponds to the swap entry, we are hitting an unrelated > > folio, doing a readahead update will either mislead vma readahead's > > address hint, or could clean up the readahead flag of an unrelated > > folio without actually using it. If the folio does get hit in the > > future, due to the missing readahead flag, the statistic will go > > wrong. > > So the missing readahead stats update behavior is the correct and > better behavior. I suggest you spit that out as a separate patch with > appropriate comments about it too. It is also easier to bisect the > commit if that kind of the subtle change which is considered safe > turns out causing a problem. Causing problem not happen very often but > it does happen before. Yes, I'm planning to split one patch out for the readahead change. > > > > /* > > > > - * Lookup a swap entry in the swap cache. A found folio will be returned > > > > - * unlocked and with its refcount incremented. > > > > + * swap_cache_get_folio - Lookup a swap entry in the swap cache. > > > > * > > > > - * Caller must lock the swap device or hold a reference to keep it valid. > > > > + * A found folio will be returned unlocked and with its refcount increased. > > > > + * > > > > + * Context: Caller must ensure @entry is valid and pin the swap device, also > > > Is the "pin" the same as "lock the swap device or hold a reference"? > > > Not sure why you changed that comment to "pin". > > > > Yes it's the same thing. We don't lock the device though, the device > > can be pinned by the refcounf (get_swap_device) or locking anything > > that is referencing the device (locking PTL the a PTE that contains an > > swap entry pointing to the device, or locking a swap cache folio of a > > swap entry that points to the device). So I juse used the word "pin". > > I added some comments in mm/swap.h in later commits about what the > > "pin" means. > > In that case why not reuse the previous comment keeping "lock the swap > device or hold a reference" instead of "pin"? 
I'm worried that the sentence "lock the swap device" is kind of fuzzy, people may misunderstand that they need to hold si->lock. Actually they only need to hold si->user or lock anything. It's not wrong but kind of overkill. > > > > It seems to me that you want to add the comment for the return value check. > > > Is that it? > > > > Right, the caller has to check the folio before use, so I'm trying to > > document this convention. > > Again, I recommend reducing the unnecessary impact to the code, make > it more obvious what you did actually change. I spend quite some time > there trying to figure out what you are trying to accomplish with the > comments. > > > > > + * check the returned folio after locking it (e.g. folio_swap_contains). > > > > */ > > > > struct folio *swap_cache_get_folio(swp_entry_t entry) > > > > { > > > > @@ -338,7 +340,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > > > for (;;) { > > > > int err; > > > > > > > > - /* Check the swap cache in case the folio is already there */ > > > > + /* > > > > + * Check the swap cache first, if a cached folio is found, > > > > + * return it unlocked. The caller will lock and check it. > > > > + */ > > > > folio = swap_cache_get_folio(entry); > > > > if (folio) > > > > goto got_folio; > > > > diff --git a/mm/swapfile.c b/mm/swapfile.c > > > > index 4b8ab2cb49ca..12f2580ebe8d 100644 > > > > --- a/mm/swapfile.c > > > > +++ b/mm/swapfile.c > > > > @@ -240,12 +240,12 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > > > > * Offset could point to the middle of a large folio, or folio > > > > * may no longer point to the expected offset before it's locked. > > > > */ > > > > - entry = folio->swap; > > > > - if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) { > > > > + if (!folio_contains_swap(folio, entry)) { > > > > folio_unlock(folio); > > > > folio_put(folio); > > > > goto again; > > > > } > > > > + entry = folio->swap; > > > > > > Can you also check this as well? The "goto again" will have entries > > > not assigned compared to previously. > > > Too late for me to think straight now if that will cause a problem. > > > > Oh, thanks for pointing this part out. This patch is correct, it's the > > original behaviour that is not correct. If the folio is no longer > > valid (the if check here failed), changing the `entry` value before > > could lead to a wrong look in the next attempt with `goto again`. That > > could lead to reclaim of an unrelated folio. It's a trivial issue > > though, only might marginally slow down the performance. Maybe I > > should make a seperate patch to fix this issue first in case anyone > > wants to backport it. > > Thanks for the explanation, please do split this subtle behavior > change out with appropriate commit messages documenting your change, > why it is safe and the better behavior. > > Thanks Thanks for the review, I think separating 2 patches (one for __try_to_reclaim_swap and one for readahead) out of this one should be good enough and make everyone happy, overall the code is still the same. > > Chris
On Sat, Aug 30, 2025 at 8:16 AM Kairui Song <ryncsn@gmail.com> wrote: > > So the missing readahead stats update behavior is the correct and > > better behavior. I suggest you spit that out as a separate patch with > > appropriate comments about it too. It is also easier to bisect the > > commit if that kind of the subtle change which is considered safe > > turns out causing a problem. Causing problem not happen very often but > > it does happen before. > > Yes, I'm planning to split one patch out for the readahead change. Ack. > > > > > > /* > > > > > - * Lookup a swap entry in the swap cache. A found folio will be returned > > > > > - * unlocked and with its refcount incremented. > > > > > + * swap_cache_get_folio - Lookup a swap entry in the swap cache. > > > > > * > > > > > - * Caller must lock the swap device or hold a reference to keep it valid. > > > > > + * A found folio will be returned unlocked and with its refcount increased. > > > > > + * > > > > > + * Context: Caller must ensure @entry is valid and pin the swap device, also > > > > Is the "pin" the same as "lock the swap device or hold a reference"? > > > > Not sure why you changed that comment to "pin". > > > > > > Yes it's the same thing. We don't lock the device though, the device > > > can be pinned by the refcounf (get_swap_device) or locking anything > > > that is referencing the device (locking PTL the a PTE that contains an > > > swap entry pointing to the device, or locking a swap cache folio of a > > > swap entry that points to the device). So I juse used the word "pin". > > > I added some comments in mm/swap.h in later commits about what the > > > "pin" means. > > > > In that case why not reuse the previous comment keeping "lock the swap > > device or hold a reference" instead of "pin"? > > I'm worried that the sentence "lock the swap device" is kind of fuzzy, > people may misunderstand that they need to hold si->lock. Actually > they only need to hold si->user or lock anything. It's not wrong but > kind of overkill. What you just told me is a lot more useful than what was previously there. Try to incorporate that into the comments. e.g. instead of "lock the swap device", do "lock si->user or any other lock" or something like that. > > > > It seems to me that you want to add the comment for the return value check. > > > > Is that it? > > > > > > Right, the caller has to check the folio before use, so I'm trying to > > > document this convention. > > > > Again, I recommend reducing the unnecessary impact to the code, make > > it more obvious what you did actually change. I spend quite some time > > there trying to figure out what you are trying to accomplish with the > > comments. > > > > > > > + * check the returned folio after locking it (e.g. folio_swap_contains). > > > > > */ > > > > > struct folio *swap_cache_get_folio(swp_entry_t entry) > > > > > { > > > > > @@ -338,7 +340,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > > > > for (;;) { > > > > > int err; > > > > > > > > > > - /* Check the swap cache in case the folio is already there */ > > > > > + /* > > > > > + * Check the swap cache first, if a cached folio is found, > > > > > + * return it unlocked. The caller will lock and check it. 
> > > > > + */ > > > > > folio = swap_cache_get_folio(entry); > > > > > if (folio) > > > > > goto got_folio; > > > > > diff --git a/mm/swapfile.c b/mm/swapfile.c > > > > > index 4b8ab2cb49ca..12f2580ebe8d 100644 > > > > > --- a/mm/swapfile.c > > > > > +++ b/mm/swapfile.c > > > > > @@ -240,12 +240,12 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, > > > > > * Offset could point to the middle of a large folio, or folio > > > > > * may no longer point to the expected offset before it's locked. > > > > > */ > > > > > - entry = folio->swap; > > > > > - if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) { > > > > > + if (!folio_contains_swap(folio, entry)) { > > > > > folio_unlock(folio); > > > > > folio_put(folio); > > > > > goto again; > > > > > } > > > > > + entry = folio->swap; > > > > > > > > Can you also check this as well? The "goto again" will have entries > > > > not assigned compared to previously. > > > > Too late for me to think straight now if that will cause a problem. > > > > > > Oh, thanks for pointing this part out. This patch is correct, it's the > > > original behaviour that is not correct. If the folio is no longer > > > valid (the if check here failed), changing the `entry` value before > > > could lead to a wrong look in the next attempt with `goto again`. That > > > could lead to reclaim of an unrelated folio. It's a trivial issue > > > though, only might marginally slow down the performance. Maybe I > > > should make a seperate patch to fix this issue first in case anyone > > > wants to backport it. > > > > Thanks for the explanation, please do split this subtle behavior > > change out with appropriate commit messages documenting your change, > > why it is safe and the better behavior. > > > > Thanks > > Thanks for the review, I think separating 2 patches (one for > __try_to_reclaim_swap and one for readahead) out of this one should be > good enough and make everyone happy, overall the code is still the > same. It is your call. I am happy to review them the same. It might take me more time to reason about it and slightly delay your series merge to mm-unstable, that is all. Chris
On 2025/8/27 22:35, Kairui Song wrote: > On Wed, Aug 27, 2025 at 4:21 PM Chris Li <chrisl@kernel.org> wrote: >> >> On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote: >> >>> diff --git a/mm/shmem.c b/mm/shmem.c >>> index e9d0d2784cd5..b4d39f2a1e0a 100644 >>> --- a/mm/shmem.c >>> +++ b/mm/shmem.c >>> @@ -2379,8 +2379,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, >>> count_vm_event(PGMAJFAULT); >>> count_memcg_event_mm(fault_mm, PGMAJFAULT); >>> } >>> - } else { >>> - swap_update_readahead(folio, NULL, 0); >> >> Also this update readahead move to later might have a similar problem. >> All the bail out in the move will lose the readahead status update. >> >> The readahead deed is already done. Missing the status update seems >> incorrect. > > Thanks for the detailed review. > > The only change I wanted here is that swap readahead update should be > done after checking the folio still corresponds to the swap entry > triggering the swapin. That should have slight to none effect compared > to before considering the extremely tiny time window. We are only > following the convention more strictly. > > In theory it might even help to reduce false updates: if the folio no > longer corresponds to the swap entry, we are hitting an unrelated > folio, doing a readahead update will either mislead vma readahead's > address hint, or could clean up the readahead flag of an unrelated > folio without actually using it. If the folio does get hit in the > future, due to the missing readahead flag, the statistic will go > wrong. Yes, that’s what I thought as well. By the way, can we do it right all at once in patch 1 (I mean the shmem changes)?
On Thu, Aug 28, 2025 at 11:41 AM Baolin Wang <baolin.wang@linux.alibaba.com> wrote: > > > > On 2025/8/27 22:35, Kairui Song wrote: > > On Wed, Aug 27, 2025 at 4:21 PM Chris Li <chrisl@kernel.org> wrote: > >> > >> On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote: > >> > >>> diff --git a/mm/shmem.c b/mm/shmem.c > >>> index e9d0d2784cd5..b4d39f2a1e0a 100644 > >>> --- a/mm/shmem.c > >>> +++ b/mm/shmem.c > >>> @@ -2379,8 +2379,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, > >>> count_vm_event(PGMAJFAULT); > >>> count_memcg_event_mm(fault_mm, PGMAJFAULT); > >>> } > >>> - } else { > >>> - swap_update_readahead(folio, NULL, 0); > >> > >> Also this update readahead move to later might have a similar problem. > >> All the bail out in the move will lose the readahead status update. > >> > >> The readahead deed is already done. Missing the status update seems > >> incorrect. > > > > Thanks for the detailed review. > > > > The only change I wanted here is that swap readahead update should be > > done after checking the folio still corresponds to the swap entry > > triggering the swapin. That should have slight to none effect compared > > to before considering the extremely tiny time window. We are only > > following the convention more strictly. > > > > In theory it might even help to reduce false updates: if the folio no > > longer corresponds to the swap entry, we are hitting an unrelated > > folio, doing a readahead update will either mislead vma readahead's > > address hint, or could clean up the readahead flag of an unrelated > > folio without actually using it. If the folio does get hit in the > > future, due to the missing readahead flag, the statistic will go > > wrong. > > Yes, that’s what I thought as well. > > By the way, can we do it right all at once in patch 1 (I mean the shmem > changes)? Hi Baolin, Yeah it's OK to do so but it's kind of a very slight behaviour change. Currently patch 1 has zero behaviour change, so maybe just leave it in this patch where we sanitize all swap cache conventions all at once.
On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
[...]
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 10ef528a5f44..9ca8e1873c6e 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4661,12 +4661,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> goto out;
>
> folio = swap_cache_get_folio(entry);
> - if (folio) {
> - swap_update_readahead(folio, vma, vmf->address);
> - page = folio_file_page(folio, swp_offset(entry));
> - }
> swapcache = folio;
> -

Can simplify as:

folio = swapcache = swap_cache_get_folio(entry);

> if (!folio) {
> if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
> __swap_count(entry) == 1) {
> @@ -4735,20 +4730,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> ret = VM_FAULT_MAJOR;
> count_vm_event(PGMAJFAULT);
> count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> - page = folio_file_page(folio, swp_offset(entry));
> - } else if (PageHWPoison(page)) {
> - /*
> - * hwpoisoned dirty swapcache pages are kept for killing
> - * owner processes (which may be unknown at hwpoison time)
> - */
> - ret = VM_FAULT_HWPOISON;
> - goto out_release;

Here you move the PageHWPoison(page) bail out from before taking the page
lock to after the page lock. A hwpoisoned page should be able to bail
out without taking the lock.

> }
>
> ret |= folio_lock_or_retry(folio, vmf);
> if (ret & VM_FAULT_RETRY)
> goto out_release;
>
> + page = folio_file_page(folio, swp_offset(entry));
> if (swapcache) {
> /*
> * Make sure folio_free_swap() or swapoff did not release the
> @@ -4757,10 +4745,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> * swapcache, we need to check that the page's swap has not
> * changed.
> */
> - if (unlikely(!folio_test_swapcache(folio) ||
> - page_swap_entry(page).val != entry.val))
> + if (!folio_contains_swap(folio, entry))
> goto out_page;
>
> + if (PageHWPoison(page)) {
> + /*
> + * hwpoisoned dirty swapcache pages are kept for killing
> + * owner processes (which may be unknown at hwpoison time)
> + */
> + ret = VM_FAULT_HWPOISON;
> + goto out_page;

It seems you bail out with the page still locked, which looks like a bug
to me.

I think moving this PageHWPoison() check to after the page lock is
problematic. Can you double check?

To be continued.

Chris
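To summarize the convention under review here, the lookup/lock/verify pattern the patch documents looks roughly like the following, condensed from the patch itself; the early returns stand in for whatever retry or fallback path a real caller would use:

        struct folio *folio;

        /* Lockless lookup: only a reference is taken, the folio is not stable yet. */
        folio = swap_cache_get_folio(entry);
        if (!folio)
                return NULL;

        folio_lock(folio);
        /* The folio may have left the swap cache or been reused; verify it now. */
        if (!folio_contains_swap(folio, entry)) {
                folio_unlock(folio);
                folio_put(folio);
                return NULL;            /* or retry the lookup */
        }

        /* From here on the locked folio is known to back @entry. */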
On Wed, Aug 27, 2025 at 4:06 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
[...]
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 10ef528a5f44..9ca8e1873c6e 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4661,12 +4661,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > goto out;
> >
> > folio = swap_cache_get_folio(entry);
> > - if (folio) {
> > - swap_update_readahead(folio, vma, vmf->address);
> > - page = folio_file_page(folio, swp_offset(entry));
> > - }
> > swapcache = folio;
> > -
>
> Can simplify as:
> folio = swapcache = swap_cache_get_folio(entry);

checkpatch.pl is unhappy about it:

checkpatch: multiple assignments should be avoided

>
> > if (!folio) {
> > if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
> > __swap_count(entry) == 1) {
> > @@ -4735,20 +4730,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > ret = VM_FAULT_MAJOR;
> > count_vm_event(PGMAJFAULT);
> > count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> > - page = folio_file_page(folio, swp_offset(entry));
> > - } else if (PageHWPoison(page)) {
> > - /*
> > - * hwpoisoned dirty swapcache pages are kept for killing
> > - * owner processes (which may be unknown at hwpoison time)
> > - */
> > - ret = VM_FAULT_HWPOISON;
> > - goto out_release;
>
> Here you move the PageHWPoison(page) bail out from before taking the page
> lock to after the page lock. A hwpoisoned page should be able to bail
> out without taking the lock.
>
> > }
> >
> > ret |= folio_lock_or_retry(folio, vmf);
> > if (ret & VM_FAULT_RETRY)
> > goto out_release;
> >
> > + page = folio_file_page(folio, swp_offset(entry));
> > if (swapcache) {
> > /*
> > * Make sure folio_free_swap() or swapoff did not release the
> > @@ -4757,10 +4745,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > * swapcache, we need to check that the page's swap has not
> > * changed.
> > */
> > - if (unlikely(!folio_test_swapcache(folio) ||
> > - page_swap_entry(page).val != entry.val))
> > + if (!folio_contains_swap(folio, entry))
> > goto out_page;
> >
> > + if (PageHWPoison(page)) {
> > + /*
> > + * hwpoisoned dirty swapcache pages are kept for killing
> > + * owner processes (which may be unknown at hwpoison time)
> > + */
> > + ret = VM_FAULT_HWPOISON;
> > + goto out_page;
>
> It seems you bail out with the page still locked, which looks like a bug
> to me.

I think it's the original behaviour that is kind of fragile. The
returned folio of swap_cache_get_folio() is unstable unless locked, so
the folio could have been removed from the swap cache or marked
hwpoisoned by another thread. So the PageHWPoison() here could be
tested against an unrelated page, which leads to false positive
PageHWPoison results. We have encountered several similar bugs caused
by using folios returned by a swap cache lookup without locking and
checking them first.

So checking HWPoison after locking is actually safer.

> I think moving this PageHWPoison() check to after the page lock is
> problematic. Can you double check?
>
> To be continued.
>
> Chris
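For reference, the resulting order in do_swap_page() with this patch applied is roughly the following, condensed from the hunks quoted above. Note that the existing out_page label in do_swap_page() unlocks the folio before out_release drops the reference, which is what the VM_FAULT_HWPOISON bail-out relies on here:

        ret |= folio_lock_or_retry(folio, vmf);
        if (ret & VM_FAULT_RETRY)
                goto out_release;

        page = folio_file_page(folio, swp_offset(entry));
        if (swapcache) {
                /* The lockless lookup result can only be trusted once locked. */
                if (!folio_contains_swap(folio, entry))
                        goto out_page;

                /* The poison check now runs against the verified page. */
                if (PageHWPoison(page)) {
                        ret = VM_FAULT_HWPOISON;
                        goto out_page;  /* out_page unlocks, out_release puts */
                }

                swap_update_readahead(folio, vma, vmf->address);
                /* ... KSM copy handling etc. continue here ... */
        }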
Hi Kairui,

Sorry for the late reply. I have been super busy this week.

On Wed, Aug 27, 2025 at 6:44 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Wed, Aug 27, 2025 at 4:06 PM Chris Li <chrisl@kernel.org> wrote:
> >
> > On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote:
> > >
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index 10ef528a5f44..9ca8e1873c6e 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -4661,12 +4661,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > > goto out;
> > >
> > > folio = swap_cache_get_folio(entry);
> > > - if (folio) {
> > > - swap_update_readahead(folio, vma, vmf->address);
> > > - page = folio_file_page(folio, swp_offset(entry));
> > > - }
> > > swapcache = folio;
> > > -
> >
> > Can simplify as:
> > folio = swapcache = swap_cache_get_folio(entry);
>
> checkpatch.pl is unhappy about it:
>
> checkpatch: multiple assignments should be avoided

Ah, never mind then. I actually like multiple assignments, but I can see
checkpatch wants to ban them.

[...]

> > It seems you bail out with the page still locked, which looks like a bug
> > to me.
>
> I think it's the original behaviour that is kind of fragile. The
> returned folio of swap_cache_get_folio() is unstable unless locked, so
> the folio could have been removed from the swap cache or marked
> hwpoisoned by another thread. So the PageHWPoison() here could be
> tested against an unrelated page, which leads to false positive
> PageHWPoison results. We have encountered several similar bugs caused
> by using folios returned by a swap cache lookup without locking and
> checking them first.
>
> So checking HWPoison after locking is actually safer.

How do I verify that the code can handle HWPoison when the check is done
with the page locked rather than unlocked? I haven't followed the
HWPoison path very much.

I am still wondering how the HWPoison code handles the page now being
checked while locked rather than unlocked, without any additional code
change.

If you want this behavior, I strongly suggest you split this portion of
the change out, ideally outside this series if you can afford to do so.

The patch description does not mention this kind of subtle behavior
change, so as a reviewer I am caught off guard by it. At the very least
the commit message should mention this and explain why the change is
needed and why it is safe to do so.

Having very subtle behavior changes buried in a large code restructuring
change is the worst. Splitting it out will reduce the reviewer's workload
to reason about this behavior change, and makes your series easier to
review.

Does it make sense?

Chris