From: Kairui Song <kasong@tencent.com>
If the shmem read request's index points to the middle of a large swap
entry, shmem swap-in performs the swap cache lookup using the large swap
entry's starting value (the first sub swap entry of this large entry).
This leads to a false positive lookup result if only the first few swap
entries are cached while the requested swap entry pointed to by @index
is uncached.

Currently shmem will split the large entry and then retry the swap-in
from the beginning, which wastes CPU and is fragile. Handle this
correctly by looking up the exact sub entry that @index points to.

Also add some sanity checks to help understand the code and ensure
things won't go wrong.
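
To illustrate the fix, a minimal sketch of the corrected lookup, using
the names from the diff below (@index_entry is the large swap entry
stored in the mapping, @order its order as returned by
shmem_confirm_swap()):

	/* Derive the sub entry @index actually refers to, look that up */
	offset = index - round_down(index, 1 << order);
	swap.val = index_entry.val + offset;
	folio = swap_cache_get_folio(swap, NULL, 0);

A partially cached large entry can then no longer satisfy a lookup for
an uncached sub entry.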
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/shmem.c | 60 +++++++++++++++++++++++++++++-------------------------
1 file changed, 32 insertions(+), 28 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index ea9a105ded5d..9341c51c3d10 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1977,14 +1977,19 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
static struct folio *shmem_swapin_direct(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
- swp_entry_t entry, int order, gfp_t gfp)
+ swp_entry_t index_entry, swp_entry_t swap,
+ int order, gfp_t gfp)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- int nr_pages = 1 << order;
struct folio *new;
- pgoff_t offset;
+ swp_entry_t entry;
gfp_t swap_gfp;
void *shadow;
+ int nr_pages;
+
+ /* Prefer aligned THP swapin */
+ entry.val = index_entry.val;
+ nr_pages = 1 << order;
/*
* We have arrived here because our zones are constrained, so don't
@@ -2011,6 +2016,7 @@ static struct folio *shmem_swapin_direct(struct inode *inode,
swap_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
}
}
+
retry:
new = shmem_alloc_folio(swap_gfp, order, info, index);
if (!new) {
@@ -2056,11 +2062,10 @@ static struct folio *shmem_swapin_direct(struct inode *inode,
if (!order)
return new;
/* High order swapin failed, fallback to order 0 and retry */
- order = 0;
- nr_pages = 1;
+ entry.val = swap.val;
swap_gfp = gfp;
- offset = index - round_down(index, nr_pages);
- entry = swp_entry(swp_type(entry), swp_offset(entry) + offset);
+ nr_pages = 1;
+ order = 0;
goto retry;
}
@@ -2288,20 +2293,21 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
int error, nr_pages, order, swap_order;
+ swp_entry_t swap, index_entry;
struct swap_info_struct *si;
struct folio *folio = NULL;
bool skip_swapcache = false;
- swp_entry_t swap;
+ pgoff_t offset;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
- swap = radix_to_swp_entry(*foliop);
+ index_entry = radix_to_swp_entry(*foliop);
*foliop = NULL;
- if (is_poisoned_swp_entry(swap))
+ if (is_poisoned_swp_entry(index_entry))
return -EIO;
- si = get_swap_device(swap);
- order = shmem_confirm_swap(mapping, index, swap);
+ si = get_swap_device(index_entry);
+ order = shmem_confirm_swap(mapping, index, index_entry);
if (unlikely(!si)) {
if (order < 0)
return -EEXIST;
@@ -2313,13 +2319,15 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
return -EEXIST;
}
- /* Look it up and read it in.. */
+ /* @index may point to the middle of a large entry; get the real swap value first */
+ offset = index - round_down(index, 1 << order);
+ swap.val = index_entry.val + offset;
folio = swap_cache_get_folio(swap, NULL, 0);
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
/* Direct mTHP swapin without swap cache or readahead */
folio = shmem_swapin_direct(inode, vma, index,
- swap, order, gfp);
+ index_entry, swap, order, gfp);
if (IS_ERR(folio)) {
error = PTR_ERR(folio);
folio = NULL;
@@ -2341,28 +2349,25 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
}
+
+ swap_order = folio_order(folio);
+ nr_pages = folio_nr_pages(folio);
+ /* The swap-in should cover both @swap and @index */
+ swap.val = round_down(swap.val, nr_pages);
+ VM_WARN_ON_ONCE(swap.val > index_entry.val + offset);
+ VM_WARN_ON_ONCE(swap.val + nr_pages <= index_entry.val + offset);
+
/*
* We need to split an existing large entry if swapin brought in a
* smaller folio due to various of reasons.
- *
- * And worth noting there is a special case: if there is a smaller
- * cached folio that covers @swap, but not @index (it only covers
- * first few sub entries of the large entry, but @index points to
- * later parts), the swap cache lookup will still see this folio,
- * And we need to split the large entry here. Later checks will fail,
- * as it can't satisfy the swap requirement, and we will retry
- * the swapin from beginning.
*/
- swap_order = folio_order(folio);
+ index = round_down(index, nr_pages);
if (order > swap_order) {
- error = shmem_split_swap_entry(inode, index, swap, gfp);
+ error = shmem_split_swap_entry(inode, index, index_entry, gfp);
if (error)
goto failed_nolock;
}
- index = round_down(index, 1 << swap_order);
- swap.val = round_down(swap.val, 1 << swap_order);
-
/* We have to do this with folio locked to prevent races */
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
@@ -2375,7 +2380,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
goto failed;
}
folio_wait_writeback(folio);
- nr_pages = folio_nr_pages(folio);
/*
* Some architectures may have to restore extra metadata to the
--
2.50.0
On 2025/6/27 14:20, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
>
> If the shmem read request's index points to the middle of a large swap
> entry, shmem swap-in performs the swap cache lookup using the large swap
> entry's starting value (the first sub swap entry of this large entry).

Right.

> This leads to a false positive lookup result if only the first few swap
> entries are cached while the requested swap entry pointed to by @index
> is uncached.
>
> Currently shmem will split the large entry and then retry the swap-in
> from the beginning, which wastes CPU and is fragile. Handle this
> correctly by looking up the exact sub entry that @index points to.

Sorry, I did not get you here. Only when the 'order' (from
shmem_confirm_swap()) is not equal to folio_order() will it trigger the
large swap entry split. Could you describe the issue in more detail?

I also found a false positive swap-in in your patch 4; it seems they are
related?

> Also add some sanity checks to help understand the code and ensure
> things won't go wrong.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>

[snip]
On Mon, Jun 30, 2025 at 3:22 PM Baolin Wang <baolin.wang@linux.alibaba.com> wrote:
> On 2025/6/27 14:20, Kairui Song wrote:
> > Currently shmem will split the large entry and then retry the swap-in
> > from the beginning, which wastes CPU and is fragile. Handle this
> > correctly by looking up the exact sub entry that @index points to.
>
> Sorry, I did not get you here. Only when the 'order' (from
> shmem_confirm_swap()) is not equal to folio_order() will it trigger the
> large swap entry split. Could you describe the issue in more detail?

Right. For example, we have an order = 4 swap entry covering index 0 - 15,
with swap.val = 0x4000. A swap-in request starts with index = 3.
swap_cache_get_folio() will be called with swap.val = 0x4000. It may
return an order 0 folio with swap.val = 0x4000 (readahead brings in
order 0 folios easily). That folio doesn't satisfy the swap-in request:
a split is issued, the swap-in fails, and the fault is triggered again.

After this patch, swap_cache_get_folio() will return either NULL or the
right folio.

> I also found a false positive swap-in in your patch 4; it seems they are
> related?

It's unrelated. I should have added this code for the !SWP_SYNCHRONOUS_IO
path in this patch instead of in that one:

    offset = index - round_down(index, 1 << order);
    swap.val = index_entry.val + offset;
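
To make the arithmetic above concrete, here is the same scenario as a
standalone sketch (plain userspace C, with round_down() open-coded and
all values hypothetical, mirroring the example in the reply):

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical: order-4 large entry, first sub entry is 0x4000 */
        unsigned long index_entry = 0x4000;
        unsigned long index = 3;    /* fault in the middle of the entry */
        unsigned int order = 4;

        /* round_down(index, 1 << order), open-coded for userspace */
        unsigned long aligned = index & ~((1UL << order) - 1);
        unsigned long offset = index - aligned;
        unsigned long lookup = index_entry + offset;

        /* Old code looked up 0x4000; the fixed code looks up 0x4003 */
        printf("offset=%lu lookup=%#lx\n", offset, lookup);
        return 0;
    }

With an order 0 folio cached only at 0x4000, the old lookup at 0x4000
hits it even though 0x4003 is uncached; looking up 0x4003 directly
returns NULL and the swap-in proceeds normally.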