From: Kairui Song <kasong@tencent.com>
Tidy up the mTHP swapin workflow. There should be no feature change; this
consolidates the mTHP-related checks into one place so they are all wrapped
by CONFIG_TRANSPARENT_HUGEPAGE and will be trimmed off by the compiler when
not needed.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/shmem.c | 175 ++++++++++++++++++++++++-----------------------------
1 file changed, 78 insertions(+), 97 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 0ad49e57f736..46dea2fa1b43 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1975,31 +1975,51 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
return ERR_PTR(error);
}
-static struct folio *shmem_swap_alloc_folio(struct inode *inode,
+static struct folio *shmem_swapin_direct(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
- swp_entry_t entry, int order, gfp_t gfp)
+ swp_entry_t entry, int *order, gfp_t gfp)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ int nr_pages = 1 << *order;
struct folio *new;
+ pgoff_t offset;
void *shadow;
- int nr_pages;
/*
* We have arrived here because our zones are constrained, so don't
* limit chance of success with further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
- gfp_t huge_gfp = vma_thp_gfp_mask(vma);
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (WARN_ON_ONCE(*order))
+ return ERR_PTR(-EINVAL);
+ } else if (*order) {
+ /*
+ * If uffd is active for the vma, we need per-page fault
+ * fidelity to maintain the uffd semantics, then fallback
+ * to swapin order-0 folio, as well as for zswap case.
+ * Any existing sub folio in the swap cache also blocks
+ * mTHP swapin.
+ */
+ if ((vma && userfaultfd_armed(vma)) ||
+ !zswap_never_enabled() ||
+ non_swapcache_batch(entry, nr_pages) != nr_pages) {
+ offset = index - round_down(index, nr_pages);
+ entry = swp_entry(swp_type(entry),
+ swp_offset(entry) + offset);
+ *order = 0;
+ nr_pages = 1;
+ } else {
+ gfp_t huge_gfp = vma_thp_gfp_mask(vma);
- gfp = limit_gfp_mask(huge_gfp, gfp);
+ gfp = limit_gfp_mask(huge_gfp, gfp);
+ }
}
- new = shmem_alloc_folio(gfp, order, info, index);
+ new = shmem_alloc_folio(gfp, *order, info, index);
if (!new)
return ERR_PTR(-ENOMEM);
- nr_pages = folio_nr_pages(new);
if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
gfp, entry)) {
folio_put(new);
@@ -2165,8 +2185,12 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
swap_free_nr(swap, nr_pages);
}
-static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
- swp_entry_t swap, gfp_t gfp)
+/*
+ * Split an existing large swap entry. @index should point to one sub mapping
+ * slot within the entry @swap; this sub slot will be split into order 0.
+ */
+static int shmem_split_swap_entry(struct inode *inode, pgoff_t index,
+ swp_entry_t swap, gfp_t gfp)
{
struct address_space *mapping = inode->i_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
@@ -2226,7 +2250,6 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
cur_order = split_order;
split_order = xas_try_split_min_order(split_order);
}
-
unlock:
xas_unlock_irq(&xas);
@@ -2237,7 +2260,7 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
if (xas_error(&xas))
return xas_error(&xas);
- return entry_order;
+ return 0;
}
/*
@@ -2254,11 +2277,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
+ int error, nr_pages, order, swap_order;
struct swap_info_struct *si;
struct folio *folio = NULL;
bool skip_swapcache = false;
swp_entry_t swap;
- int error, nr_pages, order, split_order;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
swap = radix_to_swp_entry(*foliop);
@@ -2283,110 +2306,66 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
if (!folio) {
- int nr_pages = 1 << order;
- bool fallback_order0 = false;
-
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
-
- /*
- * If uffd is active for the vma, we need per-page fault
- * fidelity to maintain the uffd semantics, then fallback
- * to swapin order-0 folio, as well as for zswap case.
- * Any existing sub folio in the swap cache also blocks
- * mTHP swapin.
- */
- if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled() ||
- non_swapcache_batch(swap, nr_pages) != nr_pages))
- fallback_order0 = true;
-
- /* Skip swapcache for synchronous device. */
- if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
- folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
+ /* Try direct mTHP swapin bypassing swap cache and readahead */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+ swap_order = order;
+ folio = shmem_swapin_direct(inode, vma, index,
+ swap, &swap_order, gfp);
if (!IS_ERR(folio)) {
skip_swapcache = true;
goto alloced;
}
-
- /*
- * Fallback to swapin order-0 folio unless the swap entry
- * already exists.
- */
+ /* Fallback if order > 0 swapin failed with -ENOMEM */
error = PTR_ERR(folio);
folio = NULL;
- if (error == -EEXIST)
+ if (error != -ENOMEM || !swap_order)
goto failed;
}
-
/*
- * Now swap device can only swap in order 0 folio, then we
- * should split the large swap entry stored in the pagecache
- * if necessary.
+ * Try order 0 swapin using swap cache and readahead, it still
+ * may return order > 0 folio due to raced swap cache.
*/
- split_order = shmem_split_large_entry(inode, index, swap, gfp);
- if (split_order < 0) {
- error = split_order;
- goto failed;
- }
-
- /*
- * If the large swap entry has already been split, it is
- * necessary to recalculate the new swap entry based on
- * the old order alignment.
- */
- if (split_order > 0) {
- pgoff_t offset = index - round_down(index, 1 << split_order);
-
- swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
- }
-
- /* Here we actually start the io */
folio = shmem_swapin_cluster(swap, gfp, info, index);
if (!folio) {
error = -ENOMEM;
goto failed;
}
- } else if (order > folio_order(folio)) {
- /*
- * Swap readahead may swap in order 0 folios into swapcache
- * asynchronously, while the shmem mapping can still stores
- * large swap entries. In such cases, we should split the
- * large swap entry to prevent possible data corruption.
- */
- split_order = shmem_split_large_entry(inode, index, swap, gfp);
- if (split_order < 0) {
- folio_put(folio);
- folio = NULL;
- error = split_order;
- goto failed;
- }
-
- /*
- * If the large swap entry has already been split, it is
- * necessary to recalculate the new swap entry based on
- * the old order alignment.
- */
- if (split_order > 0) {
- pgoff_t offset = index - round_down(index, 1 << split_order);
-
- swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
- }
- } else if (order < folio_order(folio)) {
- swap.val = round_down(swp_type(swap), folio_order(folio));
}
-
alloced:
+ /*
+ * We need to split an existing large entry if swapin brought in a
+ * smaller folio due to various reasons.
+ *
+ * It is worth noting there is a special case: if there is a smaller
+ * cached folio that covers @swap, but not @index (it only covers
+ * the first few sub entries of the large entry, but @index points to
+ * later parts), the swap cache lookup will still see this folio,
+ * and we need to split the large entry here. Later checks will fail,
+ * as it can't satisfy the swap requirement, and we will retry
+ * the swapin from the beginning.
+ */
+ swap_order = folio_order(folio);
+ if (order > swap_order) {
+ error = shmem_split_swap_entry(inode, index, swap, gfp);
+ if (error)
+ goto failed_nolock;
+ }
+
+ index = round_down(index, 1 << swap_order);
+ swap.val = round_down(swap.val, 1 << swap_order);
+
/* We have to do this with folio locked to prevent races */
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
folio->swap.val != swap.val) {
error = -EEXIST;
- goto unlock;
+ goto failed_unlock;
}
if (!folio_test_uptodate(folio)) {
error = -EIO;
@@ -2407,8 +2386,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
goto failed;
}
- error = shmem_add_to_page_cache(folio, mapping,
- round_down(index, nr_pages),
+ error = shmem_add_to_page_cache(folio, mapping, index,
swp_to_radix_entry(swap), gfp);
if (error)
goto failed;
@@ -2419,8 +2397,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
folio_mark_accessed(folio);
if (skip_swapcache) {
+ swapcache_clear(si, folio->swap, folio_nr_pages(folio));
folio->swap.val = 0;
- swapcache_clear(si, swap, nr_pages);
} else {
delete_from_swap_cache(folio);
}
@@ -2436,13 +2414,16 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (error == -EIO)
shmem_set_folio_swapin_error(inode, index, folio, swap,
skip_swapcache);
-unlock:
- if (skip_swapcache)
- swapcache_clear(si, swap, folio_nr_pages(folio));
- if (folio) {
+failed_unlock:
+ if (folio)
folio_unlock(folio);
- folio_put(folio);
+failed_nolock:
+ if (skip_swapcache) {
+ swapcache_clear(si, folio->swap, folio_nr_pages(folio));
+ folio->swap.val = 0;
}
+ if (folio)
+ folio_put(folio);
put_swap_device(si);
return error;
}
--
2.50.0
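
For readers unfamiliar with the pattern, the "trimmed off by the compiler"
claim in the commit message relies on IS_ENABLED() expanding to a compile-time
constant. The fragment below paraphrases the new checks in
shmem_swapin_direct() purely for illustration (the comments are added and this
is not a verbatim copy of the patch):

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
		/*
		 * IS_ENABLED() is a compile-time 0 here, so this branch is
		 * the one that survives; a non-zero order must never reach
		 * this path without THP support.
		 */
		if (WARN_ON_ONCE(*order))
			return ERR_PTR(-EINVAL);
	} else if (*order) {
		/*
		 * All mTHP-only logic lives in this branch. When
		 * CONFIG_TRANSPARENT_HUGEPAGE is off it is still parsed and
		 * type-checked, then discarded as dead code, so no #ifdef
		 * is needed.
		 */
	}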
on 6/18/2025 2:35 AM, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
>
> Tidy up the mTHP swapin workflow. ...
>
> diff --git a/mm/shmem.c b/mm/shmem.c

...

Hello, here is another potential issue if shmem swapin can race with folio
split.

> alloced:
> ...
> + swap_order = folio_order(folio);
> + if (order > swap_order) {
> + error = shmem_split_swap_entry(inode, index, swap, gfp);
> + if (error)
> + goto failed_nolock;
> + }
> +
> + index = round_down(index, 1 << swap_order);
> + swap.val = round_down(swap.val, 1 << swap_order);
> +
/* suppose the folio is split here */
> /* We have to do this with folio locked to prevent races */
> folio_lock(folio);
> if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
> folio->swap.val != swap.val) {
> error = -EEXIST;
> - goto unlock;
> + goto failed_unlock;
> }
> if (!folio_test_uptodate(folio)) {
> error = -EIO;
> @@ -2407,8 +2386,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> goto failed;
> }
>
> - error = shmem_add_to_page_cache(folio, mapping,
> - round_down(index, nr_pages),
> + error = shmem_add_to_page_cache(folio, mapping, index,
> swp_to_radix_entry(swap), gfp);

The actual swapin order would then be less than swap_order, and the swapped-in
folio may not cover the index from the caller.

So we should move the index and swap.val calculation to after the folio is
locked.
On Wed, Jun 18, 2025 at 4:26 PM Kemeng Shi <shikemeng@huaweicloud.com> wrote:
>
> Hello, here is another potential issue if shmem swapin can race with folio
> split.
>
> ...
>
> The actual swapin order would then be less than swap_order, and the swapped-in
> folio may not cover the index from the caller.
>
> So we should move the index and swap.val calculation to after the folio is
> locked.

Hi, thanks very much for checking the code carefully!

If I'm not wrong here, holding a reference is enough to stabilize the folio
order. See split_huge_page_to_list_to_order, "Any unexpected folio references
... -EAGAIN" and can_split_folio.

We can add a `swap_order == folio_order(folio)` check after the folio lock
though, as a (sanity) check, just in case.
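
A rough sketch of what such a post-lock sanity check might look like, assuming
it sits right after the folio_lock() call in shmem_swapin_folio(); the
WARN_ON_ONCE and the -EEXIST retry path are illustrative choices, not part of
the posted patch:

	/* We have to do this with folio locked to prevent races */
	folio_lock(folio);
	/*
	 * Illustrative sanity check only: swap_order was sampled before the
	 * lock was taken; a held reference should already prevent a split,
	 * so a mismatch would indicate a bug rather than an expected race.
	 */
	if (WARN_ON_ONCE(swap_order != folio_order(folio))) {
		error = -EEXIST;	/* treat like any raced folio: retry swapin */
		goto failed_unlock;
	}
	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
	    folio->swap.val != swap.val) {
		error = -EEXIST;
		goto failed_unlock;
	}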
on 6/18/2025 4:46 PM, Kairui Song wrote:
> On Wed, Jun 18, 2025 at 4:26 PM Kemeng Shi <shikemeng@huaweicloud.com> wrote:
>>
>> Hello, here is another potential issue if shmem swapin can race with folio
>> split.
>>
>> ...
>>
>> So we should move the index and swap.val calculation to after the folio is
>> locked.
>
> Hi, thanks very much for checking the code carefully!
>
> If I'm not wrong here, holding a reference is enough to stabilize the folio
> order. See split_huge_page_to_list_to_order, "Any unexpected folio references
> ... -EAGAIN" and can_split_folio.

Thanks for feedback, then the change looks good to me.

Reviewed-by: Kemeng Shi <shikemeng@huaweicloud.com>

>
> We can add a `swap_order == folio_order(folio)` check after the folio lock
> though, as a (sanity) check, just in case.
>
on 6/18/2025 2:35 AM, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
>
> Tidy up the mTHP swapin workflow. ...
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 0ad49e57f736..46dea2fa1b43 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c

...

> @@ -2283,110 +2306,66 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> ...
> - /* Skip swapcache for synchronous device. */
> - if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
> - folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
> + /* Try direct mTHP swapin bypassing swap cache and readahead */
> + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
> + swap_order = order;
> + folio = shmem_swapin_direct(inode, vma, index,
> + swap, &swap_order, gfp);
> if (!IS_ERR(folio)) {
> skip_swapcache = true;
> goto alloced;
> }
> ...
> + /* Fallback if order > 0 swapin failed with -ENOMEM */
> error = PTR_ERR(folio);
> folio = NULL;
> - if (error == -EEXIST)
> + if (error != -ENOMEM || !swap_order)
> goto failed;
> }

For the fallback to order 0, we always called shmem_swapin_cluster() before,
but we will call shmem_swap_alloc_folio() now. It seems fine to me; just
pointing this out for others to recheck.

> - /* Here we actually start the io */
> folio = shmem_swapin_cluster(swap, gfp, info, index);
> if (!folio) {
> error = -ENOMEM;
> goto failed;
> }
> ...
> alloced:
> ...
> + swap_order = folio_order(folio);
> + if (order > swap_order) {
> + error = shmem_split_swap_entry(inode, index, swap, gfp);
> + if (error)
> + goto failed_nolock;
> + }
> +
> + index = round_down(index, 1 << swap_order);
> + swap.val = round_down(swap.val, 1 << swap_order);
> +

If the swap entry order is reduced but index and swap.val remain unchanged,
shmem_split_swap_entry() will split the reduced large swap entry successfully,
but index and swap.val will be incorrect because of the wrong swap_order. We
can catch the unexpected order and swap entry in shmem_add_to_page_cache() and
retry the swapin, but this introduces extra cost.

If we return the order of the entry that was split in shmem_split_swap_entry()
and update index and swap.val with the returned order, we can avoid the extra
cost for the mentioned racy case.
On Wed, Jun 18, 2025 at 2:27 PM Kemeng Shi <shikemeng@huaweicloud.com> wrote:
>
> ...
>
> For the fallback to order 0, we always called shmem_swapin_cluster() before,
> but we will call shmem_swap_alloc_folio() now. It seems fine to me; just
> pointing this out for others to recheck.

It's an improvement, I forgot to mention that in the commit message.
Readahead is a burden for SWP_SYNCHRONOUS_IO devices, so calling
shmem_swap_alloc_folio is better. I'll update the commit message.

> > alloced:
> > ...
> > + swap_order = folio_order(folio);
> > + if (order > swap_order) {
> > + error = shmem_split_swap_entry(inode, index, swap, gfp);
> > + if (error)
> > + goto failed_nolock;
> > + }
> > +
> > + index = round_down(index, 1 << swap_order);
> > + swap.val = round_down(swap.val, 1 << swap_order);
> > +
>
> If the swap entry order is reduced but index and swap.val remain unchanged,
> shmem_split_swap_entry() will split the reduced large swap entry successfully,
> but index and swap.val will be incorrect because of the wrong swap_order. We
> can catch the unexpected order and swap entry in shmem_add_to_page_cache() and
> retry the swapin, but this introduces extra cost.
>
> If we return the order of the entry that was split in shmem_split_swap_entry()
> and update index and swap.val with the returned order, we can avoid the extra
> cost for the mentioned racy case.

The swap_order here is simply the folio's order, so doing this round_down
will get the swap entry and index that will be covered by this folio (the
later folio->swap.val != swap.val check ensures the values are valid here).

Remember the previous patch mentioned that we may see the shmem mapping's
entry split but still see a large folio here. With the current design,
shmem_add_to_page_cache can still succeed as it should, but if we round_down
with the order returned by shmem_split_swap_entry, it will fail.

So I think it is better to keep it this way; besides, the next patch makes
use of this for sanity checks and reducing false positives of swap cache
lookups.
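
To make the realignment concrete, below is a small userspace-only illustration
with made-up values (a 16-slot entry in the mapping and an order-2 folio from
the swap cache); it only mirrors the two round_down() lines of the patch and is
not kernel code:

	#include <stdio.h>

	/* Simplified power-of-two round_down, same effect as the kernel helper here. */
	#define round_down(x, y) ((x) & ~((unsigned long)(y) - 1))

	int main(void)
	{
		int order = 4;			/* order of the large entry in the shmem mapping */
		int swap_order = 2;		/* order of the folio actually found or swapped in */
		unsigned long index = 35;	/* faulting index inside the 16-slot entry (32..47) */
		unsigned long swap_val = 0x120 + (index - 32);	/* made-up per-slot swap value */

		/* Realign to the window the order-2 folio can actually cover. */
		index = round_down(index, 1UL << swap_order);		/* 35 -> 32 */
		swap_val = round_down(swap_val, 1UL << swap_order);	/* 0x123 -> 0x120 */

		/*
		 * If the folio really starts at swap value 0x120, the later
		 * folio->swap.val != swap.val check passes; if it covers some
		 * other part of the large entry, the check fails and swapin
		 * is retried.
		 */
		printf("entry order %d, folio order %d -> index=%lu swap=%#lx\n",
		       order, swap_order, index, swap_val);
		return 0;
	}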
on 6/18/2025 2:50 PM, Kairui Song wrote:
> On Wed, Jun 18, 2025 at 2:27 PM Kemeng Shi <shikemeng@huaweicloud.com> wrote:
>>
>> ...
>>
>> If we return the order of the entry that was split in shmem_split_swap_entry()
>> and update index and swap.val with the returned order, we can avoid the extra
>> cost for the mentioned racy case.
>
> The swap_order here is simply the folio's order, so doing this round_down
> will get the swap entry and index that will be covered by this folio (the
> later folio->swap.val != swap.val check ensures the values are valid here).
>
> Remember the previous patch mentioned that we may see the shmem mapping's
> entry split but still see a large folio here. With the current design,
> shmem_add_to_page_cache can still succeed as it should, but if we round_down
> with the order returned by shmem_split_swap_entry, it will fail.

My bad... Thanks for explanation. The calculation looks fine to me.

>
> So I think it is better to keep it this way; besides, the next patch makes
> use of this for sanity checks and reducing false positives of swap cache
> lookups.
>