From: Kairui Song <kasong@tencent.com>

Currently shmem calls xa_get_order to get the swap radix entry order,
requiring a full tree walk. This can be easily combined with the swap
entry value checking (shmem_confirm_swap) to avoid the duplicated
lookup, which should improve the performance.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/shmem.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 4e7ef343a29b..0ad49e57f736 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,

 /*
  * Sometimes, before we decide whether to proceed or to fail, we must check
- * that an entry was not already brought back from swap by a racing thread.
+ * that an entry was not already brought back or split by a racing thread.
  *
  * Checking folio is not enough: by the time a swapcache folio is locked, it
  * might be reused, and again be swapcache, using the same swap as before.
+ * Returns the swap entry's order if it still presents, else returns -1.
  */
-static bool shmem_confirm_swap(struct address_space *mapping,
-			       pgoff_t index, swp_entry_t swap)
+static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
+				  swp_entry_t swap)
 {
-	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
+	XA_STATE(xas, &mapping->i_pages, index);
+	int ret = -1;
+	void *entry;
+
+	rcu_read_lock();
+	do {
+		entry = xas_load(&xas);
+		if (entry == swp_to_radix_entry(swap))
+			ret = xas_get_order(&xas);
+	} while (xas_retry(&xas, entry));
+	rcu_read_unlock();
+	return ret;
 }

 /*
@@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		return -EIO;

 	si = get_swap_device(swap);
-	if (!si) {
-		if (!shmem_confirm_swap(mapping, index, swap))
+	order = shmem_swap_check_entry(mapping, index, swap);
+	if (unlikely(!si)) {
+		if (order < 0)
 			return -EEXIST;
 		else
 			return -EINVAL;
 	}
+	if (unlikely(order < 0)) {
+		put_swap_device(si);
+		return -EEXIST;
+	}

 	/* Look it up and read it in.. */
 	folio = swap_cache_get_folio(swap, NULL, 0);
-	order = xa_get_order(&mapping->i_pages, index);
 	if (!folio) {
 		int nr_pages = 1 << order;
 		bool fallback_order0 = false;
@@ -2415,7 +2431,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	*foliop = folio;
 	return 0;
 failed:
-	if (!shmem_confirm_swap(mapping, index, swap))
+	if (shmem_swap_check_entry(mapping, index, swap) < 0)
 		error = -EEXIST;
 	if (error == -EIO)
 		shmem_set_folio_swapin_error(inode, index, folio, swap,
@@ -2428,7 +2444,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		folio_put(folio);
 	}
 	put_swap_device(si);
-
 	return error;
 }

--
2.50.0
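
For readers skimming the thread, the change the commit message describes boils
down to the sketch below. This is illustrative only, not the patch itself: the
two helper names are made up here and error handling is omitted; the real code
is in the diff above.

/* Before: two independent walks of mapping->i_pages for the same index. */
static int swap_entry_order_two_walks(struct address_space *mapping,
				      pgoff_t index, swp_entry_t swap)
{
	if (xa_load(&mapping->i_pages, index) != swp_to_radix_entry(swap))
		return -1;				/* walk #1: confirm the entry */
	return xa_get_order(&mapping->i_pages, index);	/* walk #2: get its order */
}

/* After: one walk does both, reusing the xa_state cursor from the load. */
static int swap_entry_order_one_walk(struct address_space *mapping,
				     pgoff_t index, swp_entry_t swap)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *entry;
	int ret = -1;

	rcu_read_lock();
	do {
		entry = xas_load(&xas);			/* single tree walk */
		if (entry == swp_to_radix_entry(swap))
			ret = xas_get_order(&xas);	/* no second walk needed */
	} while (xas_retry(&xas, entry));
	rcu_read_unlock();
	return ret;
}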
On 18/06/25 12:05 am, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
>
> Currently shmem calls xa_get_order to get the swap radix entry order,
> requiring a full tree walk. This can be easily combined with the swap
> entry value checking (shmem_confirm_swap) to avoid the duplicated
> lookup, which should improve the performance.
Nice spot!
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
> mm/shmem.c | 33 ++++++++++++++++++++++++---------
> 1 file changed, 24 insertions(+), 9 deletions(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 4e7ef343a29b..0ad49e57f736 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,
>
> /*
> * Sometimes, before we decide whether to proceed or to fail, we must check
> - * that an entry was not already brought back from swap by a racing thread.
> + * that an entry was not already brought back or split by a racing thread.
> *
> * Checking folio is not enough: by the time a swapcache folio is locked, it
> * might be reused, and again be swapcache, using the same swap as before.
> + * Returns the swap entry's order if it still presents, else returns -1.
> */
> -static bool shmem_confirm_swap(struct address_space *mapping,
> - pgoff_t index, swp_entry_t swap)
> +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
> + swp_entry_t swap)
I think the function name shmem_confirm_swap is already good enough? Anyhow the
changed name should at least be shmem_check_entry_is_swap.
> {
> - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
> + XA_STATE(xas, &mapping->i_pages, index);
> + int ret = -1;
> + void *entry;
> +
> + rcu_read_lock();
> + do {
> + entry = xas_load(&xas);
> + if (entry == swp_to_radix_entry(swap))
> + ret = xas_get_order(&xas);
> + } while (xas_retry(&xas, entry));
> + rcu_read_unlock();
> + return ret;
> }
>
> /*
> @@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> return -EIO;
>
> si = get_swap_device(swap);
> - if (!si) {
> - if (!shmem_confirm_swap(mapping, index, swap))
> + order = shmem_swap_check_entry(mapping, index, swap);
> + if (unlikely(!si)) {
> + if (order < 0)
> return -EEXIST;
> else
> return -EINVAL;
> }
> + if (unlikely(order < 0)) {
> + put_swap_device(si);
> + return -EEXIST;
> + }
>
> /* Look it up and read it in.. */
> folio = swap_cache_get_folio(swap, NULL, 0);
> - order = xa_get_order(&mapping->i_pages, index);
> if (!folio) {
> int nr_pages = 1 << order;
> bool fallback_order0 = false;
> @@ -2415,7 +2431,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> *foliop = folio;
> return 0;
> failed:
> - if (!shmem_confirm_swap(mapping, index, swap))
> + if (shmem_swap_check_entry(mapping, index, swap) < 0)
> error = -EEXIST;
> if (error == -EIO)
> shmem_set_folio_swapin_error(inode, index, folio, swap,
> @@ -2428,7 +2444,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> folio_put(folio);
> }
> put_swap_device(si);
> -
> return error;
> }
>
On Wed, Jun 18, 2025 at 3:17 PM Dev Jain <dev.jain@arm.com> wrote:
>
> On 18/06/25 12:05 am, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently shmem calls xa_get_order to get the swap radix entry order,
> > requiring a full tree walk. This can be easily combined with the swap
> > entry value checking (shmem_confirm_swap) to avoid the duplicated
> > lookup, which should improve the performance.
>
> Nice spot!
>
> > -static bool shmem_confirm_swap(struct address_space *mapping,
> > -                               pgoff_t index, swp_entry_t swap)
> > +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
> > +                                  swp_entry_t swap)
>
> I think the function name shmem_confirm_swap is already good enough? Anyhow the
> changed name should at least be shmem_check_entry_is_swap.
>

Good, I can keep the function name unchanged or follow your
suggestion, I thought a `confirm` function returning non-binary return
value may look strange. I'm terrible at naming things :P
On 18/06/25 12:52 pm, Kairui Song wrote:
> On Wed, Jun 18, 2025 at 3:17 PM Dev Jain <dev.jain@arm.com> wrote:
>>
>> I think the function name shmem_confirm_swap is already good enough? Anyhow the
>> changed name should at least be shmem_check_entry_is_swap.
>>
> Good, I can keep the function name unchanged or follow your
> suggestion, I thought a `confirm` function returning non-binary return

True. I will vote for keeping the name unchanged; you have already documented
the return value so it should be fine. Just can you put a new line between
"Returns the swap entry's order..." and the previous line to make it clear.

> value may look strange. I'm terrible at naming things :P
on 6/18/2025 2:35 AM, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
>
> Currently shmem calls xa_get_order to get the swap radix entry order,
> requiring a full tree walk. This can be easily combined with the swap
> entry value checking (shmem_confirm_swap) to avoid the duplicated
> lookup, which should improve the performance.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
> mm/shmem.c | 33 ++++++++++++++++++++++++---------
> 1 file changed, 24 insertions(+), 9 deletions(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 4e7ef343a29b..0ad49e57f736 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,
>
> /*
> * Sometimes, before we decide whether to proceed or to fail, we must check
> - * that an entry was not already brought back from swap by a racing thread.
> + * that an entry was not already brought back or split by a racing thread.
> *
> * Checking folio is not enough: by the time a swapcache folio is locked, it
> * might be reused, and again be swapcache, using the same swap as before.
> + * Returns the swap entry's order if it still presents, else returns -1.
> */
> -static bool shmem_confirm_swap(struct address_space *mapping,
> - pgoff_t index, swp_entry_t swap)
> +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
> + swp_entry_t swap)
> {
> - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
> + XA_STATE(xas, &mapping->i_pages, index);
> + int ret = -1;
> + void *entry;
> +
> + rcu_read_lock();
> + do {
> + entry = xas_load(&xas);
> + if (entry == swp_to_radix_entry(swap))
> + ret = xas_get_order(&xas);
> + } while (xas_retry(&xas, entry));
> + rcu_read_unlock();
> + return ret;
> }
>
> /*
> @@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> return -EIO;
>
> si = get_swap_device(swap);
> - if (!si) {
> - if (!shmem_confirm_swap(mapping, index, swap))
> + order = shmem_swap_check_entry(mapping, index, swap);
> + if (unlikely(!si)) {
> + if (order < 0)
> return -EEXIST;
> else
> return -EINVAL;
> }
> + if (unlikely(order < 0)) {
> + put_swap_device(si);
> + return -EEXIST;
> + }
Can we re-arrange the code block as following:
	order = shmem_swap_check_entry(mapping, index, swap);
	if (unlikely(order < 0))
		return -EEXIST;

	si = get_swap_device(swap);
	if (!si) {
		return -EINVAL;
	...
>
> /* Look it up and read it in.. */
> folio = swap_cache_get_folio(swap, NULL, 0);
> - order = xa_get_order(&mapping->i_pages, index);
> if (!folio) {
> int nr_pages = 1 << order;
> bool fallback_order0 = false;
> @@ -2415,7 +2431,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> *foliop = folio;
> return 0;
> failed:
> - if (!shmem_confirm_swap(mapping, index, swap))
> + if (shmem_swap_check_entry(mapping, index, swap) < 0)
> error = -EEXIST;
> if (error == -EIO)
> shmem_set_folio_swapin_error(inode, index, folio, swap,
> @@ -2428,7 +2444,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> folio_put(folio);
> }
> put_swap_device(si);
> -
> return error;
> }
>
>
On Wed, Jun 18, 2025 at 10:49 AM Kemeng Shi <shikemeng@huaweicloud.com> wrote:
> on 6/18/2025 2:35 AM, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently shmem calls xa_get_order to get the swap radix entry order,
> > requiring a full tree walk. This can be easily combined with the swap
> > entry value checking (shmem_confirm_swap) to avoid the duplicated
> > lookup, which should improve the performance.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> > mm/shmem.c | 33 ++++++++++++++++++++++++---------
> > 1 file changed, 24 insertions(+), 9 deletions(-)
> >
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 4e7ef343a29b..0ad49e57f736 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,
> >
> > /*
> > * Sometimes, before we decide whether to proceed or to fail, we must check
> > - * that an entry was not already brought back from swap by a racing thread.
> > + * that an entry was not already brought back or split by a racing thread.
> > *
> > * Checking folio is not enough: by the time a swapcache folio is locked, it
> > * might be reused, and again be swapcache, using the same swap as before.
> > + * Returns the swap entry's order if it still presents, else returns -1.
> > */
> > -static bool shmem_confirm_swap(struct address_space *mapping,
> > - pgoff_t index, swp_entry_t swap)
> > +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
> > + swp_entry_t swap)
> > {
> > - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
> > + XA_STATE(xas, &mapping->i_pages, index);
> > + int ret = -1;
> > + void *entry;
> > +
> > + rcu_read_lock();
> > + do {
> > + entry = xas_load(&xas);
> > + if (entry == swp_to_radix_entry(swap))
> > + ret = xas_get_order(&xas);
> > + } while (xas_retry(&xas, entry));
> > + rcu_read_unlock();
> > + return ret;
> > }
> >
> > /*
> > @@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> > return -EIO;
> >
> > si = get_swap_device(swap);
> > - if (!si) {
> > - if (!shmem_confirm_swap(mapping, index, swap))
> > + order = shmem_swap_check_entry(mapping, index, swap);
> > + if (unlikely(!si)) {
> > + if (order < 0)
> > return -EEXIST;
> > else
> > return -EINVAL;
> > }
> > + if (unlikely(order < 0)) {
> > + put_swap_device(si);
> > + return -EEXIST;
> > + }
> Can we re-arrange the code block as following:
> order = shmem_swap_check_entry(mapping, index, swap);
> if (unlikely(order < 0))
> return -EEXIST;
>
> si = get_swap_device(swap);
> if (!si) {
> return -EINVAL;
> ...
Hi, thanks for the suggestion.

This may lead to a slightly higher chance of getting -EINVAL when it
should return -EEXIST, leading to user space errors.

For example, if this CPU gets interrupted after `order =
shmem_swap_check_entry(mapping, index, swap);` and another CPU
swapoff-ed the device, we then get `si = NULL` here, but the entry has
already been swapped in, so it should return -EEXIST, not -EINVAL.

The chance is really low so it's kind of trivial; we could do a `goto
failed` if we got (!si) here, but that would make the logic under
`failed:` more complex. So I'd prefer to not change the original
behaviour, which looks more correct.
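
Just to spell the window out, here is a rough userspace sketch -- not kernel
code, everything below is made up for illustration -- of which error each
ordering ends up reporting when the racing swapin + swapoff lands between the
two checks:

#include <stdbool.h>
#include <stdio.h>

#define EEXIST 17
#define EINVAL 22

struct world {
	bool entry_is_swap;	/* mapping still holds the swap radix entry */
	bool device_alive;	/* swapoff has not completed yet */
};

/* The racing CPU: swaps the entry in, then swapoffs the device. */
static void racing_cpu(struct world *w)
{
	w->entry_is_swap = false;
	w->device_alive = false;
}

/* Patch ordering: get_swap_device() first, entry check right after it. */
static int patch_ordering(struct world w)
{
	bool have_si = w.device_alive;		/* get_swap_device() */
	racing_cpu(&w);				/* worst-case race point */
	int order = w.entry_is_swap ? 0 : -1;	/* shmem_swap_check_entry() */

	if (!have_si)
		return order < 0 ? -EEXIST : -EINVAL;
	if (order < 0)
		return -EEXIST;			/* put_swap_device() in real code */
	return 0;
}

/* Rearranged ordering: entry check first, get_swap_device() second. */
static int rearranged_ordering(struct world w)
{
	int order = w.entry_is_swap ? 0 : -1;	/* entry still looks like swap */
	racing_cpu(&w);				/* swapin + swapoff happen here */
	bool have_si = w.device_alive;		/* get_swap_device() -> NULL */

	if (order < 0)
		return -EEXIST;
	if (!have_si)
		return -EINVAL;			/* -EEXIST would be the right answer */
	return 0;
}

int main(void)
{
	struct world w = { .entry_is_swap = true, .device_alive = true };

	printf("patch ordering:      %d\n", patch_ordering(w));	/* -17 (-EEXIST) */
	printf("rearranged ordering: %d\n", rearranged_ordering(w));	/* -22 (-EINVAL) */
	return 0;
}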
on 6/18/2025 11:07 AM, Kairui Song wrote:
> On Wed, Jun 18, 2025 at 10:49 AM Kemeng Shi <shikemeng@huaweicloud.com> wrote:
>> on 6/18/2025 2:35 AM, Kairui Song wrote:
>>> From: Kairui Song <kasong@tencent.com>
>>>
>>> Currently shmem calls xa_get_order to get the swap radix entry order,
>>> requiring a full tree walk. This can be easily combined with the swap
>>> entry value checking (shmem_confirm_swap) to avoid the duplicated
>>> lookup, which should improve the performance.
>>>
>>> Signed-off-by: Kairui Song <kasong@tencent.com>
>>> ---
>>> mm/shmem.c | 33 ++++++++++++++++++++++++---------
>>> 1 file changed, 24 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>> index 4e7ef343a29b..0ad49e57f736 100644
>>> --- a/mm/shmem.c
>>> +++ b/mm/shmem.c
>>> @@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,
>>>
>>> /*
>>> * Sometimes, before we decide whether to proceed or to fail, we must check
>>> - * that an entry was not already brought back from swap by a racing thread.
>>> + * that an entry was not already brought back or split by a racing thread.
>>> *
>>> * Checking folio is not enough: by the time a swapcache folio is locked, it
>>> * might be reused, and again be swapcache, using the same swap as before.
>>> + * Returns the swap entry's order if it still presents, else returns -1.
>>> */
>>> -static bool shmem_confirm_swap(struct address_space *mapping,
>>> - pgoff_t index, swp_entry_t swap)
>>> +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
>>> + swp_entry_t swap)
>>> {
>>> - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
>>> + XA_STATE(xas, &mapping->i_pages, index);
>>> + int ret = -1;
>>> + void *entry;
>>> +
>>> + rcu_read_lock();
>>> + do {
>>> + entry = xas_load(&xas);
>>> + if (entry == swp_to_radix_entry(swap))
>>> + ret = xas_get_order(&xas);
>>> + } while (xas_retry(&xas, entry));
>>> + rcu_read_unlock();
>>> + return ret;
>>> }
>>>
>>> /*
>>> @@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>>> return -EIO;
>>>
>>> si = get_swap_device(swap);
>>> - if (!si) {
>>> - if (!shmem_confirm_swap(mapping, index, swap))
>>> + order = shmem_swap_check_entry(mapping, index, swap);
>>> + if (unlikely(!si)) {
>>> + if (order < 0)
>>> return -EEXIST;
>>> else
>>> return -EINVAL;
>>> }
>>> + if (unlikely(order < 0)) {
>>> + put_swap_device(si);
>>> + return -EEXIST;
>>> + }
>> Can we re-arrange the code block as following:
>> order = shmem_swap_check_entry(mapping, index, swap);
>> if (unlikely(order < 0))
>> return -EEXIST;
>>
>> si = get_swap_device(swap);
>> if (!si) {
>> return -EINVAL;
>> ...
>
> Hi, thanks for the suggestion.
>
> This may lead to a slightly higher chance of getting -EINVAL when it
> should return -EEXIST, leading to user space errors.
>
> For example, if this CPU gets interrupted after `order =
> shmem_swap_check_entry(mapping, index, swap);` and another CPU
> swapoff-ed the device, we then get `si = NULL` here, but the entry has
> already been swapped in, so it should return -EEXIST, not -EINVAL.
>
> The chance is really low so it's kind of trivial; we could do a `goto
> failed` if we got (!si) here, but that would make the logic under
> `failed:` more complex. So I'd prefer to not change the original
> behaviour, which looks more correct.
>
Right, thanks for the explanation.
Reviewed-by: Kemeng Shi <shikemeng@huaweicloud.com>