From: Kairui Song <kasong@tencent.com>

Currently shmem calls xa_get_order to get the swap radix entry order,
requiring a full tree walk. This can be easily combined with the swap
entry value checking (shmem_confirm_swap) to avoid the duplicated
lookup, which should improve the performance.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/shmem.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 4e7ef343a29b..0ad49e57f736 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,

 /*
  * Sometimes, before we decide whether to proceed or to fail, we must check
- * that an entry was not already brought back from swap by a racing thread.
+ * that an entry was not already brought back or split by a racing thread.
  *
  * Checking folio is not enough: by the time a swapcache folio is locked, it
  * might be reused, and again be swapcache, using the same swap as before.
+ * Returns the swap entry's order if it still presents, else returns -1.
  */
-static bool shmem_confirm_swap(struct address_space *mapping,
-			       pgoff_t index, swp_entry_t swap)
+static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
+				  swp_entry_t swap)
 {
-	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
+	XA_STATE(xas, &mapping->i_pages, index);
+	int ret = -1;
+	void *entry;
+
+	rcu_read_lock();
+	do {
+		entry = xas_load(&xas);
+		if (entry == swp_to_radix_entry(swap))
+			ret = xas_get_order(&xas);
+	} while (xas_retry(&xas, entry));
+	rcu_read_unlock();
+	return ret;
 }

 /*
@@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		return -EIO;

 	si = get_swap_device(swap);
-	if (!si) {
-		if (!shmem_confirm_swap(mapping, index, swap))
+	order = shmem_swap_check_entry(mapping, index, swap);
+	if (unlikely(!si)) {
+		if (order < 0)
 			return -EEXIST;
 		else
 			return -EINVAL;
 	}
+	if (unlikely(order < 0)) {
+		put_swap_device(si);
+		return -EEXIST;
+	}

 	/* Look it up and read it in.. */
 	folio = swap_cache_get_folio(swap, NULL, 0);
-	order = xa_get_order(&mapping->i_pages, index);
 	if (!folio) {
 		int nr_pages = 1 << order;
 		bool fallback_order0 = false;
@@ -2415,7 +2431,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	*foliop = folio;
 	return 0;
 failed:
-	if (!shmem_confirm_swap(mapping, index, swap))
+	if (shmem_swap_check_entry(mapping, index, swap) < 0)
 		error = -EEXIST;
 	if (error == -EIO)
 		shmem_set_folio_swapin_error(inode, index, folio, swap,
@@ -2428,7 +2444,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		folio_put(folio);
 	}
 	put_swap_device(si);
-
 	return error;
 }

--
2.50.0
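
For readers skimming the thread, the change the commit message describes boils
down to the sketch below. This is illustrative only, not the patch itself: the
two helper names are made up here and error handling is omitted; the real code
is in the diff above.

/* Before: two independent walks of mapping->i_pages for the same index. */
static int swap_entry_order_two_walks(struct address_space *mapping,
				      pgoff_t index, swp_entry_t swap)
{
	if (xa_load(&mapping->i_pages, index) != swp_to_radix_entry(swap))
		return -1;				/* walk #1: confirm the entry */
	return xa_get_order(&mapping->i_pages, index);	/* walk #2: get its order */
}

/* After: one walk does both, reusing the xa_state cursor from the load. */
static int swap_entry_order_one_walk(struct address_space *mapping,
				     pgoff_t index, swp_entry_t swap)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *entry;
	int ret = -1;

	rcu_read_lock();
	do {
		entry = xas_load(&xas);			/* single tree walk */
		if (entry == swp_to_radix_entry(swap))
			ret = xas_get_order(&xas);	/* no second walk needed */
	} while (xas_retry(&xas, entry));
	rcu_read_unlock();
	return ret;
}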
On 18/06/25 12:05 am, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
>
> Currently shmem calls xa_get_order to get the swap radix entry order,
> requiring a full tree walk. This can be easily combined with the swap
> entry value checking (shmem_confirm_swap) to avoid the duplicated
> lookup, which should improve the performance.
Nice spot!
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
> mm/shmem.c | 33 ++++++++++++++++++++++++---------
> 1 file changed, 24 insertions(+), 9 deletions(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 4e7ef343a29b..0ad49e57f736 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,
>
> /*
> * Sometimes, before we decide whether to proceed or to fail, we must check
> - * that an entry was not already brought back from swap by a racing thread.
> + * that an entry was not already brought back or split by a racing thread.
> *
> * Checking folio is not enough: by the time a swapcache folio is locked, it
> * might be reused, and again be swapcache, using the same swap as before.
> + * Returns the swap entry's order if it still presents, else returns -1.
> */
> -static bool shmem_confirm_swap(struct address_space *mapping,
> - pgoff_t index, swp_entry_t swap)
> +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
> + swp_entry_t swap)
I think the function name shmem_confirm_swap is already good enough? Anyhow the
changed name should at least be shmem_check_entry_is_swap.
> {
> - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
> + XA_STATE(xas, &mapping->i_pages, index);
> + int ret = -1;
> + void *entry;
> +
> + rcu_read_lock();
> + do {
> + entry = xas_load(&xas);
> + if (entry == swp_to_radix_entry(swap))
> + ret = xas_get_order(&xas);
> + } while (xas_retry(&xas, entry));
> + rcu_read_unlock();
> + return ret;
> }
>
> /*
> @@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> return -EIO;
>
> si = get_swap_device(swap);
> - if (!si) {
> - if (!shmem_confirm_swap(mapping, index, swap))
> + order = shmem_swap_check_entry(mapping, index, swap);
> + if (unlikely(!si)) {
> + if (order < 0)
> return -EEXIST;
> else
> return -EINVAL;
> }
> + if (unlikely(order < 0)) {
> + put_swap_device(si);
> + return -EEXIST;
> + }
>
> /* Look it up and read it in.. */
> folio = swap_cache_get_folio(swap, NULL, 0);
> - order = xa_get_order(&mapping->i_pages, index);
> if (!folio) {
> int nr_pages = 1 << order;
> bool fallback_order0 = false;
> @@ -2415,7 +2431,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> *foliop = folio;
> return 0;
> failed:
> - if (!shmem_confirm_swap(mapping, index, swap))
> + if (shmem_swap_check_entry(mapping, index, swap) < 0)
> error = -EEXIST;
> if (error == -EIO)
> shmem_set_folio_swapin_error(inode, index, folio, swap,
> @@ -2428,7 +2444,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> folio_put(folio);
> }
> put_swap_device(si);
> -
> return error;
> }
>
On Wed, Jun 18, 2025 at 3:17 PM Dev Jain <dev.jain@arm.com> wrote:
>
> On 18/06/25 12:05 am, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently shmem calls xa_get_order to get the swap radix entry order,
> > requiring a full tree walk. This can be easily combined with the swap
> > entry value checking (shmem_confirm_swap) to avoid the duplicated
> > lookup, which should improve the performance.
>
> Nice spot!
>
> > -static bool shmem_confirm_swap(struct address_space *mapping,
> > -                               pgoff_t index, swp_entry_t swap)
> > +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
> > +                                  swp_entry_t swap)
>
> I think the function name shmem_confirm_swap is already good enough? Anyhow the
> changed name should at least be shmem_check_entry_is_swap.
>

Good, I can keep the function name unchanged or follow your
suggestion, I thought a `confirm` function returning non-binary return
value may look strange. I'm terrible at naming things :P
On 18/06/25 12:52 pm, Kairui Song wrote:
> On Wed, Jun 18, 2025 at 3:17 PM Dev Jain <dev.jain@arm.com> wrote:
>>
>> I think the function name shmem_confirm_swap is already good enough? Anyhow the
>> changed name should at least be shmem_check_entry_is_swap.
>>
> Good, I can keep the function name unchanged or follow your
> suggestion, I thought a `confirm` function returning non-binary return

True. I will vote for keeping the name unchanged; you have already documented
the return value so it should be fine. Just can you put a new line between
"Returns the swap entry's order..." and the previous line to make it clear.

> value may look strange. I'm terrible at naming things :P
on 6/18/2025 2:35 AM, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
>
> Currently shmem calls xa_get_order to get the swap radix entry order,
> requiring a full tree walk. This can be easily combined with the swap
> entry value checking (shmem_confirm_swap) to avoid the duplicated
> lookup, which should improve the performance.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
> mm/shmem.c | 33 ++++++++++++++++++++++++---------
> 1 file changed, 24 insertions(+), 9 deletions(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 4e7ef343a29b..0ad49e57f736 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,
>
> /*
> * Sometimes, before we decide whether to proceed or to fail, we must check
> - * that an entry was not already brought back from swap by a racing thread.
> + * that an entry was not already brought back or split by a racing thread.
> *
> * Checking folio is not enough: by the time a swapcache folio is locked, it
> * might be reused, and again be swapcache, using the same swap as before.
> + * Returns the swap entry's order if it still presents, else returns -1.
> */
> -static bool shmem_confirm_swap(struct address_space *mapping,
> - pgoff_t index, swp_entry_t swap)
> +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
> + swp_entry_t swap)
> {
> - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
> + XA_STATE(xas, &mapping->i_pages, index);
> + int ret = -1;
> + void *entry;
> +
> + rcu_read_lock();
> + do {
> + entry = xas_load(&xas);
> + if (entry == swp_to_radix_entry(swap))
> + ret = xas_get_order(&xas);
> + } while (xas_retry(&xas, entry));
> + rcu_read_unlock();
> + return ret;
> }
>
> /*
> @@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> return -EIO;
>
> si = get_swap_device(swap);
> - if (!si) {
> - if (!shmem_confirm_swap(mapping, index, swap))
> + order = shmem_swap_check_entry(mapping, index, swap);
> + if (unlikely(!si)) {
> + if (order < 0)
> return -EEXIST;
> else
> return -EINVAL;
> }
> + if (unlikely(order < 0)) {
> + put_swap_device(si);
> + return -EEXIST;
> + }
Can we re-arrange the code block as following:
	order = shmem_swap_check_entry(mapping, index, swap);
	if (unlikely(order < 0))
		return -EEXIST;

	si = get_swap_device(swap);
	if (!si) {
		return -EINVAL;
	...
>
> /* Look it up and read it in.. */
> folio = swap_cache_get_folio(swap, NULL, 0);
> - order = xa_get_order(&mapping->i_pages, index);
> if (!folio) {
> int nr_pages = 1 << order;
> bool fallback_order0 = false;
> @@ -2415,7 +2431,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> *foliop = folio;
> return 0;
> failed:
> - if (!shmem_confirm_swap(mapping, index, swap))
> + if (shmem_swap_check_entry(mapping, index, swap) < 0)
> error = -EEXIST;
> if (error == -EIO)
> shmem_set_folio_swapin_error(inode, index, folio, swap,
> @@ -2428,7 +2444,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> folio_put(folio);
> }
> put_swap_device(si);
> -
> return error;
> }
>
>
On Wed, Jun 18, 2025 at 10:49 AM Kemeng Shi <shikemeng@huaweicloud.com> wrote:
> on 6/18/2025 2:35 AM, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently shmem calls xa_get_order to get the swap radix entry order,
> > requiring a full tree walk. This can be easily combined with the swap
> > entry value checking (shmem_confirm_swap) to avoid the duplicated
> > lookup, which should improve the performance.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> > mm/shmem.c | 33 ++++++++++++++++++++++++---------
> > 1 file changed, 24 insertions(+), 9 deletions(-)
> >
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 4e7ef343a29b..0ad49e57f736 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,
> >
> > /*
> > * Sometimes, before we decide whether to proceed or to fail, we must check
> > - * that an entry was not already brought back from swap by a racing thread.
> > + * that an entry was not already brought back or split by a racing thread.
> > *
> > * Checking folio is not enough: by the time a swapcache folio is locked, it
> > * might be reused, and again be swapcache, using the same swap as before.
> > + * Returns the swap entry's order if it still presents, else returns -1.
> > */
> > -static bool shmem_confirm_swap(struct address_space *mapping,
> > - pgoff_t index, swp_entry_t swap)
> > +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
> > + swp_entry_t swap)
> > {
> > - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
> > + XA_STATE(xas, &mapping->i_pages, index);
> > + int ret = -1;
> > + void *entry;
> > +
> > + rcu_read_lock();
> > + do {
> > + entry = xas_load(&xas);
> > + if (entry == swp_to_radix_entry(swap))
> > + ret = xas_get_order(&xas);
> > + } while (xas_retry(&xas, entry));
> > + rcu_read_unlock();
> > + return ret;
> > }
> >
> > /*
> > @@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> > return -EIO;
> >
> > si = get_swap_device(swap);
> > - if (!si) {
> > - if (!shmem_confirm_swap(mapping, index, swap))
> > + order = shmem_swap_check_entry(mapping, index, swap);
> > + if (unlikely(!si)) {
> > + if (order < 0)
> > return -EEXIST;
> > else
> > return -EINVAL;
> > }
> > + if (unlikely(order < 0)) {
> > + put_swap_device(si);
> > + return -EEXIST;
> > + }
> Can we re-arrange the code block as following:
> order = shmem_swap_check_entry(mapping, index, swap);
> if (unlikely(order < 0))
> return -EEXIST;
>
> si = get_swap_device(swap);
> if (!si) {
> return -EINVAL;
> ...
Hi, thanks for the suggestion.

This may lead to a slightly higher chance of getting -EINVAL when it
should return -EEXIST, leading to user space errors.

For example, if this CPU gets interrupted after `order =
shmem_swap_check_entry(mapping, index, swap);` and another CPU
swapoff-ed the device, we then get `si = NULL` here, but the entry has
already been swapped in, so it should return -EEXIST, not -EINVAL.

The chance is really low so it's kind of trivial; we could do a `goto
failed` if we got (!si) here, but that would make the logic under
`failed:` more complex. So I'd prefer to not change the original
behaviour, which looks more correct.
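
Just to spell the window out, here is a rough userspace sketch -- not kernel
code, everything below is made up for illustration -- of which error each
ordering ends up reporting when the racing swapin + swapoff lands between the
two checks:

#include <stdbool.h>
#include <stdio.h>

#define EEXIST 17
#define EINVAL 22

struct world {
	bool entry_is_swap;	/* mapping still holds the swap radix entry */
	bool device_alive;	/* swapoff has not completed yet */
};

/* The racing CPU: swaps the entry in, then swapoffs the device. */
static void racing_cpu(struct world *w)
{
	w->entry_is_swap = false;
	w->device_alive = false;
}

/* Patch ordering: get_swap_device() first, entry check right after it. */
static int patch_ordering(struct world w)
{
	bool have_si = w.device_alive;		/* get_swap_device() */
	racing_cpu(&w);				/* worst-case race point */
	int order = w.entry_is_swap ? 0 : -1;	/* shmem_swap_check_entry() */

	if (!have_si)
		return order < 0 ? -EEXIST : -EINVAL;
	if (order < 0)
		return -EEXIST;			/* put_swap_device() in real code */
	return 0;
}

/* Rearranged ordering: entry check first, get_swap_device() second. */
static int rearranged_ordering(struct world w)
{
	int order = w.entry_is_swap ? 0 : -1;	/* entry still looks like swap */
	racing_cpu(&w);				/* swapin + swapoff happen here */
	bool have_si = w.device_alive;		/* get_swap_device() -> NULL */

	if (order < 0)
		return -EEXIST;
	if (!have_si)
		return -EINVAL;			/* -EEXIST would be the right answer */
	return 0;
}

int main(void)
{
	struct world w = { .entry_is_swap = true, .device_alive = true };

	printf("patch ordering:      %d\n", patch_ordering(w));	/* -17 (-EEXIST) */
	printf("rearranged ordering: %d\n", rearranged_ordering(w));	/* -22 (-EINVAL) */
	return 0;
}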
on 6/18/2025 11:07 AM, Kairui Song wrote:
> On Wed, Jun 18, 2025 at 10:49 AM Kemeng Shi <shikemeng@huaweicloud.com> wrote:
>> on 6/18/2025 2:35 AM, Kairui Song wrote:
>>> From: Kairui Song <kasong@tencent.com>
>>>
>>> Currently shmem calls xa_get_order to get the swap radix entry order,
>>> requiring a full tree walk. This can be easily combined with the swap
>>> entry value checking (shmem_confirm_swap) to avoid the duplicated
>>> lookup, which should improve the performance.
>>>
>>> Signed-off-by: Kairui Song <kasong@tencent.com>
>>> ---
>>> mm/shmem.c | 33 ++++++++++++++++++++++++---------
>>> 1 file changed, 24 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>> index 4e7ef343a29b..0ad49e57f736 100644
>>> --- a/mm/shmem.c
>>> +++ b/mm/shmem.c
>>> @@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,
>>>
>>> /*
>>> * Sometimes, before we decide whether to proceed or to fail, we must check
>>> - * that an entry was not already brought back from swap by a racing thread.
>>> + * that an entry was not already brought back or split by a racing thread.
>>> *
>>> * Checking folio is not enough: by the time a swapcache folio is locked, it
>>> * might be reused, and again be swapcache, using the same swap as before.
>>> + * Returns the swap entry's order if it still presents, else returns -1.
>>> */
>>> -static bool shmem_confirm_swap(struct address_space *mapping,
>>> - pgoff_t index, swp_entry_t swap)
>>> +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
>>> + swp_entry_t swap)
>>> {
>>> - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
>>> + XA_STATE(xas, &mapping->i_pages, index);
>>> + int ret = -1;
>>> + void *entry;
>>> +
>>> + rcu_read_lock();
>>> + do {
>>> + entry = xas_load(&xas);
>>> + if (entry == swp_to_radix_entry(swap))
>>> + ret = xas_get_order(&xas);
>>> + } while (xas_retry(&xas, entry));
>>> + rcu_read_unlock();
>>> + return ret;
>>> }
>>>
>>> /*
>>> @@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>>> return -EIO;
>>>
>>> si = get_swap_device(swap);
>>> - if (!si) {
>>> - if (!shmem_confirm_swap(mapping, index, swap))
>>> + order = shmem_swap_check_entry(mapping, index, swap);
>>> + if (unlikely(!si)) {
>>> + if (order < 0)
>>> return -EEXIST;
>>> else
>>> return -EINVAL;
>>> }
>>> + if (unlikely(order < 0)) {
>>> + put_swap_device(si);
>>> + return -EEXIST;
>>> + }
>> Can we re-arrange the code block as following:
>> order = shmem_swap_check_entry(mapping, index, swap);
>> if (unlikely(order < 0))
>> return -EEXIST;
>>
>> si = get_swap_device(swap);
>> if (!si) {
>> return -EINVAL;
>> ...
>
> Hi, thanks for the suggestion.
>
> This may lead to a slightly higher chance of getting -EINVAL when it
> should return -EEXIST, leading to user space errors.
>
> For example, if this CPU gets interrupted after `order =
> shmem_swap_check_entry(mapping, index, swap);` and another CPU
> swapoff-ed the device, we then get `si = NULL` here, but the entry has
> already been swapped in, so it should return -EEXIST, not -EINVAL.
>
> The chance is really low so it's kind of trivial; we could do a `goto
> failed` if we got (!si) here, but that would make the logic under
> `failed:` more complex. So I'd prefer to not change the original
> behaviour, which looks more correct.
>
Right, thanks for the explanation.
Reviewed-by: Kemeng Shi <shikemeng@huaweicloud.com>