From: Kairui Song <kasong@tencent.com>

Currently shmem calls xa_get_order to get the swap radix entry order,
requiring a full tree walk. This can be easily combined with the swap
entry value checking (shmem_confirm_swap) to avoid the duplicated
lookup, which should improve the performance.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/shmem.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 4e7ef343a29b..0ad49e57f736 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -505,15 +505,27 @@ static int shmem_replace_entry(struct address_space *mapping,
 
 /*
  * Sometimes, before we decide whether to proceed or to fail, we must check
- * that an entry was not already brought back from swap by a racing thread.
+ * that an entry was not already brought back or split by a racing thread.
  *
  * Checking folio is not enough: by the time a swapcache folio is locked, it
  * might be reused, and again be swapcache, using the same swap as before.
+ * Returns the swap entry's order if it still presents, else returns -1.
  */
-static bool shmem_confirm_swap(struct address_space *mapping,
-			       pgoff_t index, swp_entry_t swap)
+static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
+				  swp_entry_t swap)
 {
-	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
+	XA_STATE(xas, &mapping->i_pages, index);
+	int ret = -1;
+	void *entry;
+
+	rcu_read_lock();
+	do {
+		entry = xas_load(&xas);
+		if (entry == swp_to_radix_entry(swap))
+			ret = xas_get_order(&xas);
+	} while (xas_retry(&xas, entry));
+	rcu_read_unlock();
+	return ret;
 }
 
 /*
@@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		return -EIO;
 
 	si = get_swap_device(swap);
-	if (!si) {
-		if (!shmem_confirm_swap(mapping, index, swap))
+	order = shmem_swap_check_entry(mapping, index, swap);
+	if (unlikely(!si)) {
+		if (order < 0)
 			return -EEXIST;
 		else
 			return -EINVAL;
 	}
+	if (unlikely(order < 0)) {
+		put_swap_device(si);
+		return -EEXIST;
+	}
 
 	/* Look it up and read it in.. */
 	folio = swap_cache_get_folio(swap, NULL, 0);
-	order = xa_get_order(&mapping->i_pages, index);
 	if (!folio) {
 		int nr_pages = 1 << order;
 		bool fallback_order0 = false;
@@ -2415,7 +2431,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	*foliop = folio;
 	return 0;
 failed:
-	if (!shmem_confirm_swap(mapping, index, swap))
+	if (shmem_swap_check_entry(mapping, index, swap) < 0)
 		error = -EEXIST;
 	if (error == -EIO)
 		shmem_set_folio_swapin_error(inode, index, folio, swap,
@@ -2428,7 +2444,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		folio_put(folio);
 	}
 	put_swap_device(si);
-
 	return error;
 }
 
--
2.50.0
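For readers less used to the XArray code, the point of the change can be shown
with a small, purely illustrative user-space sketch: instead of walking a
structure once to confirm the stored value and a second time to ask for its
order, a single walk can answer both questions. The flat `slots` array, the
`slot` struct and all function names below are invented for this example; they
are not the kernel's XArray or shmem APIs.

#include <stdio.h>

struct slot {
	unsigned long value;	/* stored entry; 0 means empty */
	int order;		/* size class of the stored entry */
};

/* Stand-in for the mapping's radix tree. */
static struct slot slots[16] = {
	[3] = { .value = 0xabcd, .order = 2 },
};

/* Old pattern: a first walk only confirms the value... */
static int confirm(unsigned long index, unsigned long expected)
{
	return slots[index].value == expected;
}

/* ...and a second walk fetches the order. */
static int get_order_of(unsigned long index)
{
	return slots[index].order;
}

/* New pattern: one walk returns the order, or -1 if the value changed. */
static int check_entry(unsigned long index, unsigned long expected)
{
	const struct slot *s = &slots[index];

	return s->value == expected ? s->order : -1;
}

int main(void)
{
	/* Before: two lookups for the same slot. */
	if (confirm(3, 0xabcd))
		printf("old: order %d\n", get_order_of(3));

	/* After: one lookup; a negative result doubles as "entry changed". */
	int order = check_entry(3, 0xabcd);

	if (order >= 0)
		printf("new: order %d\n", order);
	else
		printf("new: entry changed\n");
	return 0;
}

The kernel version does the same thing in shmem_swap_check_entry() above,
using xas_load() and xas_get_order() on one XA_STATE under rcu_read_lock(),
so the tree is only descended once.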
On 18/06/25 12:05 am, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
>
> Currently shmem calls xa_get_order to get the swap radix entry order,
> requiring a full tree walk. This can be easily combined with the swap
> entry value checking (shmem_confirm_swap) to avoid the duplicated
> lookup, which should improve the performance.

Nice spot!

> [...]
> -static bool shmem_confirm_swap(struct address_space *mapping,
> -			       pgoff_t index, swp_entry_t swap)
> +static int shmem_swap_check_entry(struct address_space *mapping, pgoff_t index,
> +				  swp_entry_t swap)

I think the function name shmem_confirm_swap is already good enough? Anyhow the
changed name should at least be shmem_check_entry_is_swap.
On Wed, Jun 18, 2025 at 3:17 PM Dev Jain <dev.jain@arm.com> wrote:
> On 18/06/25 12:05 am, Kairui Song wrote:
> [...]
> I think the function name shmem_confirm_swap is already good enough? Anyhow the
> changed name should at least be shmem_check_entry_is_swap.

Good, I can keep the function name unchanged or follow your suggestion;
I thought a `confirm` function returning a non-binary value might look
strange. I'm terrible at naming things :P
On 18/06/25 12:52 pm, Kairui Song wrote:
> On Wed, Jun 18, 2025 at 3:17 PM Dev Jain <dev.jain@arm.com> wrote:
>> I think the function name shmem_confirm_swap is already good enough? Anyhow the
>> changed name should at least be shmem_check_entry_is_swap.
>
> Good, I can keep the function name unchanged or follow your suggestion;
> I thought a `confirm` function returning a non-binary value might look
> strange. I'm terrible at naming things :P

True. I will vote for keeping the name unchanged; you have already documented
the return value, so it should be fine. Could you just put a blank line between
"Returns the swap entry's order..." and the previous line to make it clearer?
On 6/18/2025 2:35 AM, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
>
> [...]
> @@ -2256,16 +2268,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  		return -EIO;
>
>  	si = get_swap_device(swap);
> -	if (!si) {
> -		if (!shmem_confirm_swap(mapping, index, swap))
> +	order = shmem_swap_check_entry(mapping, index, swap);
> +	if (unlikely(!si)) {
> +		if (order < 0)
>  			return -EEXIST;
>  		else
>  			return -EINVAL;
>  	}
> +	if (unlikely(order < 0)) {
> +		put_swap_device(si);
> +		return -EEXIST;
> +	}

Can we re-arrange the code block as follows:

	order = shmem_swap_check_entry(mapping, index, swap);
	if (unlikely(order < 0))
		return -EEXIST;

	si = get_swap_device(swap);
	if (!si) {
		return -EINVAL;
	...
On Wed, Jun 18, 2025 at 10:49 AM Kemeng Shi <shikemeng@huaweicloud.com> wrote:
> On 6/18/2025 2:35 AM, Kairui Song wrote:
> [...]
> Can we re-arrange the code block as follows:
>
> 	order = shmem_swap_check_entry(mapping, index, swap);
> 	if (unlikely(order < 0))
> 		return -EEXIST;
>
> 	si = get_swap_device(swap);
> 	if (!si) {
> 		return -EINVAL;
> 	...

Hi, thanks for the suggestion.

This may lead to a slightly higher chance of getting -EINVAL when it
should return -EEXIST, leading to user space errors.

For example, if this CPU gets interrupted after `order =
shmem_swap_check_entry(mapping, index, swap);`, and another CPU
swapoff-ed the device, we then get `si = NULL` here, but the entry is
swapped in already, so it should return -EEXIST, not -EINVAL.

The chance is really low so it's kind of trivial; we could do a `goto
failed` if we see (!si) here, but that would make the logic under
`failed:` more complex. So I'd prefer to keep the original behaviour,
which looks more correct.
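To make the interleaving Kairui describes easier to follow, here is a
pseudocode-style sketch of the problematic ordering; the CPU labels and the
exact point of interruption are illustrative assumptions, not taken from a
trace:

	/* CPU 0: shmem_swapin_folio(), with the suggested rearrangement */
	order = shmem_swap_check_entry(mapping, index, swap);
	/* order >= 0: the swap entry still looks valid */

			/*
			 * CPU 1: swapoff completes here. The entry is swapped
			 * back in (the radix swap entry is replaced by the
			 * folio) and the swap device goes away.
			 */

	/* CPU 0 resumes */
	si = get_swap_device(swap);	/* NULL: device is gone */
	if (!si)
		return -EINVAL;		/* wrong: the data is present again,
					 * so -EEXIST (retry the lookup) is
					 * the correct answer */

With the patch's ordering, the swap device reference is taken first and the
radix entry is checked afterwards, so a raced swapoff is observed as "entry
already brought back" and the function still returns -EEXIST as before.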
On 6/18/2025 11:07 AM, Kairui Song wrote:
> [...]
> The chance is really low so it's kind of trivial; we could do a `goto
> failed` if we see (!si) here, but that would make the logic under
> `failed:` more complex. So I'd prefer to keep the original behaviour,
> which looks more correct.

Right, thanks for the explanation.

Reviewed-by: Kemeng Shi <shikemeng@huaweicloud.com>