From: Barry Song <v-songbaohua@oppo.com>
Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache") supports
one entry only, to support large folio swap-in, we need to handle multiple
swap entries.
Cc: Kairui Song <kasong@tencent.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
include/linux/swap.h | 1 +
mm/swap.h | 1 +
mm/swapfile.c | 117 ++++++++++++++++++++++++++-----------------
3 files changed, 72 insertions(+), 47 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b3581c976e5f..2691c739d9a4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -480,6 +480,7 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t);
+extern int swapcache_prepare_nr(swp_entry_t, int nr);
extern void swap_free(swp_entry_t);
extern void swap_nr_free(swp_entry_t entry, int nr_pages);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
diff --git a/mm/swap.h b/mm/swap.h
index fc2f6ade7f80..1cec991efcda 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -42,6 +42,7 @@ void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
+void swapcache_clear_nr(struct swap_info_struct *si, swp_entry_t entry, int nr);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c0c058ee7b69..c8c8b6dbaeda 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3308,7 +3308,7 @@ void si_swapinfo(struct sysinfo *val)
}
/*
- * Verify that a swap entry is valid and increment its swap map count.
+ * Verify that nr swap entries are valid and increment their swap map count.
*
* Returns error code in following case.
* - success -> 0
@@ -3318,66 +3318,73 @@ void si_swapinfo(struct sysinfo *val)
* - swap-cache reference is requested but the entry is not used. -> ENOENT
* - swap-mapped reference requested but needs continued swap count. -> ENOMEM
*/
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+static int __swap_duplicate_nr(swp_entry_t entry, int nr, unsigned char usage)
{
struct swap_info_struct *p;
struct swap_cluster_info *ci;
unsigned long offset;
- unsigned char count;
- unsigned char has_cache;
- int err;
+ unsigned char count[SWAPFILE_CLUSTER];
+ unsigned char has_cache[SWAPFILE_CLUSTER];
+ int err, i;
p = swp_swap_info(entry);
offset = swp_offset(entry);
ci = lock_cluster_or_swap_info(p, offset);
- count = p->swap_map[offset];
-
- /*
- * swapin_readahead() doesn't check if a swap entry is valid, so the
- * swap entry could be SWAP_MAP_BAD. Check here with lock held.
- */
- if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
- err = -ENOENT;
- goto unlock_out;
- }
+ for (i = 0; i < nr; i++) {
+ count[i] = p->swap_map[offset + i];
- has_cache = count & SWAP_HAS_CACHE;
- count &= ~SWAP_HAS_CACHE;
- err = 0;
-
- if (usage == SWAP_HAS_CACHE) {
-
- /* set SWAP_HAS_CACHE if there is no cache and entry is used */
- if (!has_cache && count)
- has_cache = SWAP_HAS_CACHE;
- else if (has_cache) /* someone else added cache */
- err = -EEXIST;
- else /* no users remaining */
+ /*
+ * swapin_readahead() doesn't check if a swap entry is valid, so the
+ * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+ */
+ if (unlikely(swap_count(count[i]) == SWAP_MAP_BAD)) {
err = -ENOENT;
+ goto unlock_out;
+ }
- } else if (count || has_cache) {
-
- if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
- count += usage;
- else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
- err = -EINVAL;
- else if (swap_count_continued(p, offset, count))
- count = COUNT_CONTINUED;
- else
- err = -ENOMEM;
- } else
- err = -ENOENT; /* unused swap entry */
-
- if (!err)
- WRITE_ONCE(p->swap_map[offset], count | has_cache);
+ has_cache[i] = count[i] & SWAP_HAS_CACHE;
+ count[i] &= ~SWAP_HAS_CACHE;
+ err = 0;
+
+ if (usage == SWAP_HAS_CACHE) {
+
+ /* set SWAP_HAS_CACHE if there is no cache and entry is used */
+ if (!has_cache[i] && count[i])
+ has_cache[i] = SWAP_HAS_CACHE;
+ else if (has_cache[i]) /* someone else added cache */
+ err = -EEXIST;
+ else /* no users remaining */
+ err = -ENOENT;
+ } else if (count[i] || has_cache[i]) {
+
+ if ((count[i] & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+ count[i] += usage;
+ else if ((count[i] & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
+ err = -EINVAL;
+ else if (swap_count_continued(p, offset + i, count[i]))
+ count[i] = COUNT_CONTINUED;
+ else
+ err = -ENOMEM;
+ } else
+ err = -ENOENT; /* unused swap entry */
+ }
+ if (!err) {
+ for (i = 0; i < nr; i++)
+ WRITE_ONCE(p->swap_map[offset + i], count[i] | has_cache[i]);
+ }
unlock_out:
unlock_cluster_or_swap_info(p, ci);
return err;
}
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+{
+ return __swap_duplicate_nr(entry, 1, usage);
+}
+
/*
* Help swapoff by noting that swap entry belongs to shmem/tmpfs
* (in which case its reference count is never incremented).
@@ -3416,17 +3423,33 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+int swapcache_prepare_nr(swp_entry_t entry, int nr)
+{
+ return __swap_duplicate_nr(entry, nr, SWAP_HAS_CACHE);
+}
+
+void swapcache_clear_nr(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
- unsigned char usage;
+ unsigned char usage[SWAPFILE_CLUSTER];
+ int i;
ci = lock_cluster_or_swap_info(si, offset);
- usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
+ for (i = 0; i < nr; i++)
+ usage[i] = __swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE);
unlock_cluster_or_swap_info(si, ci);
- if (!usage)
- free_swap_slot(entry);
+ for (i = 0; i < nr; i++) {
+ if (!usage[i]) {
+ free_swap_slot(entry);
+ entry.val++;
+ }
+ }
+}
+
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+ swapcache_clear_nr(si, entry, 1);
}
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
--
2.34.1
On Wed, Feb 28, 2024 at 4:39 PM Barry Song <21cnbao@gmail.com> wrote:
>
> From: Barry Song <v-songbaohua@oppo.com>
>
> Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache") supports
> one entry only, to support large folio swap-in, we need to handle multiple
> swap entries.
>
> Cc: Kairui Song <kasong@tencent.com>
> Cc: "Huang, Ying" <ying.huang@intel.com>
> Cc: Yu Zhao <yuzhao@google.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Chris Li <chrisl@kernel.org>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Yosry Ahmed <yosryahmed@google.com>
> Cc: Yu Zhao <yuzhao@google.com>
> Cc: SeongJae Park <sj@kernel.org>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
> include/linux/swap.h | 1 +
> mm/swap.h | 1 +
> mm/swapfile.c | 117 ++++++++++++++++++++++++++-----------------
> 3 files changed, 72 insertions(+), 47 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index b3581c976e5f..2691c739d9a4 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -480,6 +480,7 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t);
> extern void swap_shmem_alloc(swp_entry_t);
> extern int swap_duplicate(swp_entry_t);
> extern int swapcache_prepare(swp_entry_t);
> +extern int swapcache_prepare_nr(swp_entry_t, int nr);
> extern void swap_free(swp_entry_t);
> extern void swap_nr_free(swp_entry_t entry, int nr_pages);
> extern void swapcache_free_entries(swp_entry_t *entries, int n);
> diff --git a/mm/swap.h b/mm/swap.h
> index fc2f6ade7f80..1cec991efcda 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -42,6 +42,7 @@ void delete_from_swap_cache(struct folio *folio);
> void clear_shadow_from_swap_cache(int type, unsigned long begin,
> unsigned long end);
> void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
> +void swapcache_clear_nr(struct swap_info_struct *si, swp_entry_t entry, int nr);
> struct folio *swap_cache_get_folio(swp_entry_t entry,
> struct vm_area_struct *vma, unsigned long addr);
> struct folio *filemap_get_incore_folio(struct address_space *mapping,
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index c0c058ee7b69..c8c8b6dbaeda 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -3308,7 +3308,7 @@ void si_swapinfo(struct sysinfo *val)
> }
>
> /*
> - * Verify that a swap entry is valid and increment its swap map count.
> + * Verify that nr swap entries are valid and increment their swap map count.
> *
> * Returns error code in following case.
> * - success -> 0
> @@ -3318,66 +3318,73 @@ void si_swapinfo(struct sysinfo *val)
> * - swap-cache reference is requested but the entry is not used. -> ENOENT
> * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
> */
> -static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> +static int __swap_duplicate_nr(swp_entry_t entry, int nr, unsigned char usage)
> {
> struct swap_info_struct *p;
> struct swap_cluster_info *ci;
> unsigned long offset;
> - unsigned char count;
> - unsigned char has_cache;
> - int err;
> + unsigned char count[SWAPFILE_CLUSTER];
> + unsigned char has_cache[SWAPFILE_CLUSTER];
I am not closely following this series, but a couple of things caught my eyes.
Is this reasonable for stack usage?
> + int err, i;
>
> p = swp_swap_info(entry);
>
> offset = swp_offset(entry);
> ci = lock_cluster_or_swap_info(p, offset);
>
> - count = p->swap_map[offset];
> -
> - /*
> - * swapin_readahead() doesn't check if a swap entry is valid, so the
> - * swap entry could be SWAP_MAP_BAD. Check here with lock held.
> - */
> - if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
> - err = -ENOENT;
> - goto unlock_out;
> - }
> + for (i = 0; i < nr; i++) {
> + count[i] = p->swap_map[offset + i];
>
> - has_cache = count & SWAP_HAS_CACHE;
> - count &= ~SWAP_HAS_CACHE;
> - err = 0;
> -
> - if (usage == SWAP_HAS_CACHE) {
> -
> - /* set SWAP_HAS_CACHE if there is no cache and entry is used */
> - if (!has_cache && count)
> - has_cache = SWAP_HAS_CACHE;
> - else if (has_cache) /* someone else added cache */
> - err = -EEXIST;
> - else /* no users remaining */
> + /*
> + * swapin_readahead() doesn't check if a swap entry is valid, so the
> + * swap entry could be SWAP_MAP_BAD. Check here with lock held.
> + */
> + if (unlikely(swap_count(count[i]) == SWAP_MAP_BAD)) {
> err = -ENOENT;
> + goto unlock_out;
> + }
Here we immediately exit if there is an error, but we don't below, we
just keep overwriting the error every iteration as far as I can tell.
Also, it doesn't seem like we do any cleanups if we hit an error
halfway through. Should we undo previously updated swap entries, or am
I missing something here?
>
> - } else if (count || has_cache) {
> -
> - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
> - count += usage;
> - else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
> - err = -EINVAL;
> - else if (swap_count_continued(p, offset, count))
> - count = COUNT_CONTINUED;
> - else
> - err = -ENOMEM;
> - } else
> - err = -ENOENT; /* unused swap entry */
> -
> - if (!err)
> - WRITE_ONCE(p->swap_map[offset], count | has_cache);
> + has_cache[i] = count[i] & SWAP_HAS_CACHE;
> + count[i] &= ~SWAP_HAS_CACHE;
> + err = 0;
> +
> + if (usage == SWAP_HAS_CACHE) {
> +
> + /* set SWAP_HAS_CACHE if there is no cache and entry is used */
> + if (!has_cache[i] && count[i])
> + has_cache[i] = SWAP_HAS_CACHE;
> + else if (has_cache[i]) /* someone else added cache */
> + err = -EEXIST;
> + else /* no users remaining */
> + err = -ENOENT;
> + } else if (count[i] || has_cache[i]) {
> +
> + if ((count[i] & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
> + count[i] += usage;
> + else if ((count[i] & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
> + err = -EINVAL;
> + else if (swap_count_continued(p, offset + i, count[i]))
> + count[i] = COUNT_CONTINUED;
> + else
> + err = -ENOMEM;
> + } else
> + err = -ENOENT; /* unused swap entry */
> + }
>
> + if (!err) {
> + for (i = 0; i < nr; i++)
> + WRITE_ONCE(p->swap_map[offset + i], count[i] | has_cache[i]);
> + }
> unlock_out:
> unlock_cluster_or_swap_info(p, ci);
> return err;
> }
On Thu, Feb 29, 2024 at 1:52 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Wed, Feb 28, 2024 at 4:39 PM Barry Song <21cnbao@gmail.com> wrote:
> >
> > From: Barry Song <v-songbaohua@oppo.com>
> >
> > Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache") supports
> > one entry only, to support large folio swap-in, we need to handle multiple
> > swap entries.
> >
> > Cc: Kairui Song <kasong@tencent.com>
> > Cc: "Huang, Ying" <ying.huang@intel.com>
> > Cc: Yu Zhao <yuzhao@google.com>
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: Chris Li <chrisl@kernel.org>
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
> > Cc: Michal Hocko <mhocko@suse.com>
> > Cc: Minchan Kim <minchan@kernel.org>
> > Cc: Yosry Ahmed <yosryahmed@google.com>
> > Cc: Yu Zhao <yuzhao@google.com>
> > Cc: SeongJae Park <sj@kernel.org>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> > include/linux/swap.h | 1 +
> > mm/swap.h | 1 +
> > mm/swapfile.c | 117 ++++++++++++++++++++++++++-----------------
> > 3 files changed, 72 insertions(+), 47 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index b3581c976e5f..2691c739d9a4 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -480,6 +480,7 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t);
> > extern void swap_shmem_alloc(swp_entry_t);
> > extern int swap_duplicate(swp_entry_t);
> > extern int swapcache_prepare(swp_entry_t);
> > +extern int swapcache_prepare_nr(swp_entry_t, int nr);
> > extern void swap_free(swp_entry_t);
> > extern void swap_nr_free(swp_entry_t entry, int nr_pages);
> > extern void swapcache_free_entries(swp_entry_t *entries, int n);
> > diff --git a/mm/swap.h b/mm/swap.h
> > index fc2f6ade7f80..1cec991efcda 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -42,6 +42,7 @@ void delete_from_swap_cache(struct folio *folio);
> > void clear_shadow_from_swap_cache(int type, unsigned long begin,
> > unsigned long end);
> > void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
> > +void swapcache_clear_nr(struct swap_info_struct *si, swp_entry_t entry, int nr);
> > struct folio *swap_cache_get_folio(swp_entry_t entry,
> > struct vm_area_struct *vma, unsigned long addr);
> > struct folio *filemap_get_incore_folio(struct address_space *mapping,
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index c0c058ee7b69..c8c8b6dbaeda 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -3308,7 +3308,7 @@ void si_swapinfo(struct sysinfo *val)
> > }
> >
> > /*
> > - * Verify that a swap entry is valid and increment its swap map count.
> > + * Verify that nr swap entries are valid and increment their swap map count.
> > *
> > * Returns error code in following case.
> > * - success -> 0
> > @@ -3318,66 +3318,73 @@ void si_swapinfo(struct sysinfo *val)
> > * - swap-cache reference is requested but the entry is not used. -> ENOENT
> > * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
> > */
> > -static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> > +static int __swap_duplicate_nr(swp_entry_t entry, int nr, unsigned char usage)
> > {
> > struct swap_info_struct *p;
> > struct swap_cluster_info *ci;
> > unsigned long offset;
> > - unsigned char count;
> > - unsigned char has_cache;
> > - int err;
> > + unsigned char count[SWAPFILE_CLUSTER];
> > + unsigned char has_cache[SWAPFILE_CLUSTER];
>
Hi Yosry,
Thanks for reviewing!
> I am not closely following this series, but a couple of things caught my eyes.
>
> Is this reasonable for stack usage?
SWAPFILE_CLUSTER is not huge. typically 512 or 256.
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER HPAGE_PMD_NR
#define swap_entry_size(size) (size)
#else
#define SWAPFILE_CLUSTER 256
If this is still a concern, I can move it to a bitmap.
>
> > + int err, i;
> >
> > p = swp_swap_info(entry);
> >
> > offset = swp_offset(entry);
> > ci = lock_cluster_or_swap_info(p, offset);
> >
> > - count = p->swap_map[offset];
> > -
> > - /*
> > - * swapin_readahead() doesn't check if a swap entry is valid, so the
> > - * swap entry could be SWAP_MAP_BAD. Check here with lock held.
> > - */
> > - if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
> > - err = -ENOENT;
> > - goto unlock_out;
> > - }
> > + for (i = 0; i < nr; i++) {
> > + count[i] = p->swap_map[offset + i];
> >
> > - has_cache = count & SWAP_HAS_CACHE;
> > - count &= ~SWAP_HAS_CACHE;
> > - err = 0;
> > -
> > - if (usage == SWAP_HAS_CACHE) {
> > -
> > - /* set SWAP_HAS_CACHE if there is no cache and entry is used */
> > - if (!has_cache && count)
> > - has_cache = SWAP_HAS_CACHE;
> > - else if (has_cache) /* someone else added cache */
> > - err = -EEXIST;
> > - else /* no users remaining */
> > + /*
> > + * swapin_readahead() doesn't check if a swap entry is valid, so the
> > + * swap entry could be SWAP_MAP_BAD. Check here with lock held.
> > + */
> > + if (unlikely(swap_count(count[i]) == SWAP_MAP_BAD)) {
> > err = -ENOENT;
> > + goto unlock_out;
> > + }
>
> Here we immediately exit if there is an error, but we don't below, we
> just keep overwriting the error every iteration as far as I can tell.
> Also, it doesn't seem like we do any cleanups if we hit an error
> halfway through. Should we undo previously updated swap entries, or am
> I missing something here?
we are safely immediately exiting because we don't change swap_map
till we finish all checks. while all checks are done, we write them by
WRITE_ONCE(p->swap_map[offset + i], count[i] | has_cache[i]);
at the end.
>
> >
> > - } else if (count || has_cache) {
> > -
> > - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
> > - count += usage;
> > - else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
> > - err = -EINVAL;
> > - else if (swap_count_continued(p, offset, count))
> > - count = COUNT_CONTINUED;
> > - else
> > - err = -ENOMEM;
> > - } else
> > - err = -ENOENT; /* unused swap entry */
> > -
> > - if (!err)
> > - WRITE_ONCE(p->swap_map[offset], count | has_cache);
> > + has_cache[i] = count[i] & SWAP_HAS_CACHE;
> > + count[i] &= ~SWAP_HAS_CACHE;
> > + err = 0;
> > +
> > + if (usage == SWAP_HAS_CACHE) {
> > +
> > + /* set SWAP_HAS_CACHE if there is no cache and entry is used */
> > + if (!has_cache[i] && count[i])
> > + has_cache[i] = SWAP_HAS_CACHE;
> > + else if (has_cache[i]) /* someone else added cache */
> > + err = -EEXIST;
> > + else /* no users remaining */
> > + err = -ENOENT;
> > + } else if (count[i] || has_cache[i]) {
> > +
> > + if ((count[i] & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
> > + count[i] += usage;
> > + else if ((count[i] & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
> > + err = -EINVAL;
> > + else if (swap_count_continued(p, offset + i, count[i]))
> > + count[i] = COUNT_CONTINUED;
> > + else
> > + err = -ENOMEM;
> > + } else
> > + err = -ENOENT; /* unused swap entry */
> > + }
> >
> > + if (!err) {
> > + for (i = 0; i < nr; i++)
> > + WRITE_ONCE(p->swap_map[offset + i], count[i] | has_cache[i]);
Here is the place where we really write data. Before that, we only
touched temp variables.
> > + }
> > unlock_out:
> > unlock_cluster_or_swap_info(p, ci);
> > return err;
> > }
thanks
Barry
On Wed, Feb 28, 2024 at 4:59 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Thu, Feb 29, 2024 at 1:52 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Wed, Feb 28, 2024 at 4:39 PM Barry Song <21cnbao@gmail.com> wrote:
> > >
> > > From: Barry Song <v-songbaohua@oppo.com>
> > >
> > > Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache") supports
> > > one entry only, to support large folio swap-in, we need to handle multiple
> > > swap entries.
> > >
> > > Cc: Kairui Song <kasong@tencent.com>
> > > Cc: "Huang, Ying" <ying.huang@intel.com>
> > > Cc: Yu Zhao <yuzhao@google.com>
> > > Cc: David Hildenbrand <david@redhat.com>
> > > Cc: Chris Li <chrisl@kernel.org>
> > > Cc: Hugh Dickins <hughd@google.com>
> > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > Cc: Michal Hocko <mhocko@suse.com>
> > > Cc: Minchan Kim <minchan@kernel.org>
> > > Cc: Yosry Ahmed <yosryahmed@google.com>
> > > Cc: Yu Zhao <yuzhao@google.com>
> > > Cc: SeongJae Park <sj@kernel.org>
> > > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > > ---
> > > include/linux/swap.h | 1 +
> > > mm/swap.h | 1 +
> > > mm/swapfile.c | 117 ++++++++++++++++++++++++++-----------------
> > > 3 files changed, 72 insertions(+), 47 deletions(-)
> > >
> > > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > > index b3581c976e5f..2691c739d9a4 100644
> > > --- a/include/linux/swap.h
> > > +++ b/include/linux/swap.h
> > > @@ -480,6 +480,7 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t);
> > > extern void swap_shmem_alloc(swp_entry_t);
> > > extern int swap_duplicate(swp_entry_t);
> > > extern int swapcache_prepare(swp_entry_t);
> > > +extern int swapcache_prepare_nr(swp_entry_t, int nr);
> > > extern void swap_free(swp_entry_t);
> > > extern void swap_nr_free(swp_entry_t entry, int nr_pages);
> > > extern void swapcache_free_entries(swp_entry_t *entries, int n);
> > > diff --git a/mm/swap.h b/mm/swap.h
> > > index fc2f6ade7f80..1cec991efcda 100644
> > > --- a/mm/swap.h
> > > +++ b/mm/swap.h
> > > @@ -42,6 +42,7 @@ void delete_from_swap_cache(struct folio *folio);
> > > void clear_shadow_from_swap_cache(int type, unsigned long begin,
> > > unsigned long end);
> > > void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
> > > +void swapcache_clear_nr(struct swap_info_struct *si, swp_entry_t entry, int nr);
> > > struct folio *swap_cache_get_folio(swp_entry_t entry,
> > > struct vm_area_struct *vma, unsigned long addr);
> > > struct folio *filemap_get_incore_folio(struct address_space *mapping,
> > > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > > index c0c058ee7b69..c8c8b6dbaeda 100644
> > > --- a/mm/swapfile.c
> > > +++ b/mm/swapfile.c
> > > @@ -3308,7 +3308,7 @@ void si_swapinfo(struct sysinfo *val)
> > > }
> > >
> > > /*
> > > - * Verify that a swap entry is valid and increment its swap map count.
> > > + * Verify that nr swap entries are valid and increment their swap map count.
> > > *
> > > * Returns error code in following case.
> > > * - success -> 0
> > > @@ -3318,66 +3318,73 @@ void si_swapinfo(struct sysinfo *val)
> > > * - swap-cache reference is requested but the entry is not used. -> ENOENT
> > > * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
> > > */
> > > -static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> > > +static int __swap_duplicate_nr(swp_entry_t entry, int nr, unsigned char usage)
> > > {
> > > struct swap_info_struct *p;
> > > struct swap_cluster_info *ci;
> > > unsigned long offset;
> > > - unsigned char count;
> > > - unsigned char has_cache;
> > > - int err;
> > > + unsigned char count[SWAPFILE_CLUSTER];
> > > + unsigned char has_cache[SWAPFILE_CLUSTER];
> >
>
> Hi Yosry,
>
> Thanks for reviewing!
>
> > I am not closely following this series, but a couple of things caught my eyes.
> >
> > Is this reasonable for stack usage?
>
> SWAPFILE_CLUSTER is not huge. typically 512 or 256.
So that's 1K of stack usage out of 16K total on x86. I think this may
be a lot for a single function to use, but perhaps others will
disagree.
>
> #ifdef CONFIG_THP_SWAP
> #define SWAPFILE_CLUSTER HPAGE_PMD_NR
>
> #define swap_entry_size(size) (size)
> #else
> #define SWAPFILE_CLUSTER 256
>
> If this is still a concern, I can move it to a bitmap.
>
> >
> > > + int err, i;
> > >
> > > p = swp_swap_info(entry);
> > >
> > > offset = swp_offset(entry);
> > > ci = lock_cluster_or_swap_info(p, offset);
> > >
> > > - count = p->swap_map[offset];
> > > -
> > > - /*
> > > - * swapin_readahead() doesn't check if a swap entry is valid, so the
> > > - * swap entry could be SWAP_MAP_BAD. Check here with lock held.
> > > - */
> > > - if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
> > > - err = -ENOENT;
> > > - goto unlock_out;
> > > - }
> > > + for (i = 0; i < nr; i++) {
> > > + count[i] = p->swap_map[offset + i];
> > >
> > > - has_cache = count & SWAP_HAS_CACHE;
> > > - count &= ~SWAP_HAS_CACHE;
> > > - err = 0;
> > > -
> > > - if (usage == SWAP_HAS_CACHE) {
> > > -
> > > - /* set SWAP_HAS_CACHE if there is no cache and entry is used */
> > > - if (!has_cache && count)
> > > - has_cache = SWAP_HAS_CACHE;
> > > - else if (has_cache) /* someone else added cache */
> > > - err = -EEXIST;
> > > - else /* no users remaining */
> > > + /*
> > > + * swapin_readahead() doesn't check if a swap entry is valid, so the
> > > + * swap entry could be SWAP_MAP_BAD. Check here with lock held.
> > > + */
> > > + if (unlikely(swap_count(count[i]) == SWAP_MAP_BAD)) {
> > > err = -ENOENT;
> > > + goto unlock_out;
> > > + }
> >
>
>
> > Here we immediately exit if there is an error, but we don't below, we
> > just keep overwriting the error every iteration as far as I can tell.
> > Also, it doesn't seem like we do any cleanups if we hit an error
> > halfway through. Should we undo previously updated swap entries, or am
> > I missing something here?
>
> we are safely immediately exiting because we don't change swap_map
> till we finish all checks. while all checks are done, we write them by
> WRITE_ONCE(p->swap_map[offset + i], count[i] | has_cache[i]);
> at the end.
I see, but I think we may be overwriting the error from each iteration below?
>
> >
> > >
> > > - } else if (count || has_cache) {
> > > -
> > > - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
> > > - count += usage;
> > > - else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
> > > - err = -EINVAL;
> > > - else if (swap_count_continued(p, offset, count))
> > > - count = COUNT_CONTINUED;
> > > - else
> > > - err = -ENOMEM;
> > > - } else
> > > - err = -ENOENT; /* unused swap entry */
> > > -
> > > - if (!err)
> > > - WRITE_ONCE(p->swap_map[offset], count | has_cache);
> > > + has_cache[i] = count[i] & SWAP_HAS_CACHE;
> > > + count[i] &= ~SWAP_HAS_CACHE;
> > > + err = 0;
> > > +
> > > + if (usage == SWAP_HAS_CACHE) {
> > > +
> > > + /* set SWAP_HAS_CACHE if there is no cache and entry is used */
> > > + if (!has_cache[i] && count[i])
> > > + has_cache[i] = SWAP_HAS_CACHE;
> > > + else if (has_cache[i]) /* someone else added cache */
> > > + err = -EEXIST;
> > > + else /* no users remaining */
> > > + err = -ENOENT;
> > > + } else if (count[i] || has_cache[i]) {
> > > +
> > > + if ((count[i] & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
> > > + count[i] += usage;
> > > + else if ((count[i] & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
> > > + err = -EINVAL;
> > > + else if (swap_count_continued(p, offset + i, count[i]))
> > > + count[i] = COUNT_CONTINUED;
> > > + else
> > > + err = -ENOMEM;
> > > + } else
> > > + err = -ENOENT; /* unused swap entry */
> > > + }
> > >
> > > + if (!err) {
> > > + for (i = 0; i < nr; i++)
> > > + WRITE_ONCE(p->swap_map[offset + i], count[i] | has_cache[i]);
>
> Here is the place where we really write data. Before that, we only
> touched temp variables.
>
> > > + }
> > > unlock_out:
> > > unlock_cluster_or_swap_info(p, ci);
> > > return err;
> > > }
>
> thanks
> Barry
On Thu, Feb 29, 2024 at 2:12 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Wed, Feb 28, 2024 at 4:59 PM Barry Song <21cnbao@gmail.com> wrote:
> >
> > On Thu, Feb 29, 2024 at 1:52 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > On Wed, Feb 28, 2024 at 4:39 PM Barry Song <21cnbao@gmail.com> wrote:
> > > >
> > > > From: Barry Song <v-songbaohua@oppo.com>
> > > >
> > > > Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache") supports
> > > > one entry only, to support large folio swap-in, we need to handle multiple
> > > > swap entries.
> > > >
> > > > Cc: Kairui Song <kasong@tencent.com>
> > > > Cc: "Huang, Ying" <ying.huang@intel.com>
> > > > Cc: Yu Zhao <yuzhao@google.com>
> > > > Cc: David Hildenbrand <david@redhat.com>
> > > > Cc: Chris Li <chrisl@kernel.org>
> > > > Cc: Hugh Dickins <hughd@google.com>
> > > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > > Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > > Cc: Michal Hocko <mhocko@suse.com>
> > > > Cc: Minchan Kim <minchan@kernel.org>
> > > > Cc: Yosry Ahmed <yosryahmed@google.com>
> > > > Cc: Yu Zhao <yuzhao@google.com>
> > > > Cc: SeongJae Park <sj@kernel.org>
> > > > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > > > ---
> > > > include/linux/swap.h | 1 +
> > > > mm/swap.h | 1 +
> > > > mm/swapfile.c | 117 ++++++++++++++++++++++++++-----------------
> > > > 3 files changed, 72 insertions(+), 47 deletions(-)
> > > >
> > > > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > > > index b3581c976e5f..2691c739d9a4 100644
> > > > --- a/include/linux/swap.h
> > > > +++ b/include/linux/swap.h
> > > > @@ -480,6 +480,7 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t);
> > > > extern void swap_shmem_alloc(swp_entry_t);
> > > > extern int swap_duplicate(swp_entry_t);
> > > > extern int swapcache_prepare(swp_entry_t);
> > > > +extern int swapcache_prepare_nr(swp_entry_t, int nr);
> > > > extern void swap_free(swp_entry_t);
> > > > extern void swap_nr_free(swp_entry_t entry, int nr_pages);
> > > > extern void swapcache_free_entries(swp_entry_t *entries, int n);
> > > > diff --git a/mm/swap.h b/mm/swap.h
> > > > index fc2f6ade7f80..1cec991efcda 100644
> > > > --- a/mm/swap.h
> > > > +++ b/mm/swap.h
> > > > @@ -42,6 +42,7 @@ void delete_from_swap_cache(struct folio *folio);
> > > > void clear_shadow_from_swap_cache(int type, unsigned long begin,
> > > > unsigned long end);
> > > > void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
> > > > +void swapcache_clear_nr(struct swap_info_struct *si, swp_entry_t entry, int nr);
> > > > struct folio *swap_cache_get_folio(swp_entry_t entry,
> > > > struct vm_area_struct *vma, unsigned long addr);
> > > > struct folio *filemap_get_incore_folio(struct address_space *mapping,
> > > > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > > > index c0c058ee7b69..c8c8b6dbaeda 100644
> > > > --- a/mm/swapfile.c
> > > > +++ b/mm/swapfile.c
> > > > @@ -3308,7 +3308,7 @@ void si_swapinfo(struct sysinfo *val)
> > > > }
> > > >
> > > > /*
> > > > - * Verify that a swap entry is valid and increment its swap map count.
> > > > + * Verify that nr swap entries are valid and increment their swap map count.
> > > > *
> > > > * Returns error code in following case.
> > > > * - success -> 0
> > > > @@ -3318,66 +3318,73 @@ void si_swapinfo(struct sysinfo *val)
> > > > * - swap-cache reference is requested but the entry is not used. -> ENOENT
> > > > * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
> > > > */
> > > > -static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> > > > +static int __swap_duplicate_nr(swp_entry_t entry, int nr, unsigned char usage)
> > > > {
> > > > struct swap_info_struct *p;
> > > > struct swap_cluster_info *ci;
> > > > unsigned long offset;
> > > > - unsigned char count;
> > > > - unsigned char has_cache;
> > > > - int err;
> > > > + unsigned char count[SWAPFILE_CLUSTER];
> > > > + unsigned char has_cache[SWAPFILE_CLUSTER];
> > >
> >
> > Hi Yosry,
> >
> > Thanks for reviewing!
> >
> > > I am not closely following this series, but a couple of things caught my eyes.
> > >
> > > Is this reasonable for stack usage?
> >
> > SWAPFILE_CLUSTER is not huge. typically 512 or 256.
>
> So that's 1K of stack usage out of 16K total on x86. I think this may
> be a lot for a single function to use, but perhaps others will
> disagree.
>
> >
> > #ifdef CONFIG_THP_SWAP
> > #define SWAPFILE_CLUSTER HPAGE_PMD_NR
> >
> > #define swap_entry_size(size) (size)
> > #else
> > #define SWAPFILE_CLUSTER 256
> >
> > If this is still a concern, I can move it to a bitmap.
> >
> > >
> > > > + int err, i;
> > > >
> > > > p = swp_swap_info(entry);
> > > >
> > > > offset = swp_offset(entry);
> > > > ci = lock_cluster_or_swap_info(p, offset);
> > > >
> > > > - count = p->swap_map[offset];
> > > > -
> > > > - /*
> > > > - * swapin_readahead() doesn't check if a swap entry is valid, so the
> > > > - * swap entry could be SWAP_MAP_BAD. Check here with lock held.
> > > > - */
> > > > - if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
> > > > - err = -ENOENT;
> > > > - goto unlock_out;
> > > > - }
> > > > + for (i = 0; i < nr; i++) {
> > > > + count[i] = p->swap_map[offset + i];
> > > >
> > > > - has_cache = count & SWAP_HAS_CACHE;
> > > > - count &= ~SWAP_HAS_CACHE;
> > > > - err = 0;
> > > > -
> > > > - if (usage == SWAP_HAS_CACHE) {
> > > > -
> > > > - /* set SWAP_HAS_CACHE if there is no cache and entry is used */
> > > > - if (!has_cache && count)
> > > > - has_cache = SWAP_HAS_CACHE;
> > > > - else if (has_cache) /* someone else added cache */
> > > > - err = -EEXIST;
> > > > - else /* no users remaining */
> > > > + /*
> > > > + * swapin_readahead() doesn't check if a swap entry is valid, so the
> > > > + * swap entry could be SWAP_MAP_BAD. Check here with lock held.
> > > > + */
> > > > + if (unlikely(swap_count(count[i]) == SWAP_MAP_BAD)) {
> > > > err = -ENOENT;
> > > > + goto unlock_out;
> > > > + }
> > >
> >
> >
> > > Here we immediately exit if there is an error, but we don't below, we
> > > just keep overwriting the error every iteration as far as I can tell.
> > > Also, it doesn't seem like we do any cleanups if we hit an error
> > > halfway through. Should we undo previously updated swap entries, or am
> > > I missing something here?
> >
> > we are safely immediately exiting because we don't change swap_map
> > till we finish all checks. while all checks are done, we write them by
> > WRITE_ONCE(p->swap_map[offset + i], count[i] | has_cache[i]);
> > at the end.
>
> I see, but I think we may be overwriting the error from each iteration below?
you are right, Yosry, i used to have the below to break. don't know
when I accidentally
dropped it :-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c8c8b6dbaeda..091966056aec 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3369,6 +3369,9 @@ static int __swap_duplicate_nr(swp_entry_t
entry, int nr, unsigned char usage)
err = -ENOMEM;
} else
err = -ENOENT; /* unused swap entry */
+
+ if (err)
+ break;
}
if (!err) {
>
> >
> > >
> > > >
> > > > - } else if (count || has_cache) {
> > > > -
> > > > - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
> > > > - count += usage;
> > > > - else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
> > > > - err = -EINVAL;
> > > > - else if (swap_count_continued(p, offset, count))
> > > > - count = COUNT_CONTINUED;
> > > > - else
> > > > - err = -ENOMEM;
> > > > - } else
> > > > - err = -ENOENT; /* unused swap entry */
> > > > -
> > > > - if (!err)
> > > > - WRITE_ONCE(p->swap_map[offset], count | has_cache);
> > > > + has_cache[i] = count[i] & SWAP_HAS_CACHE;
> > > > + count[i] &= ~SWAP_HAS_CACHE;
> > > > + err = 0;
> > > > +
> > > > + if (usage == SWAP_HAS_CACHE) {
> > > > +
> > > > + /* set SWAP_HAS_CACHE if there is no cache and entry is used */
> > > > + if (!has_cache[i] && count[i])
> > > > + has_cache[i] = SWAP_HAS_CACHE;
> > > > + else if (has_cache[i]) /* someone else added cache */
> > > > + err = -EEXIST;
> > > > + else /* no users remaining */
> > > > + err = -ENOENT;
> > > > + } else if (count[i] || has_cache[i]) {
> > > > +
> > > > + if ((count[i] & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
> > > > + count[i] += usage;
> > > > + else if ((count[i] & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
> > > > + err = -EINVAL;
> > > > + else if (swap_count_continued(p, offset + i, count[i]))
> > > > + count[i] = COUNT_CONTINUED;
> > > > + else
> > > > + err = -ENOMEM;
> > > > + } else
> > > > + err = -ENOENT; /* unused swap entry */
> > > > + }
> > > >
> > > > + if (!err) {
> > > > + for (i = 0; i < nr; i++)
> > > > + WRITE_ONCE(p->swap_map[offset + i], count[i] | has_cache[i]);
> >
> > Here is the place where we really write data. Before that, we only
> > touched temp variables.
> >
> > > > + }
> > > > unlock_out:
> > > > unlock_cluster_or_swap_info(p, ci);
> > > > return err;
> > > > }
> >
thanks
Barry
© 2016 - 2026 Red Hat, Inc.