In swap_range_free(), we update inuse_pages then do some cleanups (arch
invalidation, zswap invalidation, swap cache cleanups, etc). During
swapoff, try_to_unuse() uses inuse_pages to make sure all swap entries
are freed. Make sure we only update inuse_pages after we are done with
the cleanups.
In practice, this shouldn't matter, because swap_range_free() is called
with the swap info lock held, and the swapoff code will spin on that
lock after try_to_unuse() anyway.
The goal is to make it obvious and more future proof that once
try_to_unuse() returns, all cleanups are done. This also facilitates a
following zswap cleanup patch which uses this fact to simplify
zswap_swapoff().
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---
mm/swapfile.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f0..2fedb148b9404 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
if (was_full && (si->flags & SWP_WRITEOK))
add_to_avail_list(si);
}
- atomic_long_add(nr_entries, &nr_swap_pages);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -752,6 +750,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
offset++;
}
clear_shadow_from_swap_cache(si->type, begin, end);
+ atomic_long_add(nr_entries, &nr_swap_pages);
+ WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
}
static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
--
2.43.0.429.g432eaa2c6b-goog
Yosry Ahmed <yosryahmed@google.com> writes: > In swap_range_free(), we update inuse_pages then do some cleanups (arch > invalidation, zswap invalidation, swap cache cleanups, etc). During > swapoff, try_to_unuse() uses inuse_pages to make sure all swap entries > are freed. Make sure we only update inuse_pages after we are done with > the cleanups. > > In practice, this shouldn't matter, because swap_range_free() is called > with the swap info lock held, and the swapoff code will spin for that > lock after try_to_unuse() anyway. > > The goal is to make it obvious and more future proof that once > try_to_unuse() returns, all cleanups are done. Defines "all cleanups". Apparently, some other operations are still to be done after try_to_unuse() in swap_off(). > This also facilitates a > following zswap cleanup patch which uses this fact to simplify > zswap_swapoff(). > > Signed-off-by: Yosry Ahmed <yosryahmed@google.com> > --- > mm/swapfile.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/mm/swapfile.c b/mm/swapfile.c > index 556ff7347d5f0..2fedb148b9404 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, > if (was_full && (si->flags & SWP_WRITEOK)) > add_to_avail_list(si); > } > - atomic_long_add(nr_entries, &nr_swap_pages); > - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); > if (si->flags & SWP_BLKDEV) > swap_slot_free_notify = > si->bdev->bd_disk->fops->swap_slot_free_notify; > @@ -752,6 +750,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, > offset++; > } > clear_shadow_from_swap_cache(si->type, begin, end); > + atomic_long_add(nr_entries, &nr_swap_pages); > + WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); This isn't enough. You need to use smp_wmb() here and smp_rmb() in somewhere reading si->inuse_pages. 
> } > > static void set_cluster_next(struct swap_info_struct *si, unsigned long next) -- Best Regards, Huang, Ying
On Tue, Jan 23, 2024 at 1:01 AM Huang, Ying <ying.huang@intel.com> wrote:
>
> Yosry Ahmed <yosryahmed@google.com> writes:
>
> > In swap_range_free(), we update inuse_pages then do some cleanups (arch
> > invalidation, zswap invalidation, swap cache cleanups, etc). During
> > swapoff, try_to_unuse() uses inuse_pages to make sure all swap entries
> > are freed. Make sure we only update inuse_pages after we are done with
> > the cleanups.
> >
> > In practice, this shouldn't matter, because swap_range_free() is called
> > with the swap info lock held, and the swapoff code will spin for that
> > lock after try_to_unuse() anyway.
> >
> > The goal is to make it obvious and more future proof that once
> > try_to_unuse() returns, all cleanups are done.
>
> Defines "all cleanups". Apparently, some other operations are still
> to be done after try_to_unuse() in swap_off().
I am referring to the cleanups in swap_range_free() that I mentioned above.
How about s/all cleanups/all cleanups in swap_range_free()/?
>
> > This also facilitates a
> > following zswap cleanup patch which uses this fact to simplify
> > zswap_swapoff().
> >
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> > ---
> > mm/swapfile.c | 4 ++--
> > 1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 556ff7347d5f0..2fedb148b9404 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> > if (was_full && (si->flags & SWP_WRITEOK))
> > add_to_avail_list(si);
> > }
> > - atomic_long_add(nr_entries, &nr_swap_pages);
> > - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
> > if (si->flags & SWP_BLKDEV)
> > swap_slot_free_notify =
> > si->bdev->bd_disk->fops->swap_slot_free_notify;
> > @@ -752,6 +750,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> > offset++;
> > }
> > clear_shadow_from_swap_cache(si->type, begin, end);
> > + atomic_long_add(nr_entries, &nr_swap_pages);
> > + WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
>
> This isn't enough. You need to use smp_wmb() here and smp_rmb() in
> somewhere reading si->inuse_pages.
Hmm, good point. Although as I mentioned in the commit message, this
shouldn't matter today as swap_range_free() executes with the lock
held, and we spin on the lock after try_to_unuse() returns. It may
still be more future-proof to add the memory barriers.
In swap_range_free(), we want to make sure that the write to
si->inuse_pages happens *after* the cleanups
(specifically zswap_invalidate() in this case).
In swapoff(), we want to make sure that the cleanups following
try_to_unuse() (e.g. zswap_swapoff()) happen *after* reading
si->inuse_pages == 0 in try_to_unuse().
So I think we want smp_wmb() in swap_range_free() and smp_mb() in
try_to_unuse(). Does the below look correct to you?
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2fedb148b9404..a2fa2f65a8ddd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -750,6 +750,12 @@ static void swap_range_free(struct
swap_info_struct *si, unsigned long offset,
offset++;
}
clear_shadow_from_swap_cache(si->type, begin, end);
+
+ /*
+ * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
+ * only after the above cleanups are done.
+ */
+ smp_wmb();
atomic_long_add(nr_entries, &nr_swap_pages);
WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
}
@@ -2130,6 +2136,11 @@ static int try_to_unuse(unsigned int type)
return -EINTR;
}
+ /*
+ * Make sure that further cleanups after try_to_unuse() returns happen
+ * after swap_range_free() reduces si->inuse_pages to 0.
+ */
+ smp_mb();
return 0;
}
Alternatively, we may just hold the spinlock in try_to_unuse() when we
check si->inuse_pages at the end. This will also ensure that any calls
to swap_range_free() have completed. Let me know what you prefer.
Yosry Ahmed <yosryahmed@google.com> writes: > On Tue, Jan 23, 2024 at 1:01 AM Huang, Ying <ying.huang@intel.com> wrote: >> >> Yosry Ahmed <yosryahmed@google.com> writes: >> >> > In swap_range_free(), we update inuse_pages then do some cleanups (arch >> > invalidation, zswap invalidation, swap cache cleanups, etc). During >> > swapoff, try_to_unuse() uses inuse_pages to make sure all swap entries >> > are freed. Make sure we only update inuse_pages after we are done with >> > the cleanups. >> > >> > In practice, this shouldn't matter, because swap_range_free() is called >> > with the swap info lock held, and the swapoff code will spin for that >> > lock after try_to_unuse() anyway. >> > >> > The goal is to make it obvious and more future proof that once >> > try_to_unuse() returns, all cleanups are done. >> >> Defines "all cleanups". Apparently, some other operations are still >> to be done after try_to_unuse() in swap_off(). > > I am referring to the cleanups in swap_range_free() that I mentioned above. > > How about s/all the cleanups/all the cleanups in swap_range_free()? Sounds good for me. >> >> > This also facilitates a >> > following zswap cleanup patch which uses this fact to simplify >> > zswap_swapoff(). 
>> > >> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com> >> > --- >> > mm/swapfile.c | 4 ++-- >> > 1 file changed, 2 insertions(+), 2 deletions(-) >> > >> > diff --git a/mm/swapfile.c b/mm/swapfile.c >> > index 556ff7347d5f0..2fedb148b9404 100644 >> > --- a/mm/swapfile.c >> > +++ b/mm/swapfile.c >> > @@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, >> > if (was_full && (si->flags & SWP_WRITEOK)) >> > add_to_avail_list(si); >> > } >> > - atomic_long_add(nr_entries, &nr_swap_pages); >> > - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); >> > if (si->flags & SWP_BLKDEV) >> > swap_slot_free_notify = >> > si->bdev->bd_disk->fops->swap_slot_free_notify; >> > @@ -752,6 +750,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, >> > offset++; >> > } >> > clear_shadow_from_swap_cache(si->type, begin, end); >> > + atomic_long_add(nr_entries, &nr_swap_pages); >> > + WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); >> >> This isn't enough. You need to use smp_wmb() here and smp_rmb() in >> somewhere reading si->inuse_pages. > > Hmm, good point. Although as I mentioned in the commit message, this > shouldn't matter today as swap_range_free() executes with the lock > held, and we spin on the lock after try_to_unuse() returns. Yes. IIUC, this patch isn't needed too because we have spinlock already. > It may still be more future-proof to add the memory barriers. Yes. Without memory barriers, moving code doesn't guarantee memory order. > In swap_range_free, we want to make sure that the write to > si->inuse_pages in swap_range_free() happens *after* the cleanups > (specifically zswap_invalidate() in this case). > In swap_off, we want to make sure that the cleanups following > try_to_unuse() (e.g. zswap_swapoff) happen *after* reading > si->inuse_pages == 0 in try_to_unuse(). > > So I think we want smp_wmb() in swap_range_free() and smp_mb() in > try_to_unuse(). 
Does the below look correct to you? > > diff --git a/mm/swapfile.c b/mm/swapfile.c > index 2fedb148b9404..a2fa2f65a8ddd 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -750,6 +750,12 @@ static void swap_range_free(struct > swap_info_struct *si, unsigned long offset, > offset++; > } > clear_shadow_from_swap_cache(si->type, begin, end); > + > + /* > + * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 > + * only after the above cleanups are done. > + */ > + smp_wmb(); > atomic_long_add(nr_entries, &nr_swap_pages); > WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); > } > @@ -2130,6 +2136,11 @@ static int try_to_unuse(unsigned int type) > return -EINTR; > } > > + /* > + * Make sure that further cleanups after try_to_unuse() returns happen > + * after swap_range_free() reduces si->inuse_pages to 0. > + */ > + smp_mb(); > return 0; > } We need to take care of "si->inuse_pages" checking at the beginning of try_to_unuse() too. Otherwise, it looks good to me. > Alternatively, we may just hold the spinlock in try_to_unuse() when we > check si->inuse_pages at the end. This will also ensure that any calls > to swap_range_free() have completed. Let me know what you prefer. Personally, I prefer memory barriers here. -- Best Regards, Huang, Ying
> > In swap_range_free, we want to make sure that the write to > > si->inuse_pages in swap_range_free() happens *after* the cleanups > > (specifically zswap_invalidate() in this case). > > In swap_off, we want to make sure that the cleanups following > > try_to_unuse() (e.g. zswap_swapoff) happen *after* reading > > si->inuse_pages == 0 in try_to_unuse(). > > > > So I think we want smp_wmb() in swap_range_free() and smp_mb() in > > try_to_unuse(). Does the below look correct to you? > > > > diff --git a/mm/swapfile.c b/mm/swapfile.c > > index 2fedb148b9404..a2fa2f65a8ddd 100644 > > --- a/mm/swapfile.c > > +++ b/mm/swapfile.c > > @@ -750,6 +750,12 @@ static void swap_range_free(struct > > swap_info_struct *si, unsigned long offset, > > offset++; > > } > > clear_shadow_from_swap_cache(si->type, begin, end); > > + > > + /* > > + * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 > > + * only after the above cleanups are done. > > + */ > > + smp_wmb(); > > atomic_long_add(nr_entries, &nr_swap_pages); > > WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); > > } > > @@ -2130,6 +2136,11 @@ static int try_to_unuse(unsigned int type) > > return -EINTR; > > } > > > > + /* > > + * Make sure that further cleanups after try_to_unuse() returns happen > > + * after swap_range_free() reduces si->inuse_pages to 0. > > + */ > > + smp_mb(); > > return 0; > > } > > We need to take care of "si->inuse_pages" checking at the beginning of > try_to_unuse() too. Otherwise, it looks good to me. Hmm, why isn't one barrier at the end of the function enough? I think all we need is that before we return from try_to_unuse(), all the cleanups in swap_range_free() are taken care of, which the barrier at the end should be doing. We just want instructions after try_to_unuse() to not get re-ordered before si->inuse_pages is read as 0, right? > > > Alternatively, we may just hold the spinlock in try_to_unuse() when we > > check si->inuse_pages at the end. 
This will also ensure that any calls > > to swap_range_free() have completed. Let me know what you prefer. > > Personally, I prefer memory barriers here. Ack.
Yosry Ahmed <yosryahmed@google.com> writes:
>> > In swap_range_free, we want to make sure that the write to
>> > si->inuse_pages in swap_range_free() happens *after* the cleanups
>> > (specifically zswap_invalidate() in this case).
>> > In swap_off, we want to make sure that the cleanups following
>> > try_to_unuse() (e.g. zswap_swapoff) happen *after* reading
>> > si->inuse_pages == 0 in try_to_unuse().
>> >
>> > So I think we want smp_wmb() in swap_range_free() and smp_mb() in
>> > try_to_unuse(). Does the below look correct to you?
>> >
>> > diff --git a/mm/swapfile.c b/mm/swapfile.c
>> > index 2fedb148b9404..a2fa2f65a8ddd 100644
>> > --- a/mm/swapfile.c
>> > +++ b/mm/swapfile.c
>> > @@ -750,6 +750,12 @@ static void swap_range_free(struct
>> > swap_info_struct *si, unsigned long offset,
>> > offset++;
>> > }
>> > clear_shadow_from_swap_cache(si->type, begin, end);
>> > +
>> > + /*
>> > + * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
>> > + * only after the above cleanups are done.
>> > + */
>> > + smp_wmb();
>> > atomic_long_add(nr_entries, &nr_swap_pages);
>> > WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
>> > }
>> > @@ -2130,6 +2136,11 @@ static int try_to_unuse(unsigned int type)
>> > return -EINTR;
>> > }
>> >
>> > + /*
>> > + * Make sure that further cleanups after try_to_unuse() returns happen
>> > + * after swap_range_free() reduces si->inuse_pages to 0.
>> > + */
>> > + smp_mb();
>> > return 0;
>> > }
>>
>> We need to take care of "si->inuse_pages" checking at the beginning of
>> try_to_unuse() too. Otherwise, it looks good to me.
>
> Hmm, why isn't one barrier at the end of the function enough? I think
> all we need is that before we return from try_to_unuse(), all the
> cleanups in swap_range_free() are taken care of, which the barrier at
> the end should be doing. We just want instructions after
> try_to_unuse() to not get re-ordered before si->inuse_pages is read as
> 0, right?
Because at the beginning of try_to_unuse(), as shown below, the function
returns directly after reading si->inuse_pages, without any memory barrier.
if (!READ_ONCE(si->inuse_pages))
return 0;
--
Best Regards,
Huang, Ying
On Tue, Jan 23, 2024 at 7:29 PM Huang, Ying <ying.huang@intel.com> wrote: > > Yosry Ahmed <yosryahmed@google.com> writes: > > >> > In swap_range_free, we want to make sure that the write to > >> > si->inuse_pages in swap_range_free() happens *after* the cleanups > >> > (specifically zswap_invalidate() in this case). > >> > In swap_off, we want to make sure that the cleanups following > >> > try_to_unuse() (e.g. zswap_swapoff) happen *after* reading > >> > si->inuse_pages == 0 in try_to_unuse(). > >> > > >> > So I think we want smp_wmb() in swap_range_free() and smp_mb() in > >> > try_to_unuse(). Does the below look correct to you? > >> > > >> > diff --git a/mm/swapfile.c b/mm/swapfile.c > >> > index 2fedb148b9404..a2fa2f65a8ddd 100644 > >> > --- a/mm/swapfile.c > >> > +++ b/mm/swapfile.c > >> > @@ -750,6 +750,12 @@ static void swap_range_free(struct > >> > swap_info_struct *si, unsigned long offset, > >> > offset++; > >> > } > >> > clear_shadow_from_swap_cache(si->type, begin, end); > >> > + > >> > + /* > >> > + * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 > >> > + * only after the above cleanups are done. > >> > + */ > >> > + smp_wmb(); > >> > atomic_long_add(nr_entries, &nr_swap_pages); > >> > WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); > >> > } > >> > @@ -2130,6 +2136,11 @@ static int try_to_unuse(unsigned int type) > >> > return -EINTR; > >> > } > >> > > >> > + /* > >> > + * Make sure that further cleanups after try_to_unuse() returns happen > >> > + * after swap_range_free() reduces si->inuse_pages to 0. > >> > + */ > >> > + smp_mb(); > >> > return 0; > >> > } > >> > >> We need to take care of "si->inuse_pages" checking at the beginning of > >> try_to_unuse() too. Otherwise, it looks good to me. > > > > Hmm, why isn't one barrier at the end of the function enough? 
I think > > all we need is that before we return from try_to_unuse(), all the > > cleanups in swap_range_free() are taken care of, which the barrier at > > the end should be doing. We just want instructions after > > try_to_unuse() to not get re-ordered before si->inuse_pages is read as > > 0, right? > > Because at the begin of try_to_unuse() as below, after reading, function > returns directly without any memory barriers. > > if (!READ_ONCE(si->inuse_pages)) > return 0; Right, I missed this one. Let me fix this up and send a v2. Thanks!
> Alternatively, we may just hold the spinlock in try_to_unuse() when we
> check si->inuse_pages at the end. This will also ensure that any calls
> to swap_range_free() have completed. Let me know what you prefer.
To elaborate, I mean replacing this patch and the memory barriers with
the diff below.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2fedb148b9404..9b932ecbd80a8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2046,6 +2046,7 @@ static int try_to_unuse(unsigned int type)
struct swap_info_struct *si = swap_info[type];
struct folio *folio;
swp_entry_t entry;
+ unsigned int inuse;
unsigned int i;
if (!READ_ONCE(si->inuse_pages))
@@ -2123,8 +2124,14 @@ static int try_to_unuse(unsigned int type)
* and even shmem_writepage() could have been preempted after
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
+ *
+ * Read si->inuse_pages with the lock held to make sure that cleanups in
+ * swap_range_free() are completed when we read si->inuse_pages == 0.
*/
- if (READ_ONCE(si->inuse_pages)) {
+ spin_lock(&si->lock);
+ inuse = si->inuse_pages;
+ spin_unlock(&si->lock);
+ if (inuse) {
if (!signal_pending(current))
goto retry;
return -EINTR;
On 2024/1/20 10:40, Yosry Ahmed wrote: > In swap_range_free(), we update inuse_pages then do some cleanups (arch > invalidation, zswap invalidation, swap cache cleanups, etc). During > swapoff, try_to_unuse() uses inuse_pages to make sure all swap entries > are freed. Make sure we only update inuse_pages after we are done with > the cleanups. > > In practice, this shouldn't matter, because swap_range_free() is called > with the swap info lock held, and the swapoff code will spin for that > lock after try_to_unuse() anyway. > > The goal is to make it obvious and more future proof that once > try_to_unuse() returns, all cleanups are done. This also facilitates a > following zswap cleanup patch which uses this fact to simplify > zswap_swapoff(). > > Signed-off-by: Yosry Ahmed <yosryahmed@google.com> Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com> Thanks. > --- > mm/swapfile.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/mm/swapfile.c b/mm/swapfile.c > index 556ff7347d5f0..2fedb148b9404 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, > if (was_full && (si->flags & SWP_WRITEOK)) > add_to_avail_list(si); > } > - atomic_long_add(nr_entries, &nr_swap_pages); > - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); > if (si->flags & SWP_BLKDEV) > swap_slot_free_notify = > si->bdev->bd_disk->fops->swap_slot_free_notify; > @@ -752,6 +750,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, > offset++; > } > clear_shadow_from_swap_cache(si->type, begin, end); > + atomic_long_add(nr_entries, &nr_swap_pages); > + WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); > } > > static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
© 2016 - 2025 Red Hat, Inc.