[PATCH] mm: attempt to batch free swap entries for zap_pte_range()

There is a newer version of this series
Posted by Barry Song 1 year, 4 months ago
From: Barry Song <v-songbaohua@oppo.com>

Zhiguo reported that swap release could be a serious bottleneck
during process exits[1]. With mTHP, we have the opportunity to
batch free swap entries.

Thanks to the work of Chris and Kairui[2], I was able to achieve
this optimization with minimal code changes by building on their
efforts.

If swap_count is 1, which is likely the case since most anon memory
is private, we can free all contiguous swap slots together.

Ran the below test program for measuring the bandwidth of munmap
using zRAM and 64KiB mTHP:

 #include <sys/mman.h>
 #include <sys/time.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 unsigned long long tv_to_ms(struct timeval tv)
 {
        return tv.tv_sec * 1000 + tv.tv_usec / 1000;
 }

 int main(void)
 {
        struct timeval tv_b, tv_e;
 #define SIZE (1024 * 1024 * 1024)
        void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("fail to get memory");
                exit(-1);
        }

        madvise(p, SIZE, MADV_HUGEPAGE);
        memset(p, 0x11, SIZE); /* write to get mem */

        madvise(p, SIZE, MADV_PAGEOUT);

        gettimeofday(&tv_b, NULL);
        munmap(p, SIZE);
        gettimeofday(&tv_e, NULL);

        printf("munmap bandwidth: %llu bytes/ms\n",
                        SIZE / (tv_to_ms(tv_e) - tv_to_ms(tv_b)));
        return 0;
 }

The result is as below (munmap bandwidth):
                mm-unstable  mm-unstable-with-patch
   round1       21053761      63161283
   round2       21053761      63161283
   round3       21053761      63161283
   round4       20648881      67108864
   round5       20648881      67108864

munmap bandwidth becomes 3X faster.

[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
[2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/

Cc: Kairui Song <kasong@tencent.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 mm/swapfile.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index ea023fc25d08..ed872a186e81 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
 	return true;
 }
 
+static bool swap_is_last_map(struct swap_info_struct *si,
+			      unsigned long offset, int nr_pages,
+			      bool *has_cache)
+{
+	unsigned char *map = si->swap_map + offset;
+	unsigned char *map_end = map + nr_pages;
+	bool cached = false;
+
+	do {
+		if ((*map & ~SWAP_HAS_CACHE) != 1)
+			return false;
+		if (*map & SWAP_HAS_CACHE)
+			cached = true;
+	} while (++map < map_end);
+
+	*has_cache = cached;
+	return true;
+}
+
 /*
  * returns number of pages in the folio that backs the swap entry. If positive,
  * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
@@ -1469,6 +1488,39 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
 	return usage;
 }
 
+static bool try_batch_swap_entries_free(struct swap_info_struct *p,
+		swp_entry_t entry, int nr, bool *any_only_cache)
+{
+	unsigned long offset = swp_offset(entry);
+	struct swap_cluster_info *ci;
+	bool has_cache = false;
+	bool can_batch;
+	int i;
+
+	/* cross into another cluster */
+	if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+		return false;
+	ci = lock_cluster_or_swap_info(p, offset);
+	can_batch = swap_is_last_map(p, offset, nr, &has_cache);
+	if (can_batch) {
+		for (i = 0; i < nr; i++)
+			WRITE_ONCE(p->swap_map[offset + i], SWAP_HAS_CACHE);
+	}
+	unlock_cluster_or_swap_info(p, ci);
+
+	/* all swap_maps have count==1 and have no swapcache */
+	if (!can_batch)
+		goto out;
+	if (!has_cache) {
+		spin_lock(&p->lock);
+		swap_entry_range_free(p, entry, nr);
+		spin_unlock(&p->lock);
+	}
+	*any_only_cache = has_cache;
+out:
+	return can_batch;
+}
+
 /*
  * Drop the last HAS_CACHE flag of swap entries, caller have to
  * ensure all entries belong to the same cgroup.
@@ -1797,6 +1849,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 	bool any_only_cache = false;
 	unsigned long offset;
 	unsigned char count;
+	bool batched;
 
 	if (non_swap_entry(entry))
 		return;
@@ -1808,6 +1861,13 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 	if (WARN_ON(end_offset > si->max))
 		goto out;
 
+	if (nr > 1 && swap_count(data_race(si->swap_map[start_offset])) == 1) {
+		batched = try_batch_swap_entries_free(si, entry, nr,
+						&any_only_cache);
+		if (batched)
+			goto reclaim;
+	}
+
 	/*
 	 * First free all entries in the range.
 	 */
@@ -1821,6 +1881,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 		}
 	}
 
+reclaim:
 	/*
 	 * Short-circuit the below loop if none of the entries had their
 	 * reference drop to zero.
-- 
2.34.1
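
Aside, for illustration only (not part of the posted patch): the guard
in try_batch_swap_entries_free() above only attempts batching when the
whole range stays inside one swap cluster; anything that would cross a
cluster boundary falls back to the existing per-entry path. A minimal
user-space sketch of that arithmetic, assuming SWAPFILE_CLUSTER is 512
(its value with CONFIG_THP_SWAP and 4KiB pages; the real value is
configuration dependent) and using a made-up helper name:

 #include <stdbool.h>
 #include <stdio.h>

 /* assumed cluster size in slots; the kernel value is config dependent */
 #define SWAPFILE_CLUSTER 512UL

 /* mirrors the guard: only batch if [offset, offset + nr) does not
  * cross a cluster boundary */
 static bool fits_in_one_cluster(unsigned long offset, int nr)
 {
        return nr <= SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
 }

 int main(void)
 {
        /* 16 slots at offset 500 would cross into the next cluster */
        printf("%d\n", fits_in_one_cluster(500, 16));   /* prints 0 */
        /* 16 slots at offset 480 stay within the first cluster */
        printf("%d\n", fits_in_one_cluster(480, 16));   /* prints 1 */
        return 0;
 }
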
Re: [PATCH] mm: attempt to batch free swap entries for zap_pte_range()
Posted by David Hildenbrand 1 year, 4 months ago
On 06.08.24 03:24, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
> 
> Zhiguo reported that swap release could be a serious bottleneck
> during process exits[1]. With mTHP, we have the opportunity to
> batch free swaps.
> Thanks to the work of Chris and Kairui[2], I was able to achieve
> this optimization with minimal code changes by building on their
> efforts.
> If swap_count is 1, which is likely true as most anon memory are
> private, we can free all contiguous swap slots all together.
> 
> Ran the below test program for measuring the bandwidth of munmap
> using zRAM and 64KiB mTHP:
> 
>   #include <sys/mman.h>
>   #include <sys/time.h>
>   #include <stdlib.h>
> 
>   unsigned long long tv_to_ms(struct timeval tv)
>   {
>          return tv.tv_sec * 1000 + tv.tv_usec / 1000;
>   }
> 
>   main()
>   {
>          struct timeval tv_b, tv_e;
>          int i;
>   #define SIZE 1024*1024*1024
>          void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
>                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>          if (!p) {
>                  perror("fail to get memory");
>                  exit(-1);
>          }
> 
>          madvise(p, SIZE, MADV_HUGEPAGE);
>          memset(p, 0x11, SIZE); /* write to get mem */
> 
>          madvise(p, SIZE, MADV_PAGEOUT);
> 
>          gettimeofday(&tv_b, NULL);
>          munmap(p, SIZE);
>          gettimeofday(&tv_e, NULL);
> 
>          printf("munmap in bandwidth: %ld bytes/ms\n",
>                          SIZE/(tv_to_ms(tv_e) - tv_to_ms(tv_b)));
>   }
> 
> The result is as below (munmap bandwidth):
>                  mm-unstable  mm-unstable-with-patch
>     round1       21053761      63161283
>     round2       21053761      63161283
>     round3       21053761      63161283
>     round4       20648881      67108864
>     round5       20648881      67108864
> 
> munmap bandwidth becomes 3X faster.
> 
> [1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
> [2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
> 
> Cc: Kairui Song <kasong@tencent.com>
> Cc: Chris Li <chrisl@kernel.org>
> Cc: "Huang, Ying" <ying.huang@intel.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Kalesh Singh <kaleshsingh@google.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: David Hildenbrand <david@redhat.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>   mm/swapfile.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 61 insertions(+)
> 
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index ea023fc25d08..ed872a186e81 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
>   	return true;
>   }
>   
> +static bool swap_is_last_map(struct swap_info_struct *si,
> +			      unsigned long offset, int nr_pages,
> +			      bool *has_cache)
> +{
> +	unsigned char *map = si->swap_map + offset;
> +	unsigned char *map_end = map + nr_pages;
> +	bool cached = false;
> +
> +	do {
> +		if ((*map & ~SWAP_HAS_CACHE) != 1)
> +			return false;
> +		if (*map & SWAP_HAS_CACHE)
> +			cached = true;
> +	} while (++map < map_end);
> +
> +	*has_cache = cached;
> +	return true;
> +}
> +
>   /*
>    * returns number of pages in the folio that backs the swap entry. If positive,
>    * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
> @@ -1469,6 +1488,39 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
>   	return usage;
>   }
>   
> +static bool try_batch_swap_entries_free(struct swap_info_struct *p,

Why call it "p" here and not "si" like in the other code you are touching?

> +		swp_entry_t entry, int nr, bool *any_only_cache)
> +{
> +	unsigned long offset = swp_offset(entry);
> +	struct swap_cluster_info *ci;
> +	bool has_cache = false;
> +	bool can_batch;
> +	int i;
> +
> +	/* cross into another cluster */
> +	if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
> +		return false;
> +	ci = lock_cluster_or_swap_info(p, offset);
> +	can_batch = swap_is_last_map(p, offset, nr, &has_cache);
> +	if (can_batch) {
> +		for (i = 0; i < nr; i++)
> +			WRITE_ONCE(p->swap_map[offset + i], SWAP_HAS_CACHE);
> +	}
> +	unlock_cluster_or_swap_info(p, ci);
> +
> +	/* all swap_maps have count==1 and have no swapcache */
> +	if (!can_batch)
> +		goto out;
> +	if (!has_cache) {
> +		spin_lock(&p->lock);
> +		swap_entry_range_free(p, entry, nr);
> +		spin_unlock(&p->lock);
> +	}
> +	*any_only_cache = has_cache;
> +out:
> +	return can_batch;
> +}
> +
>   /*
>    * Drop the last HAS_CACHE flag of swap entries, caller have to
>    * ensure all entries belong to the same cgroup.
> @@ -1797,6 +1849,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
>   	bool any_only_cache = false;
>   	unsigned long offset;
>   	unsigned char count;
> +	bool batched;
>   
>   	if (non_swap_entry(entry))
>   		return;
> @@ -1808,6 +1861,13 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
>   	if (WARN_ON(end_offset > si->max))
>   		goto out;
>   
> +	if (nr > 1 && swap_count(data_race(si->swap_map[start_offset]) == 1)) {
> +		batched = try_batch_swap_entries_free(si, entry, nr,
> +						&any_only_cache);
> +		if (batched)
> +			goto reclaim;
> +	}
> +

I'm wondering if we could find a way to clean this up to achieve something like this here:


if (WARN_ON(end_offset > si->max))
	goto out;

/*
  * First free all entries in the range.
  */
any_only_cache = __free_swap_entries(si, entry, nr);

/*
  * Short-circuit the below loop if none of the entries had their
  * reference drop to zero.
  */
if (!any_only_cache)
	goto out;




Whereby move the fallback loop in that new function

static bool __free_swap_entries(struct swap_info_struct *si,
		swp_entry_t entry, int nr)
{
	const unsigned long start_offset = swp_offset(entry);
	const unsigned long end_offset = start_offset + nr;
	bool any_only_cache = false;

	if (nr > 1 && swap_count(data_race(si->swap_map[start_offset])) == 1) {
		[... what try_batch_swap_entries_free() would do ...]
	}

fallback:
	for (offset = start_offset; offset < end_offset; offset++) {
		if (data_race(si->swap_map[offset])) {
			[... what the fallback code would do ...]
		}
	}
	return any_only_cache;
}
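
For illustration only (not part of this mail or of the patch): a rough,
untested sketch of how such a combined helper might read, assuming
try_batch_swap_entries_free() keeps the shape from the patch above and
that the fallback loop matches the existing body of
free_swap_and_cache_nr():

static bool __free_swap_entries(struct swap_info_struct *si,
		swp_entry_t entry, int nr)
{
	const unsigned long start_offset = swp_offset(entry);
	const unsigned long end_offset = start_offset + nr;
	bool any_only_cache = false;
	unsigned long offset;
	unsigned char count;

	/* fast path: drop a fully private, contiguous range in one go */
	if (nr > 1 && swap_count(data_race(si->swap_map[start_offset])) == 1 &&
	    try_batch_swap_entries_free(si, entry, nr, &any_only_cache))
		return any_only_cache;

	/* fallback: free the entries one by one */
	for (offset = start_offset; offset < end_offset; offset++) {
		if (data_race(si->swap_map[offset])) {
			count = __swap_entry_free(si,
						  swp_entry(si->type, offset));
			if (count == SWAP_HAS_CACHE)
				any_only_cache = true;
		}
	}

	return any_only_cache;
}

The caller in free_swap_and_cache_nr() would then shrink to the two
steps sketched above.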


>   	/*
>   	 * First free all entries in the range.
>   	 */
> @@ -1821,6 +1881,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
>   		}
>   	}
>   
> +reclaim:
>   	/*
>   	 * Short-circuit the below loop if none of the entries had their
>   	 * reference drop to zero.

-- 
Cheers,

David / dhildenb
Re: [PATCH] mm: attempt to batch free swap entries for zap_pte_range()
Posted by Barry Song 1 year, 4 months ago
On Tue, Aug 6, 2024 at 8:56 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 06.08.24 03:24, Barry Song wrote:
> > From: Barry Song <v-songbaohua@oppo.com>
> >
> > Zhiguo reported that swap release could be a serious bottleneck
> > during process exits[1]. With mTHP, we have the opportunity to
> > batch free swaps.
> > Thanks to the work of Chris and Kairui[2], I was able to achieve
> > this optimization with minimal code changes by building on their
> > efforts.
> > If swap_count is 1, which is likely true as most anon memory are
> > private, we can free all contiguous swap slots all together.
> >
> > Ran the below test program for measuring the bandwidth of munmap
> > using zRAM and 64KiB mTHP:
> >
> >   #include <sys/mman.h>
> >   #include <sys/time.h>
> >   #include <stdlib.h>
> >
> >   unsigned long long tv_to_ms(struct timeval tv)
> >   {
> >          return tv.tv_sec * 1000 + tv.tv_usec / 1000;
> >   }
> >
> >   main()
> >   {
> >          struct timeval tv_b, tv_e;
> >          int i;
> >   #define SIZE 1024*1024*1024
> >          void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
> >                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> >          if (!p) {
> >                  perror("fail to get memory");
> >                  exit(-1);
> >          }
> >
> >          madvise(p, SIZE, MADV_HUGEPAGE);
> >          memset(p, 0x11, SIZE); /* write to get mem */
> >
> >          madvise(p, SIZE, MADV_PAGEOUT);
> >
> >          gettimeofday(&tv_b, NULL);
> >          munmap(p, SIZE);
> >          gettimeofday(&tv_e, NULL);
> >
> >          printf("munmap in bandwidth: %ld bytes/ms\n",
> >                          SIZE/(tv_to_ms(tv_e) - tv_to_ms(tv_b)));
> >   }
> >
> > The result is as below (munmap bandwidth):
> >                  mm-unstable  mm-unstable-with-patch
> >     round1       21053761      63161283
> >     round2       21053761      63161283
> >     round3       21053761      63161283
> >     round4       20648881      67108864
> >     round5       20648881      67108864
> >
> > munmap bandwidth becomes 3X faster.
> >
> > [1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
> > [2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
> >
> > Cc: Kairui Song <kasong@tencent.com>
> > Cc: Chris Li <chrisl@kernel.org>
> > Cc: "Huang, Ying" <ying.huang@intel.com>
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Kalesh Singh <kaleshsingh@google.com>
> > Cc: Ryan Roberts <ryan.roberts@arm.com>
> > Cc: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >   mm/swapfile.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >   1 file changed, 61 insertions(+)
> >
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index ea023fc25d08..ed872a186e81 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
> >       return true;
> >   }
> >
> > +static bool swap_is_last_map(struct swap_info_struct *si,
> > +                           unsigned long offset, int nr_pages,
> > +                           bool *has_cache)
> > +{
> > +     unsigned char *map = si->swap_map + offset;
> > +     unsigned char *map_end = map + nr_pages;
> > +     bool cached = false;
> > +
> > +     do {
> > +             if ((*map & ~SWAP_HAS_CACHE) != 1)
> > +                     return false;
> > +             if (*map & SWAP_HAS_CACHE)
> > +                     cached = true;
> > +     } while (++map < map_end);
> > +
> > +     *has_cache = cached;
> > +     return true;
> > +}
> > +
> >   /*
> >    * returns number of pages in the folio that backs the swap entry. If positive,
> >    * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
> > @@ -1469,6 +1488,39 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
> >       return usage;
> >   }
> >
> > +static bool try_batch_swap_entries_free(struct swap_info_struct *p,
>
> Why call it "p" here and not "si" like in the other code you are touching?

That is because I found the other _free_ functions all use "p":

static unsigned char __swap_entry_free(struct swap_info_struct *p,
      swp_entry_t entry)
{
...
}

static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
{
...
}

Sure, I can move from "p" to "si".

>
> > +             swp_entry_t entry, int nr, bool *any_only_cache)
> > +{
> > +     unsigned long offset = swp_offset(entry);
> > +     struct swap_cluster_info *ci;
> > +     bool has_cache = false;
> > +     bool can_batch;
> > +     int i;
> > +
> > +     /* cross into another cluster */
> > +     if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
> > +             return false;
> > +     ci = lock_cluster_or_swap_info(p, offset);
> > +     can_batch = swap_is_last_map(p, offset, nr, &has_cache);
> > +     if (can_batch) {
> > +             for (i = 0; i < nr; i++)
> > +                     WRITE_ONCE(p->swap_map[offset + i], SWAP_HAS_CACHE);
> > +     }
> > +     unlock_cluster_or_swap_info(p, ci);
> > +
> > +     /* all swap_maps have count==1 and have no swapcache */
> > +     if (!can_batch)
> > +             goto out;
> > +     if (!has_cache) {
> > +             spin_lock(&p->lock);
> > +             swap_entry_range_free(p, entry, nr);
> > +             spin_unlock(&p->lock);
> > +     }
> > +     *any_only_cache = has_cache;
> > +out:
> > +     return can_batch;
> > +}
> > +
> >   /*
> >    * Drop the last HAS_CACHE flag of swap entries, caller have to
> >    * ensure all entries belong to the same cgroup.
> > @@ -1797,6 +1849,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> >       bool any_only_cache = false;
> >       unsigned long offset;
> >       unsigned char count;
> > +     bool batched;
> >
> >       if (non_swap_entry(entry))
> >               return;
> > @@ -1808,6 +1861,13 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> >       if (WARN_ON(end_offset > si->max))
> >               goto out;
> >
> > +     if (nr > 1 && swap_count(data_race(si->swap_map[start_offset]) == 1)) {
> > +             batched = try_batch_swap_entries_free(si, entry, nr,
> > +                                             &any_only_cache);
> > +             if (batched)
> > +                     goto reclaim;
> > +     }
> > +
>
> I'm wondering if we could find a way to clean this up to achieve here:
>
>
> if (WARN_ON(end_offset > si->max))
>         goto out;
>
> /*
>   * First free all entries in the range.$
>   */
> any_only_cache = __free_swap_entries(si, entry, nr);
>
> /*
>   * Short-circuit the below loop if none of the entries had their
>   * reference drop to zero.
>   */
> if (!any_only_cache)
>         goto out;
>
>
>
>
> Whereby move the fallback loop in that new function
>
> static bool __free_swap_entries(struct swap_info_struct *si,
>                 swp_entry_t entry, int nr)
> {
>         const unsigned long start_offset = swp_offset(entry);
>         const unsigned long end_offset = start_offset + nr;
>         bool any_only_cache = false;
>
>         if (nr > 1 && swap_count(data_race(si->swap_map[start_offset]) == 1)) {
>                 [... what try_batch_swap_entries_free() would do ...]
>         }
>
> fallback:
>         for (offset = start_offset; offset < end_offset; offset++) {
>                 if (data_race(si->swap_map[offset])) {
>                 [... what the fallback code would do ...]
>         }
>         return any_only_cache;
> }
>
>

good suggestion. will do this in v2.

> >       /*
> >        * First free all entries in the range.
> >        */
> > @@ -1821,6 +1881,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> >               }
> >       }
> >
> > +reclaim:
> >       /*
> >        * Short-circuit the below loop if none of the entries had their
> >        * reference drop to zero.
>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
Re: [PATCH] mm: attempt to batch free swap entries for zap_pte_range()
Posted by Andrew Morton 1 year, 4 months ago
On Wed, 7 Aug 2024 04:44:44 +0800 Barry Song <21cnbao@gmail.com> wrote:

> > > +static bool try_batch_swap_entries_free(struct swap_info_struct *p,
> >
> > Why call it "p" here and not "si" like in the other code you are touching?
> 
> that is because I found other _free_ functions are all using "p":

`p' sucks.  "pointer to something".  It's just lazy.  In this context, "si"
has meaning; lots of it.
Re: [PATCH] mm: attempt to batch free swap entries for zap_pte_range()
Posted by Barry Song 1 year, 4 months ago
On Wed, Aug 7, 2024 at 6:41 AM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Wed, 7 Aug 2024 04:44:44 +0800 Barry Song <21cnbao@gmail.com> wrote:
>
> > > > +static bool try_batch_swap_entries_free(struct swap_info_struct *p,
> > >
> > > Why call it "p" here and not "si" like in the other code you are touching?
> >
> > that is because I found other _free_ functions are all using "p":
>
> `p' sucks.  "pointer to something".  It's just lazy.  In this context, "si"
> has meaning; lots of it.

Agreed. I'll also clean up the existing "p" in those _free_ functions
while sending v2.

Thanks
Barry
Re: [PATCH] mm: attempt to batch free swap entries for zap_pte_range()
Posted by Barry Song 1 year, 4 months ago
On Wed, Aug 7, 2024 at 10:23 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, Aug 7, 2024 at 6:41 AM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Wed, 7 Aug 2024 04:44:44 +0800 Barry Song <21cnbao@gmail.com> wrote:
> >
> > > > > +static bool try_batch_swap_entries_free(struct swap_info_struct *p,
> > > >
> > > > Why call it "p" here and not "si" like in the other code you are touching?
> > >
> > > that is because I found other _free_ functions are all using "p":
> >
> > `p' sucks.  "pointer to something".  It's just lazy.  In this context, "si"
> > has meaning; lots of it.
>
> Agreed. I'll also clean up the existing "p" in those _free_ functions
> while sending
> v2.

Well, we have "p" everywhere; this will be a separate patch for the cleanup:

diff --git a/mm/swapfile.c b/mm/swapfile.c
index ea023fc25d08..d130cce3a02d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -532,7 +532,7 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
  * added to free cluster list and its usage counter will be increased by 1.
  * Only used for initialization.
  */
-static void inc_cluster_info_page(struct swap_info_struct *p,
+static void inc_cluster_info_page(struct swap_info_struct *si,
 	struct swap_cluster_info *cluster_info, unsigned long page_nr)
 {
 	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
@@ -553,28 +553,28 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
  * which means no page in the cluster is in use, we can optionally discard
  * the cluster and add it to free cluster list.
  */
-static void dec_cluster_info_page(struct swap_info_struct *p,
+static void dec_cluster_info_page(struct swap_info_struct *si,
 				  struct swap_cluster_info *ci, int nr_pages)
 {
-	if (!p->cluster_info)
+	if (!si->cluster_info)
 		return;
 
 	VM_BUG_ON(ci->count < nr_pages);
 	VM_BUG_ON(cluster_is_free(ci));
-	lockdep_assert_held(&p->lock);
+	lockdep_assert_held(&si->lock);
 	lockdep_assert_held(&ci->lock);
 	ci->count -= nr_pages;
 
 	if (!ci->count) {
-		free_cluster(p, ci);
+		free_cluster(si, ci);
 		return;
 	}
 
 	if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
 		VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
 		if (ci->flags & CLUSTER_FLAG_FRAG)
-			p->frag_cluster_nr[ci->order]--;
-		list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]);
+			si->frag_cluster_nr[ci->order]--;
+		list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]);
 		ci->flags = CLUSTER_FLAG_NONFULL;
 	}
 }
@@ -872,19 +872,19 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 	return found;
 }
 
-static void __del_from_avail_list(struct swap_info_struct *p)
+static void __del_from_avail_list(struct swap_info_struct *si)
 {
 	int nid;
 
-	assert_spin_locked(&p->lock);
+	assert_spin_locked(&si->lock);
 	for_each_node(nid)
-		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+		plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
 }
 
-static void del_from_avail_list(struct swap_info_struct *p)
+static void del_from_avail_list(struct swap_info_struct *si)
 {
 	spin_lock(&swap_avail_lock);
-	__del_from_avail_list(p);
+	__del_from_avail_list(si);
 	spin_unlock(&swap_avail_lock);
 }
 
@@ -905,13 +905,13 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
 	}
 }
 
-static void add_to_avail_list(struct swap_info_struct *p)
+static void add_to_avail_list(struct swap_info_struct *si)
 {
 	int nid;
 
 	spin_lock(&swap_avail_lock);
 	for_each_node(nid)
-		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
+		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
 	spin_unlock(&swap_avail_lock);
 }
 
@@ -1291,22 +1291,22 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 
 static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
 {
-	struct swap_info_struct *p;
+	struct swap_info_struct *si;
 	unsigned long offset;
 
 	if (!entry.val)
 		goto out;
-	p = swp_swap_info(entry);
-	if (!p)
+	si = swp_swap_info(entry);
+	if (!si)
 		goto bad_nofile;
-	if (data_race(!(p->flags & SWP_USED)))
+	if (data_race(!(si->flags & SWP_USED)))
 		goto bad_device;
 	offset = swp_offset(entry);
-	if (offset >= p->max)
+	if (offset >= si->max)
 		goto bad_offset;
-	if (data_race(!p->swap_map[swp_offset(entry)]))
+	if (data_race(!si->swap_map[swp_offset(entry)]))
 		goto bad_free;
-	return p;
+	return si;
 
 bad_free:
 	pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
@@ -1339,14 +1339,14 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
 	return p;
 }
 
-static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
 					      unsigned long offset,
 					      unsigned char usage)
 {
 	unsigned char count;
 	unsigned char has_cache;
 
-	count = p->swap_map[offset];
+	count = si->swap_map[offset];
 
 	has_cache = count & SWAP_HAS_CACHE;
 	count &= ~SWAP_HAS_CACHE;
@@ -1362,7 +1362,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
 		count = 0;
 	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
 		if (count == COUNT_CONTINUED) {
-			if (swap_count_continued(p, offset, count))
+			if (swap_count_continued(si, offset, count))
 				count = SWAP_MAP_MAX | COUNT_CONTINUED;
 			else
 				count = SWAP_MAP_MAX;
@@ -1372,9 +1372,9 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
 
 	usage = count | has_cache;
 	if (usage)
-		WRITE_ONCE(p->swap_map[offset], usage);
+		WRITE_ONCE(si->swap_map[offset], usage);
 	else
-		WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
+		WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE);
 
 	return usage;
 }
@@ -1453,16 +1453,16 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	return NULL;
 }
 
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
+static unsigned char __swap_entry_free(struct swap_info_struct *si,
 				       swp_entry_t entry)
 {
 	struct swap_cluster_info *ci;
 	unsigned long offset = swp_offset(entry);
 	unsigned char usage;
 
-	ci = lock_cluster_or_swap_info(p, offset);
-	usage = __swap_entry_free_locked(p, offset, 1);
-	unlock_cluster_or_swap_info(p, ci);
+	ci = lock_cluster_or_swap_info(si, offset);
+	usage = __swap_entry_free_locked(si, offset, 1);
+	unlock_cluster_or_swap_info(si, ci);
 	if (!usage)
 		free_swap_slot(entry);
 
@@ -1473,27 +1473,27 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
  * Drop the last HAS_CACHE flag of swap entries, caller have to
  * ensure all entries belong to the same cgroup.
  */
-static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry,
+static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
 				  unsigned int nr_pages)
 {
 	unsigned long offset = swp_offset(entry);
-	unsigned char *map = p->swap_map + offset;
+	unsigned char *map = si->swap_map + offset;
 	unsigned char *map_end = map + nr_pages;
 	struct swap_cluster_info *ci;
 
-	ci = lock_cluster(p, offset);
+	ci = lock_cluster(si, offset);
 	do {
 		VM_BUG_ON(*map != SWAP_HAS_CACHE);
 		*map = 0;
 	} while (++map < map_end);
-	dec_cluster_info_page(p, ci, nr_pages);
+	dec_cluster_info_page(si, ci, nr_pages);
 	unlock_cluster(ci);
 
 	mem_cgroup_uncharge_swap(entry, nr_pages);
-	swap_range_free(p, offset, nr_pages);
+	swap_range_free(si, offset, nr_pages);
 }
 
-static void cluster_swap_free_nr(struct swap_info_struct *sis,
+static void cluster_swap_free_nr(struct swap_info_struct *si,
 		unsigned long offset, int nr_pages,
 		unsigned char usage)
 {
@@ -1501,26 +1501,26 @@ static void cluster_swap_free_nr(struct swap_info_struct *sis,
 	DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 };
 	int i, nr;
 
-	ci = lock_cluster_or_swap_info(sis, offset);
+	ci = lock_cluster_or_swap_info(si, offset);
 	while (nr_pages) {
 		nr = min(BITS_PER_LONG, nr_pages);
 		for (i = 0; i < nr; i++) {
-			if (!__swap_entry_free_locked(sis, offset + i, usage))
+			if (!__swap_entry_free_locked(si, offset + i, usage))
 				bitmap_set(to_free, i, 1);
 		}
 		if (!bitmap_empty(to_free, BITS_PER_LONG)) {
-			unlock_cluster_or_swap_info(sis, ci);
+			unlock_cluster_or_swap_info(si, ci);
 			for_each_set_bit(i, to_free, BITS_PER_LONG)
-				free_swap_slot(swp_entry(sis->type, offset + i));
+				free_swap_slot(swp_entry(si->type, offset + i));
 			if (nr == nr_pages)
 				return;
 			bitmap_clear(to_free, 0, BITS_PER_LONG);
-			ci = lock_cluster_or_swap_info(sis, offset);
+			ci = lock_cluster_or_swap_info(si, offset);
 		}
 		offset += nr;
 		nr_pages -= nr;
 	}
-	unlock_cluster_or_swap_info(sis, ci);
+	unlock_cluster_or_swap_info(si, ci);
 }
 
 /*
@@ -1646,28 +1646,28 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
 int swp_swapcount(swp_entry_t entry)
 {
 	int count, tmp_count, n;
-	struct swap_info_struct *p;
+	struct swap_info_struct *si;
 	struct swap_cluster_info *ci;
 	struct page *page;
 	pgoff_t offset;
 	unsigned char *map;
 
-	p = _swap_info_get(entry);
-	if (!p)
+	si = _swap_info_get(entry);
+	if (!si)
 		return 0;
 
 	offset = swp_offset(entry);
 
-	ci = lock_cluster_or_swap_info(p, offset);
+	ci = lock_cluster_or_swap_info(si, offset);
 
-	count = swap_count(p->swap_map[offset]);
+	count = swap_count(si->swap_map[offset]);
 	if (!(count & COUNT_CONTINUED))
 		goto out;
 
 	count &= ~COUNT_CONTINUED;
 	n = SWAP_MAP_MAX + 1;
 
-	page = vmalloc_to_page(p->swap_map + offset);
+	page = vmalloc_to_page(si->swap_map + offset);
 	offset &= ~PAGE_MASK;
 	VM_BUG_ON(page_private(page) != SWP_CONTINUED);
 
@@ -1681,7 +1681,7 @@ int swp_swapcount(swp_entry_t entry)
 		n *= (SWAP_CONT_MAX + 1);
 	} while (tmp_count & COUNT_CONTINUED);
 out:
-	unlock_cluster_or_swap_info(p, ci);
+	unlock_cluster_or_swap_info(si, ci);
 	return count;
 }
 
@@ -2542,52 +2542,52 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 	return generic_swapfile_activate(sis, swap_file, span);
 }
 
-static int swap_node(struct swap_info_struct *p)
+static int swap_node(struct swap_info_struct *si)
 {
 	struct block_device *bdev;
 
-	if (p->bdev)
-		bdev = p->bdev;
+	if (si->bdev)
+		bdev = si->bdev;
 	else
-		bdev = p->swap_file->f_inode->i_sb->s_bdev;
+		bdev = si->swap_file->f_inode->i_sb->s_bdev;
 
 	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
 }
 
-static void setup_swap_info(struct swap_info_struct *p, int prio,
+static void setup_swap_info(struct swap_info_struct *si, int prio,
 			    unsigned char *swap_map,
 			    struct swap_cluster_info *cluster_info)
 {
 	int i;
 
 	if (prio >= 0)
-		p->prio = prio;
+		si->prio = prio;
 	else
-		p->prio = --least_priority;
+		si->prio = --least_priority;
 	/*
 	 * the plist prio is negated because plist ordering is
 	 * low-to-high, while swap ordering is high-to-low
 	 */
-	p->list.prio = -p->prio;
+	si->list.prio = -si->prio;
 	for_each_node(i) {
-		if (p->prio >= 0)
-			p->avail_lists[i].prio = -p->prio;
+		if (si->prio >= 0)
+			si->avail_lists[i].prio = -si->prio;
 		else {
-			if (swap_node(p) == i)
-				p->avail_lists[i].prio = 1;
+			if (swap_node(si) == i)
+				si->avail_lists[i].prio = 1;
 			else
-				p->avail_lists[i].prio = -p->prio;
+				si->avail_lists[i].prio = -si->prio;
 		}
 	}
-	p->swap_map = swap_map;
-	p->cluster_info = cluster_info;
+	si->swap_map = swap_map;
+	si->cluster_info = cluster_info;
 }
 
-static void _enable_swap_info(struct swap_info_struct *p)
+static void _enable_swap_info(struct swap_info_struct *si)
 {
-	p->flags |= SWP_WRITEOK;
-	atomic_long_add(p->pages, &nr_swap_pages);
-	total_swap_pages += p->pages;
+	si->flags |= SWP_WRITEOK;
+	atomic_long_add(si->pages, &nr_swap_pages);
+	total_swap_pages += si->pages;
 
 	assert_spin_locked(&swap_lock);
 	/*
@@ -2600,40 +2600,40 @@ static void _enable_swap_info(struct swap_info_struct *p)
 	 * which allocates swap pages from the highest available priority
 	 * swap_info_struct.
 	 */
-	plist_add(&p->list, &swap_active_head);
+	plist_add(&si->list, &swap_active_head);
 
 	/* add to available list iff swap device is not full */
-	if (p->highest_bit)
-		add_to_avail_list(p);
+	if (si->highest_bit)
+		add_to_avail_list(si);
 }
 
-static void enable_swap_info(struct swap_info_struct *p, int prio,
+static void enable_swap_info(struct swap_info_struct *si, int prio,
 				unsigned char *swap_map,
 				struct swap_cluster_info *cluster_info)
 {
 	spin_lock(&swap_lock);
-	spin_lock(&p->lock);
-	setup_swap_info(p, prio, swap_map, cluster_info);
-	spin_unlock(&p->lock);
+	spin_lock(&si->lock);
+	setup_swap_info(si, prio, swap_map, cluster_info);
+	spin_unlock(&si->lock);
 	spin_unlock(&swap_lock);
 	/*
 	 * Finished initializing swap device, now it's safe to reference it.
 	 */
-	percpu_ref_resurrect(&p->users);
+	percpu_ref_resurrect(&si->users);
 	spin_lock(&swap_lock);
-	spin_lock(&p->lock);
-	_enable_swap_info(p);
-	spin_unlock(&p->lock);
+	spin_lock(&si->lock);
+	_enable_swap_info(si);
+	spin_unlock(&si->lock);
 	spin_unlock(&swap_lock);
 }
 
-static void reinsert_swap_info(struct swap_info_struct *p)
+static void reinsert_swap_info(struct swap_info_struct *si)
 {
 	spin_lock(&swap_lock);
-	spin_lock(&p->lock);
-	setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
-	_enable_swap_info(p);
-	spin_unlock(&p->lock);
+	spin_lock(&si->lock);
+	setup_swap_info(si, si->prio, si->swap_map, si->cluster_info);
+	_enable_swap_info(si);
+	spin_unlock(&si->lock);
 	spin_unlock(&swap_lock);
 }
 
@@ -3019,20 +3019,20 @@ static struct swap_info_struct *alloc_swap_info(void)
 	return p;
 }
 
-static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
+static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
 {
 	if (S_ISBLK(inode->i_mode)) {
-		p->bdev = I_BDEV(inode);
+		si->bdev = I_BDEV(inode);
 		/*
 		 * Zoned block devices contain zones that have a sequential
 		 * write only restriction.  Hence zoned block devices are not
 		 * suitable for swapping.  Disallow them here.
 		 */
-		if (bdev_is_zoned(p->bdev))
+		if (bdev_is_zoned(si->bdev))
 			return -EINVAL;
-		p->flags |= SWP_BLKDEV;
+		si->flags |= SWP_BLKDEV;
 	} else if (S_ISREG(inode->i_mode)) {
-		p->bdev = inode->i_sb->s_bdev;
+		si->bdev = inode->i_sb->s_bdev;
 	}
 
 	return 0;
@@ -3067,7 +3067,7 @@ __weak unsigned long arch_max_swapfile_size(void)
 	return generic_max_swapfile_size();
 }
 
-static unsigned long read_swap_header(struct swap_info_struct *p,
+static unsigned long read_swap_header(struct swap_info_struct *si,
 					union swap_header *swap_header,
 					struct inode *inode)
 {
@@ -3098,9 +3098,9 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 		return 0;
 	}
 
-	p->lowest_bit  = 1;
-	p->cluster_next = 1;
-	p->cluster_nr = 0;
+	si->lowest_bit  = 1;
+	si->cluster_next = 1;
+	si->cluster_nr = 0;
 
 	maxpages = swapfile_maximum_size;
 	last_page = swap_header->info.last_page;
@@ -3118,7 +3118,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 		if ((unsigned int)maxpages == 0)
 			maxpages = UINT_MAX;
 	}
-	p->highest_bit = maxpages - 1;
+	si->highest_bit = maxpages - 1;
 
 	if (!maxpages)
 		return 0;

>
> Thanks
> Barry