[PATCH v2 2/2] mm: attempt to batch free swap entries for zap_pte_range()
Posted by Barry Song 1 year, 6 months ago
From: Barry Song <v-songbaohua@oppo.com>

Zhiguo reported that swap release could be a serious bottleneck
during process exits[1]. With mTHP, we have the opportunity to
batch free swaps.
Thanks to the work of Chris and Kairui[2], I was able to achieve
this optimization with minimal code changes by building on their
efforts.
If swap_count is 1, which is likely the case since most anon memory
is private, we can free all contiguous swap slots at once.
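
For illustration only (not part of the patch): with 4KiB base pages, a
swapped-out 64KiB mTHP occupies 16 contiguous swap slots, so a single
batched free can replace 16 per-entry operations. A minimal userspace
sketch of that arithmetic:

 #include <stdio.h>

 int main(void)
 {
        unsigned long mthp_size = 64 << 10;     /* 64KiB mTHP */
        unsigned long base_page = 4 << 10;      /* assumed 4KiB base pages */
        unsigned long nr = mthp_size / base_page;

        /*
         * When all nr contiguous slots have swap_count == 1 (private
         * anon memory), they can be released by one batched call
         * instead of nr per-entry frees.
         */
        printf("64KiB mTHP -> %lu contiguous swap slots\n", nr);
        return 0;
 }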

Ran the below test program for measuring the bandwidth of munmap
using zRAM and 64KiB mTHP:

 #include <sys/mman.h>
 #include <sys/time.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 unsigned long long tv_to_ms(struct timeval tv)
 {
        return tv.tv_sec * 1000 + tv.tv_usec / 1000;
 }

 int main(void)
 {
        struct timeval tv_b, tv_e;
 #define SIZE (1024 * 1024 * 1024)
        void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("fail to get memory");
                exit(-1);
        }

        madvise(p, SIZE, MADV_HUGEPAGE);
        memset(p, 0x11, SIZE); /* write to get mem */

        madvise(p, SIZE, MADV_PAGEOUT);

        gettimeofday(&tv_b, NULL);
        munmap(p, SIZE);
        gettimeofday(&tv_e, NULL);

        printf("munmap bandwidth: %llu bytes/ms\n",
                        (unsigned long long)SIZE / (tv_to_ms(tv_e) - tv_to_ms(tv_b)));
        return 0;
 }

The results are below (munmap bandwidth, in bytes/ms):
                mm-unstable  mm-unstable-with-patch
   round1       21053761      63161283
   round2       21053761      63161283
   round3       21053761      63161283
   round4       20648881      67108864
   round5       20648881      67108864

munmap bandwidth becomes 3X faster.

[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
[2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/

Cc: Kairui Song <kasong@tencent.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 mm/swapfile.c | 78 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 67 insertions(+), 11 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 35cb58373493..25c3f98fa8d5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
 	return true;
 }
 
+static bool swap_is_last_map(struct swap_info_struct *si,
+			      unsigned long offset, int nr_pages,
+			      bool *has_cache)
+{
+	unsigned char *map = si->swap_map + offset;
+	unsigned char *map_end = map + nr_pages;
+	bool cached = false;
+
+	do {
+		if ((*map & ~SWAP_HAS_CACHE) != 1)
+			return false;
+		if (*map & SWAP_HAS_CACHE)
+			cached = true;
+	} while (++map < map_end);
+
+	*has_cache = cached;
+	return true;
+}
+
 /*
  * returns number of pages in the folio that backs the swap entry. If positive,
  * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
@@ -1469,6 +1488,53 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si,
 	return usage;
 }
 
+static bool __swap_entries_free(struct swap_info_struct *si,
+				swp_entry_t entry, int nr)
+{
+	unsigned long offset = swp_offset(entry);
+	unsigned int type = swp_type(entry);
+	struct swap_cluster_info *ci;
+	bool has_cache = false;
+	unsigned char count;
+	bool can_batch;
+	int i;
+
+	if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
+		goto fallback;
+	/* cross into another cluster */
+	if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+		goto fallback;
+
+	ci = lock_cluster_or_swap_info(si, offset);
+	can_batch = swap_is_last_map(si, offset, nr, &has_cache);
+	if (can_batch) {
+		for (i = 0; i < nr; i++)
+			WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
+	}
+	unlock_cluster_or_swap_info(si, ci);
+
+	if (!can_batch)
+		goto fallback;
+	if (!has_cache) {
+		spin_lock(&si->lock);
+		swap_entry_range_free(si, entry, nr);
+		spin_unlock(&si->lock);
+	}
+	return has_cache;
+
+fallback:
+	for (i = 0; i  < nr; i++) {
+		if (data_race(si->swap_map[offset + i])) {
+			count = __swap_entry_free(si, swp_entry(type, offset + i));
+			if (count == SWAP_HAS_CACHE)
+				has_cache = true;
+		} else {
+			WARN_ON_ONCE(1);
+		}
+	}
+	return has_cache;
+}
+
 /*
  * Drop the last HAS_CACHE flag of swap entries, caller have to
  * ensure all entries belong to the same cgroup.
@@ -1792,11 +1858,9 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 {
 	const unsigned long start_offset = swp_offset(entry);
 	const unsigned long end_offset = start_offset + nr;
-	unsigned int type = swp_type(entry);
 	struct swap_info_struct *si;
 	bool any_only_cache = false;
 	unsigned long offset;
-	unsigned char count;
 
 	if (non_swap_entry(entry))
 		return;
@@ -1811,15 +1875,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 	/*
 	 * First free all entries in the range.
 	 */
-	for (offset = start_offset; offset < end_offset; offset++) {
-		if (data_race(si->swap_map[offset])) {
-			count = __swap_entry_free(si, swp_entry(type, offset));
-			if (count == SWAP_HAS_CACHE)
-				any_only_cache = true;
-		} else {
-			WARN_ON_ONCE(1);
-		}
-	}
+	any_only_cache = __swap_entries_free(si, entry, nr);
 
 	/*
 	 * Short-circuit the below loop if none of the entries had their
-- 
2.34.1
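
For reference, the fast-path cluster guard above can be modeled in
userspace (assuming SWAPFILE_CLUSTER is 512, i.e. HPAGE_PMD_NR with
4KiB base pages and CONFIG_THP_SWAP): the batch path takes a single
cluster lock, so a range that would cross a cluster boundary falls
back to per-entry freeing.

#include <stdbool.h>
#include <stdio.h>

#define SWAPFILE_CLUSTER 512    /* assumed: HPAGE_PMD_NR with 4KiB pages */

/* true if [offset, offset + nr) stays within one swap cluster */
static bool fits_in_cluster(unsigned long offset, int nr)
{
        return nr <= SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
}

int main(void)
{
        /* a 64KiB mTHP (16 slots) starting mid-cluster: batched */
        printf("offset 100, nr 16: %d\n", fits_in_cluster(100, 16));
        /* the same 16 slots starting 4 slots before the cluster end: fallback */
        printf("offset 508, nr 16: %d\n", fits_in_cluster(508, 16));
        return 0;
}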
Re: [PATCH v2 2/2] mm: attempt to batch free swap entries for zap_pte_range()
Posted by Kairui Song 1 year, 6 months ago
On Wed, Aug 7, 2024 at 4:25 PM Barry Song <21cnbao@gmail.com> wrote:
>
> From: Barry Song <v-songbaohua@oppo.com>
>
> Zhiguo reported that swap release could be a serious bottleneck
> during process exits[1]. With mTHP, we have the opportunity to
> batch free swaps.
> Thanks to the work of Chris and Kairui[2], I was able to achieve
> this optimization with minimal code changes by building on their
> efforts.
> If swap_count is 1, which is likely true as most anon memory are
> private, we can free all contiguous swap slots all together.
>
> Ran the below test program for measuring the bandwidth of munmap
> using zRAM and 64KiB mTHP:
>
>  #include <sys/mman.h>
>  #include <sys/time.h>
>  #include <stdlib.h>
>
>  unsigned long long tv_to_ms(struct timeval tv)
>  {
>         return tv.tv_sec * 1000 + tv.tv_usec / 1000;
>  }
>
>  main()
>  {
>         struct timeval tv_b, tv_e;
>         int i;
>  #define SIZE 1024*1024*1024
>         void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
>                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>         if (!p) {
>                 perror("fail to get memory");
>                 exit(-1);
>         }
>
>         madvise(p, SIZE, MADV_HUGEPAGE);
>         memset(p, 0x11, SIZE); /* write to get mem */
>
>         madvise(p, SIZE, MADV_PAGEOUT);
>
>         gettimeofday(&tv_b, NULL);
>         munmap(p, SIZE);
>         gettimeofday(&tv_e, NULL);
>
>         printf("munmap in bandwidth: %ld bytes/ms\n",
>                         SIZE/(tv_to_ms(tv_e) - tv_to_ms(tv_b)));
>  }
>
> The result is as below (munmap bandwidth):
>                 mm-unstable  mm-unstable-with-patch
>    round1       21053761      63161283
>    round2       21053761      63161283
>    round3       21053761      63161283
>    round4       20648881      67108864
>    round5       20648881      67108864
>
> munmap bandwidth becomes 3X faster.

Hi Barry,

Thanks for the patch. I also noticed this could be optimized while
working on the batch freeing of mTHP pages in the series you
mentioned; it is a very nice improvement.

>
> [1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
> [2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
>
> Cc: Kairui Song <kasong@tencent.com>
> Cc: Chris Li <chrisl@kernel.org>
> Cc: "Huang, Ying" <ying.huang@intel.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Kalesh Singh <kaleshsingh@google.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: David Hildenbrand <david@redhat.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  mm/swapfile.c | 78 +++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 67 insertions(+), 11 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 35cb58373493..25c3f98fa8d5 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
>         return true;
>  }
>
> +static bool swap_is_last_map(struct swap_info_struct *si,
> +                             unsigned long offset, int nr_pages,
> +                             bool *has_cache)
> +{
> +       unsigned char *map = si->swap_map + offset;
> +       unsigned char *map_end = map + nr_pages;
> +       bool cached = false;
> +
> +       do {
> +               if ((*map & ~SWAP_HAS_CACHE) != 1)
> +                       return false;

I haven't tried this yet, but looking at this check: if an mTHP or THP
was split, and some of the slots are "1" while the rest are "HAS_CACHE
| 1", this will also return true. Is this a problem?

These slots with "1" don't have an entry in the swap cache, so the
following reclaim in free_swap_and_cache_nr might not work as
expected. Could they be stuck in the HAS_CACHE state?
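
A minimal userspace sketch of the state described above, with
SWAP_HAS_CACHE hardcoded to the kernel's value (0x40) and the v2 check
rewritten over a plain array instead of si->swap_map, shows the mixed
map is indeed accepted:

#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE 0x40     /* kernel value, hardcoded for the sketch */

/* the v2 check, over a plain array instead of si->swap_map */
static bool v2_is_last_map(const unsigned char *map, int nr_pages, bool *has_cache)
{
        bool cached = false;
        int i;

        for (i = 0; i < nr_pages; i++) {
                if ((map[i] & ~SWAP_HAS_CACHE) != 1)
                        return false;
                if (map[i] & SWAP_HAS_CACHE)
                        cached = true;
        }
        *has_cache = cached;
        return true;
}

int main(void)
{
        /* split mTHP: two slots only mapped (count 1), two still cached */
        unsigned char map[4] = { 1, 1, 1 | SWAP_HAS_CACHE, 1 | SWAP_HAS_CACHE };
        bool has_cache;

        if (v2_is_last_map(map, 4, &has_cache))
                printf("batch accepted, has_cache=%d\n", has_cache);
        /*
         * Prints "batch accepted, has_cache=1": the plain-count slots get
         * rewritten to SWAP_HAS_CACHE by the batch path, yet they have no
         * swap cache folio that would ever free them.
         */
        return 0;
}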
Re: [PATCH v2 2/2] mm: attempt to batch free swap entries for zap_pte_range()
Posted by Barry Song 1 year, 6 months ago
On Thu, Aug 8, 2024 at 4:16 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Wed, Aug 7, 2024 at 4:25 PM Barry Song <21cnbao@gmail.com> wrote:
> >
> > From: Barry Song <v-songbaohua@oppo.com>
> >
> > Zhiguo reported that swap release could be a serious bottleneck
> > during process exits[1]. With mTHP, we have the opportunity to
> > batch free swaps.
> > Thanks to the work of Chris and Kairui[2], I was able to achieve
> > this optimization with minimal code changes by building on their
> > efforts.
> > If swap_count is 1, which is likely true as most anon memory are
> > private, we can free all contiguous swap slots all together.
> >
> > Ran the below test program for measuring the bandwidth of munmap
> > using zRAM and 64KiB mTHP:
> >
> >  #include <sys/mman.h>
> >  #include <sys/time.h>
> >  #include <stdlib.h>
> >
> >  unsigned long long tv_to_ms(struct timeval tv)
> >  {
> >         return tv.tv_sec * 1000 + tv.tv_usec / 1000;
> >  }
> >
> >  main()
> >  {
> >         struct timeval tv_b, tv_e;
> >         int i;
> >  #define SIZE 1024*1024*1024
> >         void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
> >                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> >         if (!p) {
> >                 perror("fail to get memory");
> >                 exit(-1);
> >         }
> >
> >         madvise(p, SIZE, MADV_HUGEPAGE);
> >         memset(p, 0x11, SIZE); /* write to get mem */
> >
> >         madvise(p, SIZE, MADV_PAGEOUT);
> >
> >         gettimeofday(&tv_b, NULL);
> >         munmap(p, SIZE);
> >         gettimeofday(&tv_e, NULL);
> >
> >         printf("munmap in bandwidth: %ld bytes/ms\n",
> >                         SIZE/(tv_to_ms(tv_e) - tv_to_ms(tv_b)));
> >  }
> >
> > The result is as below (munmap bandwidth):
> >                 mm-unstable  mm-unstable-with-patch
> >    round1       21053761      63161283
> >    round2       21053761      63161283
> >    round3       21053761      63161283
> >    round4       20648881      67108864
> >    round5       20648881      67108864
> >
> > munmap bandwidth becomes 3X faster.
>
> Hi Barry,
>
> Thanks for the patch, I also noticed this could be optimized when
> working on the batch freeing of mthp pages in the series you
> mentioned, a very nice improvement.
>
> >
> > [1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
> > [2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
> >
> > Cc: Kairui Song <kasong@tencent.com>
> > Cc: Chris Li <chrisl@kernel.org>
> > Cc: "Huang, Ying" <ying.huang@intel.com>
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Kalesh Singh <kaleshsingh@google.com>
> > Cc: Ryan Roberts <ryan.roberts@arm.com>
> > Cc: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >  mm/swapfile.c | 78 +++++++++++++++++++++++++++++++++++++++++++--------
> >  1 file changed, 67 insertions(+), 11 deletions(-)
> >
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 35cb58373493..25c3f98fa8d5 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
> >         return true;
> >  }
> >
> > +static bool swap_is_last_map(struct swap_info_struct *si,
> > +                             unsigned long offset, int nr_pages,
> > +                             bool *has_cache)
> > +{
> > +       unsigned char *map = si->swap_map + offset;
> > +       unsigned char *map_end = map + nr_pages;
> > +       bool cached = false;
> > +
> > +       do {
> > +               if ((*map & ~SWAP_HAS_CACHE) != 1)
> > +                       return false;
>
> I haven't tried this yet, but looking at this if. If a mthp or thp was
> split, and part of the slots are "1", rest of the slots are "HAS_CACHE
> | 1", this will also return true, is this a problem?
>
> These slots with "1" don't have an entry in the swap cache, so the
> following reclaim in free_swap_and_cache_nr might not work as
> expected, could they be stuck in HAS_CACHE state?

Good catch, thanks Kairui! In this case I am leaking swap slots whose
count == 1.
The original per-entry code would still free those slots, but my
batched code just rewrites them to SWAP_HAS_CACHE.
This can be fixed by:

static bool swap_is_last_map(struct swap_info_struct *si,
                unsigned long offset, int nr_pages, bool *has_cache)
{
        unsigned char *map = si->swap_map + offset;
        unsigned char *map_end = map + nr_pages;
        unsigned char count = *map;

        if (swap_count(count) != 1)
                return false;

        do {
                if (*map != count)
                        return false;
        } while (++map < map_end);

        *has_cache = !!(count & SWAP_HAS_CACHE);
        return true;
}
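
A quick userspace check of this revised predicate against the mixed
state above (SWAP_HAS_CACHE again assumed to be 0x40, and the check
rewritten over a plain array) confirms it now falls back, while a
uniform run of count-1 slots is still batched:

#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE 0x40             /* assumed kernel value */
#define swap_count(ent) ((ent) & ~SWAP_HAS_CACHE)

/* revised check over a plain array: every byte must match the first */
static bool fixed_is_last_map(const unsigned char *map, int nr_pages, bool *has_cache)
{
        unsigned char count = map[0];
        int i;

        if (swap_count(count) != 1)
                return false;
        for (i = 0; i < nr_pages; i++)
                if (map[i] != count)
                        return false;
        *has_cache = !!(count & SWAP_HAS_CACHE);
        return true;
}

int main(void)
{
        unsigned char mixed[4] = { 1, 1, 1 | SWAP_HAS_CACHE, 1 | SWAP_HAS_CACHE };
        unsigned char uniform[4] = { 1, 1, 1, 1 };
        bool has_cache;

        printf("mixed:   %d\n", fixed_is_last_map(mixed, 4, &has_cache));   /* 0: fallback */
        printf("uniform: %d\n", fixed_is_last_map(uniform, 4, &has_cache)); /* 1: batch */
        return 0;
}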

Thanks
Barry
Re: [PATCH v2 2/2] mm: attempt to batch free swap entries for zap_pte_range()
Posted by David Hildenbrand 1 year, 6 months ago
>   mm/swapfile.c | 78 +++++++++++++++++++++++++++++++++++++++++++--------
>   1 file changed, 67 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 35cb58373493..25c3f98fa8d5 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
>   	return true;
>   }
>   
> +static bool swap_is_last_map(struct swap_info_struct *si,
> +			      unsigned long offset, int nr_pages,
> +			      bool *has_cache)

Please use double tabs for indenting parameters on the 2nd line in
new/changed code:

		unsigned long offset, int nr_pages, bool *has_cache)

This results in less churn when renaming functions, and we can
frequently avoid some lines.

> +{
> +	unsigned char *map = si->swap_map + offset;
> +	unsigned char *map_end = map + nr_pages;
> +	bool cached = false;
> +
> +	do {
> +		if ((*map & ~SWAP_HAS_CACHE) != 1)
> +			return false;
> +		if (*map & SWAP_HAS_CACHE)
> +			cached = true;
> +	} while (++map < map_end);
> +
> +	*has_cache = cached;
> +	return true;
> +}
> +
>   /*
>    * returns number of pages in the folio that backs the swap entry. If positive,
>    * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
> @@ -1469,6 +1488,53 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si,
>   	return usage;
>   }
>   
> +static bool __swap_entries_free(struct swap_info_struct *si,
> +				swp_entry_t entry, int nr)

Ditto.

> +{
> +	unsigned long offset = swp_offset(entry);
> +	unsigned int type = swp_type(entry);
> +	struct swap_cluster_info *ci;
> +	bool has_cache = false;
> +	unsigned char count;
> +	bool can_batch;
> +	int i;
> +
> +	if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
> +		goto fallback;
> +	/* cross into another cluster */
> +	if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
> +		goto fallback;
> +
> +	ci = lock_cluster_or_swap_info(si, offset);
> +	can_batch = swap_is_last_map(si, offset, nr, &has_cache);
> +	if (can_batch) {
> +		for (i = 0; i < nr; i++)
> +			WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
> +	}
> +	unlock_cluster_or_swap_info(si, ci);
> +
> +	if (!can_batch)
> +		goto fallback;

I'd avoid "can_batch" and just do:

ci = lock_cluster_or_swap_info(si, offset);
if (!swap_is_last_map(si, offset, nr, &has_cache)) {
	unlock_cluster_or_swap_info(si, ci);
	goto fallback;
}
for (i = 0; i < nr; i++)
	WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
unlock_cluster_or_swap_info(si, ci);

> +	if (!has_cache) {
> +		spin_lock(&si->lock);

I'm no expert on that code, but we might drop the cluster lock, then
take the swap_info lock, and then retake the cluster lock. I assume
there are no races we are worrying about here, right?

> +		swap_entry_range_free(si, entry, nr);
> +		spin_unlock(&si->lock);
> +	}
> +	return has_cache;
> +
> +fallback:
> +	for (i = 0; i  < nr; i++) {

One space too many before the "<".

> +		if (data_race(si->swap_map[offset + i])) {
> +			count = __swap_entry_free(si, swp_entry(type, offset + i));
> +			if (count == SWAP_HAS_CACHE)
> +				has_cache = true;
> +		} else {
> +			WARN_ON_ONCE(1);
> +		}
> +	}
> +	return has_cache;
> +}
> +

-- 
Cheers,

David / dhildenb
Re: [PATCH v2 2/2] mm: attempt to batch free swap entries for zap_pte_range()
Posted by Barry Song 1 year, 6 months ago
On Wed, Aug 7, 2024 at 9:29 PM David Hildenbrand <david@redhat.com> wrote:
>
> >   mm/swapfile.c | 78 +++++++++++++++++++++++++++++++++++++++++++--------
> >   1 file changed, 67 insertions(+), 11 deletions(-)
> >
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 35cb58373493..25c3f98fa8d5 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
> >       return true;
> >   }
> >
> > +static bool swap_is_last_map(struct swap_info_struct *si,
> > +                           unsigned long offset, int nr_pages,
> > +                           bool *has_cache)
>
> Please use double tabs for indenting parameters on 2nd line on
> new/changed code:
>
>                 unsigned long offset, int nr_pages, bool *has_cache)
>
> Results in less churn when renaming functions and we can frequently
> avoid some lines.

ack.

>
> > +{
> > +     unsigned char *map = si->swap_map + offset;
> > +     unsigned char *map_end = map + nr_pages;
> > +     bool cached = false;
> > +
> > +     do {
> > +             if ((*map & ~SWAP_HAS_CACHE) != 1)
> > +                     return false;
> > +             if (*map & SWAP_HAS_CACHE)
> > +                     cached = true;
> > +     } while (++map < map_end);
> > +
> > +     *has_cache = cached;
> > +     return true;
> > +}
> > +
> >   /*
> >    * returns number of pages in the folio that backs the swap entry. If positive,
> >    * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
> > @@ -1469,6 +1488,53 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si,
> >       return usage;
> >   }
> >
> > +static bool __swap_entries_free(struct swap_info_struct *si,
> > +                             swp_entry_t entry, int nr)
>
> Dito.

ack.

>
> > +{
> > +     unsigned long offset = swp_offset(entry);
> > +     unsigned int type = swp_type(entry);
> > +     struct swap_cluster_info *ci;
> > +     bool has_cache = false;
> > +     unsigned char count;
> > +     bool can_batch;
> > +     int i;
> > +
> > +     if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
> > +             goto fallback;
> > +     /* cross into another cluster */
> > +     if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
> > +             goto fallback;
> > +
> > +     ci = lock_cluster_or_swap_info(si, offset);
> > +     can_batch = swap_is_last_map(si, offset, nr, &has_cache);
> > +     if (can_batch) {
> > +             for (i = 0; i < nr; i++)
> > +                     WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
> > +     }
> > +     unlock_cluster_or_swap_info(si, ci);
> > +
> > +     if (!can_batch)
> > +             goto fallback;
>
> I'd avoid "can_batch" and just do:
>
> ci = lock_cluster_or_swap_info(si, offset);
> if (!swap_is_last_map(si, offset, nr, &has_cache)) {
>         unlock_cluster_or_swap_info(si, ci);
>         goto fallback;
> }
> for (i = 0; i < nr; i++)
>         WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
> unlock_cluster_or_swap_info(si, ci);

ack.

>
> > +     if (!has_cache) {
> > +             spin_lock(&si->lock);
>
> I'm no expert on that code, but we might drop the cluster lock the take
> the swap_info lock and then retake the cluster lock. I assume there are
> no races we are worrying about here, right?

I suppose so. Even the original single-entry code follows the same pattern:

static unsigned char __swap_entry_free(struct swap_info_struct *p,
                                       swp_entry_t entry)
{
        struct swap_cluster_info *ci;
        unsigned long offset = swp_offset(entry);
        unsigned char usage;

        ci = lock_cluster_or_swap_info(p, offset);
        usage = __swap_entry_free_locked(p, offset, 1);
        unlock_cluster_or_swap_info(p, ci);
        if (!usage)
                free_swap_slot(entry);

        return usage;
}

I assume that once we mark them as SWAP_HAS_CACHE, no one else
will touch them except ourselves.

>
> > +             swap_entry_range_free(si, entry, nr);
> > +             spin_unlock(&si->lock);
> > +     }
> > +     return has_cache;
> > +
> > +fallback:
> > +     for (i = 0; i  < nr; i++) {
>
> One space too much before the "<".

ack.

>
> > +             if (data_race(si->swap_map[offset + i])) {
> > +                     count = __swap_entry_free(si, swp_entry(type, offset + i));
> > +                     if (count == SWAP_HAS_CACHE)
> > +                             has_cache = true;
> > +             } else {
> > +                     WARN_ON_ONCE(1);
> > +             }
> > +     }
> > +     return has_cache;
> > +}
> > +
>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
Re: [PATCH v2 2/2] mm: attempt to batch free swap entries for zap_pte_range()
Posted by David Hildenbrand 1 year, 6 months ago
>>
>>> +     if (!has_cache) {
>>> +             spin_lock(&si->lock);
>>
>> I'm no expert on that code, but we might drop the cluster lock the take
>> the swap_info lock and then retake the cluster lock. I assume there are
>> no races we are worrying about here, right?
> 
> I suppose so. Even the original single-entry code follows the same pattern:
> 
> static unsigned char __swap_entry_free(struct swap_info_struct *p,
>         swp_entry_t entry)
> {
>           struct swap_cluster_info *ci;
>           unsigned long offset = swp_offset(entry);
>           unsigned char usage;
> 
>           ci = lock_cluster_or_swap_info(p, offset);
>           usage = __swap_entry_free_locked(p, offset, 1);
>           unlock_cluster_or_swap_info(p, ci);
>           if (!usage)
>                    free_swap_slot(entry);
> 
>           return usage;
> }
> 
> I assume that once we mark them as SWAP_HAS_CACHE, no one else
> will touch them except ourselves.

That makes sense, thanks!

-- 
Cheers,

David / dhildenb