From: Barry Song <v-songbaohua@oppo.com>
Zhiguo reported that swap release could be a serious bottleneck
during process exits[1]. With mTHP, we have the opportunity to
batch free swaps.
Thanks to the work of Chris and Kairui[2], I was able to achieve
this optimization with minimal code changes by building on their
efforts.
If swap_count is 1, which is likely true as most anon memory is
private, we can free all contiguous swap slots together.
Ran the below test program for measuring the bandwidth of munmap
using zRAM and 64KiB mTHP:
#include <sys/mman.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

unsigned long long tv_to_ms(struct timeval tv)
{
	return tv.tv_sec * 1000 + tv.tv_usec / 1000;
}

int main(void)
{
	struct timeval tv_b, tv_e;
#define SIZE (1024 * 1024 * 1024)
	void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("fail to get memory");
		exit(-1);
	}

	madvise(p, SIZE, MADV_HUGEPAGE);
	memset(p, 0x11, SIZE); /* write to get mem */

	madvise(p, SIZE, MADV_PAGEOUT);

	gettimeofday(&tv_b, NULL);
	munmap(p, SIZE);
	gettimeofday(&tv_e, NULL);

	printf("munmap in bandwidth: %llu bytes/ms\n",
	       SIZE / (tv_to_ms(tv_e) - tv_to_ms(tv_b)));
	return 0;
}
The result is as below (munmap bandwidth, bytes/ms):

             mm-unstable    mm-unstable-with-patch
round1       21053761       63161283
round2       21053761       63161283
round3       21053761       63161283
round4       20648881       67108864
round5       20648881       67108864
munmap bandwidth becomes 3X faster.
[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
[2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
Cc: Kairui Song <kasong@tencent.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
mm/swapfile.c | 76 +++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 65 insertions(+), 11 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 35cb58373493..52e941b6d626 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
return true;
}
+static bool swap_is_last_map(struct swap_info_struct *si,
+ unsigned long offset, int nr_pages, bool *has_cache)
+{
+ unsigned char *map = si->swap_map + offset;
+ unsigned char *map_end = map + nr_pages;
+ unsigned char count = *map;
+
+ if (swap_count(count) != 1)
+ return false;
+
+ while (++map < map_end) {
+ if (*map != count)
+ return false;
+ }
+
+ *has_cache = !!(count & SWAP_HAS_CACHE);
+ return true;
+}
+
/*
* returns number of pages in the folio that backs the swap entry. If positive,
* the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
@@ -1469,6 +1488,51 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si,
return usage;
}
+static bool __swap_entries_free(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
+{
+ unsigned long offset = swp_offset(entry);
+ unsigned int type = swp_type(entry);
+ struct swap_cluster_info *ci;
+ bool has_cache = false;
+ unsigned char count;
+ int i;
+
+ if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
+ goto fallback;
+ /* cross into another cluster */
+ if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+ goto fallback;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (!swap_is_last_map(si, offset, nr, &has_cache)) {
+ unlock_cluster_or_swap_info(si, ci);
+ goto fallback;
+ }
+ for (i = 0; i < nr; i++)
+ WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+
+ if (!has_cache) {
+ spin_lock(&si->lock);
+ swap_entry_range_free(si, entry, nr);
+ spin_unlock(&si->lock);
+ }
+ return has_cache;
+
+fallback:
+ for (i = 0; i < nr; i++) {
+ if (data_race(si->swap_map[offset + i])) {
+ count = __swap_entry_free(si, swp_entry(type, offset + i));
+ if (count == SWAP_HAS_CACHE)
+ has_cache = true;
+ } else {
+ WARN_ON_ONCE(1);
+ }
+ }
+ return has_cache;
+}
+
/*
* Drop the last HAS_CACHE flag of swap entries, caller have to
* ensure all entries belong to the same cgroup.
@@ -1792,11 +1856,9 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
const unsigned long start_offset = swp_offset(entry);
const unsigned long end_offset = start_offset + nr;
- unsigned int type = swp_type(entry);
struct swap_info_struct *si;
bool any_only_cache = false;
unsigned long offset;
- unsigned char count;
if (non_swap_entry(entry))
return;
@@ -1811,15 +1873,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
/*
* First free all entries in the range.
*/
- for (offset = start_offset; offset < end_offset; offset++) {
- if (data_race(si->swap_map[offset])) {
- count = __swap_entry_free(si, swp_entry(type, offset));
- if (count == SWAP_HAS_CACHE)
- any_only_cache = true;
- } else {
- WARN_ON_ONCE(1);
- }
- }
+ any_only_cache = __swap_entries_free(si, entry, nr);
/*
* Short-circuit the below loop if none of the entries had their
--
2.34.1
Hi Barry,
We got a crash report from syzbot that has been bisected to this change.
Please see the comment below.
------------[ cut here ]------------
kernel BUG at mm/swap_cgroup.c:141 !
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI
CPU: 1 UID: 0 PID: 5371 Comm: syz.0.15 Not tainted
6.11.0-rc3-next-20240812-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine,
BIOS Google 06/27/2024
RIP: 0010:swap_cgroup_record+0x2cd/0x2d0 mm/swap_cgroup.c:141
Code: e7 e8 a7 c9 f6 ff e9 64 fe ff ff e8 cd 41 8e ff 48 c7 c7 c0 db
a5 8e 48 89 de e8 2e 8c e8 02 e9 7a fd ff ff e8 b4 41 8e ff 90 <0f> 0b
90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f
RSP: 0018:ffffc90003e172f8 EFLAGS: 00010093
RAX: ffffffff82054c9c RBX: 000000000000000b RCX: ffff88802298bc00
RDX: 0000000000000000 RSI: 000000000000000a RDI: 0000000000000000
RBP: 0000000000000001 R08: ffffffff82054b43 R09: fffff520007c2e3c
R10: dffffc0000000000 R11: fffff520007c2e3c R12: ffff88801cf0f014
R13: 0000000000000000 R14: 000000000000000a R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff8880b9100000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f9332107a8c CR3: 000000000e734000 CR4: 00000000003506f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
__mem_cgroup_uncharge_swap+0x84/0x2e0 mm/memcontrol.c:5118
mem_cgroup_uncharge_swap include/linux/swap.h:668 [inline]
swap_entry_range_free+0x45f/0x1120 mm/swapfile.c:1556
__swap_entries_free mm/swapfile.c:1518 [inline]
free_swap_and_cache_nr+0xa65/0xae0 mm/swapfile.c:1876
zap_pte_range mm/memory.c:1653 [inline]
zap_pmd_range mm/memory.c:1736 [inline]
zap_pud_range mm/memory.c:1765 [inline]
zap_p4d_range mm/memory.c:1786 [inline]
unmap_page_range+0x1924/0x42c0 mm/memory.c:1807
unmap_vmas+0x3cc/0x5f0 mm/memory.c:1897
exit_mmap+0x267/0xc20 mm/mmap.c:1923
__mmput+0x115/0x390 kernel/fork.c:1347
exit_mm+0x220/0x310 kernel/exit.c:571
do_exit+0x9b2/0x28e0 kernel/exit.c:926
do_group_exit+0x207/0x2c0 kernel/exit.c:1088
__do_sys_exit_group kernel/exit.c:1099 [inline]
__se_sys_exit_group kernel/exit.c:1097 [inline]
__x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1097
x64_sys_call+0x2634/0x2640 arch/x86/include/generated/asm/syscalls_64.h:232
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x77/0x7f
On Wed, Aug 7, 2024 at 2:59 PM Barry Song <21cnbao@gmail.com> wrote:
>
> From: Barry Song <v-songbaohua@oppo.com>
>
> Zhiguo reported that swap release could be a serious bottleneck
> during process exits[1]. With mTHP, we have the opportunity to
> batch free swaps.
> Thanks to the work of Chris and Kairui[2], I was able to achieve
> this optimization with minimal code changes by building on their
> efforts.
> If swap_count is 1, which is likely true as most anon memory are
> private, we can free all contiguous swap slots all together.
>
> Ran the below test program for measuring the bandwidth of munmap
> using zRAM and 64KiB mTHP:
>
> #include <sys/mman.h>
> #include <sys/time.h>
> #include <stdlib.h>
>
> unsigned long long tv_to_ms(struct timeval tv)
> {
> return tv.tv_sec * 1000 + tv.tv_usec / 1000;
> }
>
> main()
> {
> struct timeval tv_b, tv_e;
> int i;
> #define SIZE 1024*1024*1024
> void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
> MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> if (!p) {
> perror("fail to get memory");
> exit(-1);
> }
>
> madvise(p, SIZE, MADV_HUGEPAGE);
> memset(p, 0x11, SIZE); /* write to get mem */
>
> madvise(p, SIZE, MADV_PAGEOUT);
>
> gettimeofday(&tv_b, NULL);
> munmap(p, SIZE);
> gettimeofday(&tv_e, NULL);
>
> printf("munmap in bandwidth: %ld bytes/ms\n",
> SIZE/(tv_to_ms(tv_e) - tv_to_ms(tv_b)));
> }
>
> The result is as below (munmap bandwidth):
> mm-unstable mm-unstable-with-patch
> round1 21053761 63161283
> round2 21053761 63161283
> round3 21053761 63161283
> round4 20648881 67108864
> round5 20648881 67108864
>
> munmap bandwidth becomes 3X faster.
>
> [1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
> [2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
>
> Cc: Kairui Song <kasong@tencent.com>
> Cc: Chris Li <chrisl@kernel.org>
> Cc: "Huang, Ying" <ying.huang@intel.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Kalesh Singh <kaleshsingh@google.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: David Hildenbrand <david@redhat.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
> mm/swapfile.c | 76 +++++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 65 insertions(+), 11 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 35cb58373493..52e941b6d626 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
> return true;
> }
>
> +static bool swap_is_last_map(struct swap_info_struct *si,
> + unsigned long offset, int nr_pages, bool *has_cache)
> +{
> + unsigned char *map = si->swap_map + offset;
> + unsigned char *map_end = map + nr_pages;
> + unsigned char count = *map;
> +
> + if (swap_count(count) != 1)
> + return false;
> +
> + while (++map < map_end) {
> + if (*map != count)
> + return false;
> + }
> +
> + *has_cache = !!(count & SWAP_HAS_CACHE);
> + return true;
> +}
> +
> /*
> * returns number of pages in the folio that backs the swap entry. If positive,
> * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
> @@ -1469,6 +1488,51 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si,
> return usage;
> }
>
> +static bool __swap_entries_free(struct swap_info_struct *si,
> + swp_entry_t entry, int nr)
> +{
> + unsigned long offset = swp_offset(entry);
> + unsigned int type = swp_type(entry);
> + struct swap_cluster_info *ci;
> + bool has_cache = false;
> + unsigned char count;
> + int i;
> +
> + if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
> + goto fallback;
> + /* cross into another cluster */
> + if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
> + goto fallback;
> +
> + ci = lock_cluster_or_swap_info(si, offset);
> + if (!swap_is_last_map(si, offset, nr, &has_cache)) {
> + unlock_cluster_or_swap_info(si, ci);
> + goto fallback;
> + }
> + for (i = 0; i < nr; i++)
> + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
> + unlock_cluster_or_swap_info(si, ci);
> +
> + if (!has_cache) {
> + spin_lock(&si->lock);
> + swap_entry_range_free(si, entry, nr);
Here it calls swap_entry_range_free() to free a range of swap
entries. However, swap_entry_range_free() assumes that all entries
belong to the same folio and are charged to the same memcg.
It eventually passes down to swap_cgroup_record(), which hits the
BUG on this line:
VM_BUG_ON(sc->id != old);
The root cause is that the swap entries are not all from the same
memcg. Thanks to Yosry for finding the root cause.
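To make the broken invariant concrete, below is a minimal userspace
model (not kernel code; the names are made up for illustration) of
what swap_cgroup_record() expects from a batched range. The assert
stands in for the VM_BUG_ON above: it fires as soon as the recorded
range crosses from one cgroup id into another.

/*
 * Minimal userspace model (not kernel code) of the invariant that
 * swap_cgroup_record() relies on: every slot in a recorded range must
 * currently hold the same "old" cgroup id.  Two adjacent folios charged
 * to different memcgs violate this when freed as a single batch.
 */
#include <assert.h>
#include <stdio.h>

#define NR_SLOTS 8

static unsigned short slot_id[NR_SLOTS];	/* stand-in for swap_cgroup ids */

/* Record 'id' over [start, start + nr), expecting every slot to hold 'old'. */
static void record_range(int start, int nr, unsigned short id, unsigned short old)
{
	int i;

	for (i = start; i < start + nr; i++) {
		assert(slot_id[i] == old);	/* models VM_BUG_ON(sc->id != old) */
		slot_id[i] = id;
	}
}

int main(void)
{
	int i;

	/* slots 0-3 charged to memcg 10, slots 4-7 to memcg 11 */
	for (i = 0; i < 4; i++)
		slot_id[i] = 10;
	for (i = 4; i < NR_SLOTS; i++)
		slot_id[i] = 11;

	/* batch-freeing all 8 slots as if they were one folio: asserts at slot 4 */
	record_range(0, NR_SLOTS, 0, 10);
	printf("not reached for a mixed-memcg range\n");
	return 0;
}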
> + spin_unlock(&si->lock);
> + }
> + return has_cache;
> +
> +fallback:
> + for (i = 0; i < nr; i++) {
> + if (data_race(si->swap_map[offset + i])) {
> + count = __swap_entry_free(si, swp_entry(type, offset + i));
> + if (count == SWAP_HAS_CACHE)
> + has_cache = true;
> + } else {
> + WARN_ON_ONCE(1);
> + }
> + }
> + return has_cache;
> +}
> +
> /*
> * Drop the last HAS_CACHE flag of swap entries, caller have to
> * ensure all entries belong to the same cgroup.
> @@ -1792,11 +1856,9 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> {
> const unsigned long start_offset = swp_offset(entry);
> const unsigned long end_offset = start_offset + nr;
> - unsigned int type = swp_type(entry);
> struct swap_info_struct *si;
> bool any_only_cache = false;
> unsigned long offset;
> - unsigned char count;
>
> if (non_swap_entry(entry))
> return;
> @@ -1811,15 +1873,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> /*
> * First free all entries in the range.
> */
> - for (offset = start_offset; offset < end_offset; offset++) {
> - if (data_race(si->swap_map[offset])) {
> - count = __swap_entry_free(si, swp_entry(type, offset));
> - if (count == SWAP_HAS_CACHE)
> - any_only_cache = true;
> - } else {
> - WARN_ON_ONCE(1);
> - }
> - }
> + any_only_cache = __swap_entries_free(si, entry, nr);
Here we are just doing a page table walk; there is no guarantee that
the 'nr' swap entries came from the same folio or were previously
charged to the same memcg. swap_pte_batch() only checks that they
have the same swap type; it does not check that they are charged to
the same memcg.
Chris
>
> /*
> * Short-circuit the below loop if none of the entries had their
> --
> 2.34.1
>
On Fri, Aug 16, 2024 at 6:29 AM Chris Li <chrisl@kernel.org> wrote:
>
> Hi Barry,
>
> We got a crash report from syzbot that has been bisect into this change.
> > +static bool __swap_entries_free(struct swap_info_struct *si,
> > + swp_entry_t entry, int nr)
> > +{
> > + unsigned long offset = swp_offset(entry);
> > + unsigned int type = swp_type(entry);
> > + struct swap_cluster_info *ci;
> > + bool has_cache = false;
> > + unsigned char count;
> > + int i;
> > +
> > + if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
> > + goto fallback;
> > + /* cross into another cluster */
> > + if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
> > + goto fallback;
> > +
> > + ci = lock_cluster_or_swap_info(si, offset);
> > + if (!swap_is_last_map(si, offset, nr, &has_cache)) {
> > + unlock_cluster_or_swap_info(si, ci);
> > + goto fallback;
> > + }
> > + for (i = 0; i < nr; i++)
> > + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
> > + unlock_cluster_or_swap_info(si, ci);
> > +
> > + if (!has_cache) {
> > + spin_lock(&si->lock);
> > + swap_entry_range_free(si, entry, nr);
>
> Here it calls swap_entry_range_free() to free a range of the swap
> entry. However the swap_entry_range_free() has the assumption that all
> entries belong to the same folio and charge to the same memcg.
> It eventually pass down to swap_cgroup_record(), which BUG on this line:
>
> VM_BUG_ON(sc->id != old);
>
> The root cause is that the swap entries are not from the same memcg.
> Thankos Yosry for finding the root cause.
>
> > + spin_unlock(&si->lock);
> > + }
> > + return has_cache;
> > +
> > +fallback:
> > + for (i = 0; i < nr; i++) {
> > + if (data_race(si->swap_map[offset + i])) {
> > + count = __swap_entry_free(si, swp_entry(type, offset + i));
> > + if (count == SWAP_HAS_CACHE)
> > + has_cache = true;
> > + } else {
> > + WARN_ON_ONCE(1);
> > + }
> > + }
> > + return has_cache;
> > +}
> > +
> > /*
> > * Drop the last HAS_CACHE flag of swap entries, caller have to
> > * ensure all entries belong to the same cgroup.
> > @@ -1792,11 +1856,9 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> > {
> > const unsigned long start_offset = swp_offset(entry);
> > const unsigned long end_offset = start_offset + nr;
> > - unsigned int type = swp_type(entry);
> > struct swap_info_struct *si;
> > bool any_only_cache = false;
> > unsigned long offset;
> > - unsigned char count;
> >
> > if (non_swap_entry(entry))
> > return;
> > @@ -1811,15 +1873,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> > /*
> > * First free all entries in the range.
> > */
> > - for (offset = start_offset; offset < end_offset; offset++) {
> > - if (data_race(si->swap_map[offset])) {
> > - count = __swap_entry_free(si, swp_entry(type, offset));
> > - if (count == SWAP_HAS_CACHE)
> > - any_only_cache = true;
> > - } else {
> > - WARN_ON_ONCE(1);
> > - }
> > - }
> > + any_only_cache = __swap_entries_free(si, entry, nr);
>
> Here we are just doing a page table walk, there is no guarantee the
> 'nr' number of swap entries came from the same folio and previously
> charged to the same memcg. The swap_pte_batch() only checks they are
> the same swap type, does not check they charge to the same memcg.
>
Sorry for the trouble, and thanks for the report, Yosry & Chris.
Does the below fix the problem? Otherwise, should we instead remove
the assumption that all swaps in a batch free belong to one
swap_cgroup? (A rough sketch of that alternative is included after
the patch.)
From c68e0d780ba808da4bb682b753e3fa77c4f96e13 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Fri, 16 Aug 2024 09:36:23 +1200
Subject: [PATCH] mm: check all swaps belong to same swap_cgroup in
swap_pte_batch()
Right now, it is possible that two folios are contiguous in swap
slots but do not belong to one memcg. In this case, even if we
return a large nr, we can't really batch free all slots.
Reported-by: Yosry Ahmed <yosryahmed@google.com>
Reported-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
mm/internal.h | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index adbf8c88c9df..d1f1e221212d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -15,6 +15,7 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/swap_cgroup.h>
#include <linux/tracepoint-defs.h>
/* Internal core VMA manipulation functions. */
@@ -275,18 +276,22 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
{
pte_t expected_pte = pte_next_swp_offset(pte);
const pte_t *end_ptep = start_ptep + max_nr;
+ swp_entry_t entry = pte_to_swp_entry(pte);
pte_t *ptep = start_ptep + 1;
+ unsigned short cgroup_id;
VM_WARN_ON(max_nr < 1);
VM_WARN_ON(!is_swap_pte(pte));
- VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
+ VM_WARN_ON(non_swap_entry(entry));
+ cgroup_id = lookup_swap_cgroup_id(entry);
while (ptep < end_ptep) {
pte = ptep_get(ptep);
if (!pte_same(pte, expected_pte))
break;
-
+ if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id)
+ break;
expected_pte = pte_next_swp_offset(expected_pte);
ptep++;
}
--
2.34.1
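For reference, the other direction mentioned above, dropping the
single-cgroup assumption in the batch-free path instead of
constraining swap_pte_batch(), could look roughly like the sketch
below. It only uses helpers already visible in this thread
(lookup_swap_cgroup_id(), swap_entry_range_free(), swp_entry()), the
helper name is made up, and it is untested and ignores locking, so
please read it as an illustration rather than a proposal:

/*
 * Illustrative sketch only (hypothetical helper name): split a batched
 * range at memcg boundaries so that each swap_entry_range_free() call
 * covers entries charged to a single cgroup.  Untested; locking and
 * error handling are ignored.
 */
static void swap_entry_range_free_per_cgroup(struct swap_info_struct *si,
					     swp_entry_t entry, int nr)
{
	unsigned long offset = swp_offset(entry);
	unsigned int type = swp_type(entry);
	int start = 0;

	while (start < nr) {
		unsigned short id;
		int end = start + 1;

		id = lookup_swap_cgroup_id(swp_entry(type, offset + start));
		/* extend the sub-range while the cgroup id stays the same */
		while (end < nr &&
		       lookup_swap_cgroup_id(swp_entry(type, offset + end)) == id)
			end++;

		swap_entry_range_free(si, swp_entry(type, offset + start),
				      end - start);
		start = end;
	}
}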
> Chris
>
> >
> > /*
> > * Short-circuit the below loop if none of the entries had their
> > --
> > 2.34.1
> >
Thanks
Barry
On Fri, 16 Aug 2024, Barry Song wrote:
> Subject: [PATCH] mm: check all swaps belong to same swap_cgroup in
> swap_pte_batch()
>
> Right now, it is possible two folios are contiguous in swap slots
> but they don't belong to one memcg. In this case, even we return
> a large nr, we can't really batch free all slots.
>
> Reported-by: Yosry Ahmed <yosryahmed@google.com>
> Reported-by: Chris Li <chrisl@kernel.org>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
> mm/internal.h | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index adbf8c88c9df..d1f1e221212d 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -15,6 +15,7 @@
> #include <linux/rmap.h>
> #include <linux/swap.h>
> #include <linux/swapops.h>
> +#include <linux/swap_cgroup.h>
> #include <linux/tracepoint-defs.h>
>
> /* Internal core VMA manipulation functions. */
> @@ -275,18 +276,22 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
> {
> pte_t expected_pte = pte_next_swp_offset(pte);
> const pte_t *end_ptep = start_ptep + max_nr;
> + swp_entry_t entry = pte_to_swp_entry(pte);
> pte_t *ptep = start_ptep + 1;
> + unsigned short cgroup_id;
>
> VM_WARN_ON(max_nr < 1);
> VM_WARN_ON(!is_swap_pte(pte));
> - VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
> + VM_WARN_ON(non_swap_entry(entry));
>
> + cgroup_id = lookup_swap_cgroup_id(entry);
> while (ptep < end_ptep) {
> pte = ptep_get(ptep);
>
> if (!pte_same(pte, expected_pte))
> break;
> -
> + if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id)
> + break;
> expected_pte = pte_next_swp_offset(expected_pte);
> ptep++;
> }
> --
[PATCH] mm: check all swaps belong to same swap_cgroup in swap_pte_batch() fix
mm-unstable swap_pte_batch() adds a new usage of lookup_swap_cgroup_id(),
which crashes if a CONFIG_MEMCG kernel is booted with
"cgroup_disable=memory": it now needs a mem_cgroup_disabled() check.
Fixes: 92b50df44566 ("mm: check all swaps belong to same swap_cgroup in swap_pte_batch()")
Signed-off-by: Hugh Dickins <hughd@google.com>
---
mm/swap_cgroup.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index db6c4a26cf59..da1278f0563b 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -161,6 +161,8 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
*/
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
+ if (mem_cgroup_disabled())
+ return 0;
return lookup_swap_cgroup(ent, NULL)->id;
}
--
2.35.3
On Mon, Aug 26, 2024 at 8:09 AM Hugh Dickins <hughd@google.com> wrote:
>
> On Fri, 16 Aug 2024, Barry Song wrote:
> > Subject: [PATCH] mm: check all swaps belong to same swap_cgroup in
> > swap_pte_batch()
> >
> > Right now, it is possible two folios are contiguous in swap slots
> > but they don't belong to one memcg. In this case, even we return
> > a large nr, we can't really batch free all slots.
> >
> > Reported-by: Yosry Ahmed <yosryahmed@google.com>
> > Reported-by: Chris Li <chrisl@kernel.org>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> > mm/internal.h | 9 +++++++--
> > 1 file changed, 7 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/internal.h b/mm/internal.h
> > index adbf8c88c9df..d1f1e221212d 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -15,6 +15,7 @@
> > #include <linux/rmap.h>
> > #include <linux/swap.h>
> > #include <linux/swapops.h>
> > +#include <linux/swap_cgroup.h>
> > #include <linux/tracepoint-defs.h>
> >
> > /* Internal core VMA manipulation functions. */
> > @@ -275,18 +276,22 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
> > {
> > pte_t expected_pte = pte_next_swp_offset(pte);
> > const pte_t *end_ptep = start_ptep + max_nr;
> > + swp_entry_t entry = pte_to_swp_entry(pte);
> > pte_t *ptep = start_ptep + 1;
> > + unsigned short cgroup_id;
> >
> > VM_WARN_ON(max_nr < 1);
> > VM_WARN_ON(!is_swap_pte(pte));
> > - VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
> > + VM_WARN_ON(non_swap_entry(entry));
> >
> > + cgroup_id = lookup_swap_cgroup_id(entry);
> > while (ptep < end_ptep) {
> > pte = ptep_get(ptep);
> >
> > if (!pte_same(pte, expected_pte))
> > break;
> > -
> > + if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id)
> > + break;
> > expected_pte = pte_next_swp_offset(expected_pte);
> > ptep++;
> > }
> > --
>
> [PATCH] mm: check all swaps belong to same swap_cgroup in swap_pte_batch() fix
>
> mm-unstable swap_pte_batch() adds a new usage of lookup_swap_cgroup_id(),
> which crashes if CONFIG_MEMCG kernel booted with "cgroup_disable=memory":
> it now needs a mem_cgroup_disabled() check.
Sorry for the trouble.
>
> Fixes: 92b50df44566 ("mm: check all swaps belong to same swap_cgroup in swap_pte_batch()")
> Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Barry Song <baohua@kernel.org>
> ---
> mm/swap_cgroup.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
> index db6c4a26cf59..da1278f0563b 100644
> --- a/mm/swap_cgroup.c
> +++ b/mm/swap_cgroup.c
> @@ -161,6 +161,8 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
> */
> unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
> {
> + if (mem_cgroup_disabled())
> + return 0;
> return lookup_swap_cgroup(ent, NULL)->id;
> }
>
> --
> 2.35.3
On 07.08.24 23:58, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
>
> Zhiguo reported that swap release could be a serious bottleneck
> during process exits[1]. With mTHP, we have the opportunity to
> batch free swaps.
> Thanks to the work of Chris and Kairui[2], I was able to achieve
> this optimization with minimal code changes by building on their
> efforts.
> If swap_count is 1, which is likely true as most anon memory are
> private, we can free all contiguous swap slots all together.
>
> Ran the below test program for measuring the bandwidth of munmap
> using zRAM and 64KiB mTHP:
>
> #include <sys/mman.h>
> #include <sys/time.h>
> #include <stdlib.h>
>
> unsigned long long tv_to_ms(struct timeval tv)
> {
> return tv.tv_sec * 1000 + tv.tv_usec / 1000;
> }
>
> main()
> {
> struct timeval tv_b, tv_e;
> int i;
> #define SIZE 1024*1024*1024
> void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
> MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> if (!p) {
> perror("fail to get memory");
> exit(-1);
> }
>
> madvise(p, SIZE, MADV_HUGEPAGE);
> memset(p, 0x11, SIZE); /* write to get mem */
>
> madvise(p, SIZE, MADV_PAGEOUT);
>
> gettimeofday(&tv_b, NULL);
> munmap(p, SIZE);
> gettimeofday(&tv_e, NULL);
>
> printf("munmap in bandwidth: %ld bytes/ms\n",
> SIZE/(tv_to_ms(tv_e) - tv_to_ms(tv_b)));
> }
>
> The result is as below (munmap bandwidth):
> mm-unstable mm-unstable-with-patch
> round1 21053761 63161283
> round2 21053761 63161283
> round3 21053761 63161283
> round4 20648881 67108864
> round5 20648881 67108864
>
> munmap bandwidth becomes 3X faster.
>
> [1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
> [2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
>
> Cc: Kairui Song <kasong@tencent.com>
> Cc: Chris Li <chrisl@kernel.org>
> Cc: "Huang, Ying" <ying.huang@intel.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Kalesh Singh <kaleshsingh@google.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: David Hildenbrand <david@redhat.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
All looks straightforward to me.
Acked-by: David Hildenbrand <david@redhat.com>
--
Cheers,
David / dhildenb