[RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Usama Arif 1 month, 1 week ago
After large folio zswapout support was added in [1], this series adds
support for zswapin of large folios to bring it on par with zram.
This series makes sure that the benefits of large folios (fewer
page faults, batched PTE and rmap manipulation, reduced LRU list
length, TLB coalescing (for arm64 and AMD)) are not lost at swapout
when using zswap.

It builds on top of [2], which added large folio swapin support for
zram, and provides the same level of large folio swapin support as
zram, i.e. only supporting swap count == 1.

Patch 1 skips the swapcache when swapping in zswap pages. This should
improve non-readahead swapin performance [3], and also allows us to build
on the large folio swapin support added in [2], so it is a prerequisite
for patch 3.

Patch 3 adds support for large folio zswapin. This patch does not add
support for hybrid backends (i.e. folios partly present in swap and
partly in zswap).

The main performance benefit comes from maintaining large folios *after*
swapin; large folio performance improvements have already been covered in
the previous series posted on this [2],[4], so they are not repeated here.
Below is a simple microbenchmark that measures the time needed to zswpin
1G of memory (along with a memory integrity check).

                                |  no mTHP (ms) | 1M mTHP enabled (ms)
Base kernel                     |   1165        |    1163
Kernel with mTHP zswpin series  |   1203        |     738

The time measured was fairly consistent between runs (~1-2% variation).
There is a 36% improvement in zswapin time with 1M folios. The percentage
improvement would likely be higher if the memcmp were removed.
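(For reference, 738ms against the ~1163-1165ms baseline works out to
(1165 - 738) / 1165 ≈ 36.7%, i.e. the ~36% quoted above.)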

diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
index 40de679248b8..77068c577c86 100644
--- a/tools/testing/selftests/cgroup/test_zswap.c
+++ b/tools/testing/selftests/cgroup/test_zswap.c
@@ -9,6 +9,8 @@
 #include <string.h>
 #include <sys/wait.h>
 #include <sys/mman.h>
+#include <sys/time.h>
+#include <malloc.h>
 
 #include "../kselftest.h"
 #include "cgroup_util.h"
@@ -407,6 +409,74 @@ static int test_zswap_writeback_disabled(const char *root)
        return test_zswap_writeback(root, false);
 }
 
+static int zswapin_perf(const char *cgroup, void *arg)
+{
+       long pagesize = sysconf(_SC_PAGESIZE);
+       size_t memsize = MB(1*1024);
+       char buf[pagesize];
+       int ret = -1;
+       char *mem;
+       struct timeval start, end;
+
+       mem = (char *)memalign(2*1024*1024, memsize);
+       if (!mem)
+               return ret;
+
+       /*
+        * Fill half of each page with increasing data, and keep other
+        * half empty, this will result in data that is still compressible
+        * and ends up in zswap, with material zswap usage.
+        */
+       for (int i = 0; i < pagesize; i++)
+               buf[i] = i < pagesize/2 ? (char) i : 0;
+
+       for (int i = 0; i < memsize; i += pagesize)
+               memcpy(&mem[i], buf, pagesize);
+
+       /* Try and reclaim allocated memory */
+       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
+               ksft_print_msg("Failed to reclaim all of the requested memory\n");
+               goto out;
+       }
+
+       gettimeofday(&start, NULL);
+       /* zswpin */
+       for (int i = 0; i < memsize; i += pagesize) {
+               if (memcmp(&mem[i], buf, pagesize)) {
+                       ksft_print_msg("invalid memory\n");
+                       goto out;
+               }
+       }
+       gettimeofday(&end, NULL);
+       printf ("zswapin took %fms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
+       ret = 0;
+out:
+       free(mem);
+       return ret;
+}
+
+static int test_zswapin_perf(const char *root)
+{
+       int ret = KSFT_FAIL;
+       char *test_group;
+
+       test_group = cg_name(root, "zswapin_perf_test");
+       if (!test_group)
+               goto out;
+       if (cg_create(test_group))
+               goto out;
+
+       if (cg_run(test_group, zswapin_perf, NULL))
+               goto out;
+
+       ret = KSFT_PASS;
+out:
+       cg_destroy(test_group);
+       free(test_group);
+       return ret;
+}
+
 /*
  * When trying to store a memcg page in zswap, if the memcg hits its memory
  * limit in zswap, writeback should affect only the zswapped pages of that
@@ -584,6 +654,7 @@ struct zswap_test {
        T(test_zswapin),
        T(test_zswap_writeback_enabled),
        T(test_zswap_writeback_disabled),
+       T(test_zswapin_perf),
        T(test_no_kmem_bypass),
        T(test_no_invasive_cgroup_shrink),
 };
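
For anyone wanting to reproduce the numbers: the new test gets picked up
by the existing cgroup kselftest target, so something along these lines
should work (a rough sketch; it assumes zswap is already configured and
cgroup v2 is mounted at /sys/fs/cgroup):

  # build the cgroup selftests (test_zswap is part of this target)
  make -C tools/testing/selftests TARGETS=cgroup
  # make sure zswap is on, otherwise the test exercises the wrong path
  echo Y | sudo tee /sys/module/zswap/parameters/enabled
  # for the "1M mTHP enabled" column
  echo always | sudo tee /sys/kernel/mm/transparent_hugepage/hugepages-1024kB/enabled
  # run as root; zswapin_perf prints the measured time
  sudo ./tools/testing/selftests/cgroup/test_zswap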

[1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
[2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
[3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
[4] https://lwn.net/Articles/955575/

Usama Arif (4):
  mm/zswap: skip swapcache for swapping in zswap pages
  mm/zswap: modify zswap_decompress to accept page instead of folio
  mm/zswap: add support for large folio zswapin
  mm/zswap: count successful large folio zswap loads

 Documentation/admin-guide/mm/transhuge.rst |   3 +
 include/linux/huge_mm.h                    |   1 +
 include/linux/zswap.h                      |   6 ++
 mm/huge_memory.c                           |   3 +
 mm/memory.c                                |  16 +--
 mm/page_io.c                               |   2 +-
 mm/zswap.c                                 | 120 ++++++++++++++-------
 7 files changed, 99 insertions(+), 52 deletions(-)

-- 
2.43.5
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Barry Song 1 month ago
On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
> After large folio zswapout support added in [1], this patch adds
> support for zswapin of large folios to bring it on par with zram.
> This series makes sure that the benefits of large folios (fewer
> page faults, batched PTE and rmap manipulation, reduced lru list,
> TLB coalescing (for arm64 and amd)) are not lost at swap out when
> using zswap.
>
> It builds on top of [2] which added large folio swapin support for
> zram and provides the same level of large folio swapin support as
> zram, i.e. only supporting swap count == 1.
>
> Patch 1 skips swapcache for swapping in zswap pages, this should improve
> no readahead swapin performance [3], and also allows us to build on large
> folio swapin support added in [2], hence is a prerequisite for patch 3.
>
> Patch 3 adds support for large folio zswapin. This patch does not add
> support for hybrid backends (i.e. folios partly present swap and zswap).
>
> The main performance benefit comes from maintaining large folios *after*
> swapin, large folio performance improvements have been mentioned in previous
> series posted on it [2],[4], so have not added those. Below is a simple
> microbenchmark to measure the time needed *for* zswpin of 1G memory (along
> with memory integrity check).
>
>                                 |  no mTHP (ms) | 1M mTHP enabled (ms)
> Base kernel                     |   1165        |    1163
> Kernel with mTHP zswpin series  |   1203        |     738

Hi Usama,
Do you know where this minor regression for non-mTHP comes from?
Since you have skipped the swapcache even for small folios in zswap in
patch 1, that part should show some gain. Is it because of
zswap_present_test()?

>
> The time measured was pretty consistent between runs (~1-2% variation).
> There is 36% improvement in zswapin time with 1M folios. The percentage
> improvement is likely to be more if the memcmp is removed.
>
> diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
> index 40de679248b8..77068c577c86 100644
> --- a/tools/testing/selftests/cgroup/test_zswap.c
> +++ b/tools/testing/selftests/cgroup/test_zswap.c
> @@ -9,6 +9,8 @@
>  #include <string.h>
>  #include <sys/wait.h>
>  #include <sys/mman.h>
> +#include <sys/time.h>
> +#include <malloc.h>
>
>  #include "../kselftest.h"
>  #include "cgroup_util.h"
> @@ -407,6 +409,74 @@ static int test_zswap_writeback_disabled(const char *root)
>         return test_zswap_writeback(root, false);
>  }
>
> +static int zswapin_perf(const char *cgroup, void *arg)
> +{
> +       long pagesize = sysconf(_SC_PAGESIZE);
> +       size_t memsize = MB(1*1024);
> +       char buf[pagesize];
> +       int ret = -1;
> +       char *mem;
> +       struct timeval start, end;
> +
> +       mem = (char *)memalign(2*1024*1024, memsize);
> +       if (!mem)
> +               return ret;
> +
> +       /*
> +        * Fill half of each page with increasing data, and keep other
> +        * half empty, this will result in data that is still compressible
> +        * and ends up in zswap, with material zswap usage.
> +        */
> +       for (int i = 0; i < pagesize; i++)
> +               buf[i] = i < pagesize/2 ? (char) i : 0;
> +
> +       for (int i = 0; i < memsize; i += pagesize)
> +               memcpy(&mem[i], buf, pagesize);
> +
> +       /* Try and reclaim allocated memory */
> +       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
> +               ksft_print_msg("Failed to reclaim all of the requested memory\n");
> +               goto out;
> +       }
> +
> +       gettimeofday(&start, NULL);
> +       /* zswpin */
> +       for (int i = 0; i < memsize; i += pagesize) {
> +               if (memcmp(&mem[i], buf, pagesize)) {
> +                       ksft_print_msg("invalid memory\n");
> +                       goto out;
> +               }
> +       }
> +       gettimeofday(&end, NULL);
> +       printf ("zswapin took %fms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
> +       ret = 0;
> +out:
> +       free(mem);
> +       return ret;
> +}
> +
> +static int test_zswapin_perf(const char *root)
> +{
> +       int ret = KSFT_FAIL;
> +       char *test_group;
> +
> +       test_group = cg_name(root, "zswapin_perf_test");
> +       if (!test_group)
> +               goto out;
> +       if (cg_create(test_group))
> +               goto out;
> +
> +       if (cg_run(test_group, zswapin_perf, NULL))
> +               goto out;
> +
> +       ret = KSFT_PASS;
> +out:
> +       cg_destroy(test_group);
> +       free(test_group);
> +       return ret;
> +}
> +
>  /*
>   * When trying to store a memcg page in zswap, if the memcg hits its memory
>   * limit in zswap, writeback should affect only the zswapped pages of that
> @@ -584,6 +654,7 @@ struct zswap_test {
>         T(test_zswapin),
>         T(test_zswap_writeback_enabled),
>         T(test_zswap_writeback_disabled),
> +       T(test_zswapin_perf),
>         T(test_no_kmem_bypass),
>         T(test_no_invasive_cgroup_shrink),
>  };
>
> [1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
> [2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
> [3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
> [4] https://lwn.net/Articles/955575/
>
> Usama Arif (4):
>   mm/zswap: skip swapcache for swapping in zswap pages
>   mm/zswap: modify zswap_decompress to accept page instead of folio
>   mm/zswap: add support for large folio zswapin
>   mm/zswap: count successful large folio zswap loads
>
>  Documentation/admin-guide/mm/transhuge.rst |   3 +
>  include/linux/huge_mm.h                    |   1 +
>  include/linux/zswap.h                      |   6 ++
>  mm/huge_memory.c                           |   3 +
>  mm/memory.c                                |  16 +--
>  mm/page_io.c                               |   2 +-
>  mm/zswap.c                                 | 120 ++++++++++++++-------
>  7 files changed, 99 insertions(+), 52 deletions(-)
>
> --
> 2.43.5
>

Thanks
barry
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Usama Arif 1 month ago

On 21/10/2024 06:09, Barry Song wrote:
> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>> After large folio zswapout support added in [1], this patch adds
>> support for zswapin of large folios to bring it on par with zram.
>> This series makes sure that the benefits of large folios (fewer
>> page faults, batched PTE and rmap manipulation, reduced lru list,
>> TLB coalescing (for arm64 and amd)) are not lost at swap out when
>> using zswap.
>>
>> It builds on top of [2] which added large folio swapin support for
>> zram and provides the same level of large folio swapin support as
>> zram, i.e. only supporting swap count == 1.
>>
>> Patch 1 skips swapcache for swapping in zswap pages, this should improve
>> no readahead swapin performance [3], and also allows us to build on large
>> folio swapin support added in [2], hence is a prerequisite for patch 3.
>>
>> Patch 3 adds support for large folio zswapin. This patch does not add
>> support for hybrid backends (i.e. folios partly present swap and zswap).
>>
>> The main performance benefit comes from maintaining large folios *after*
>> swapin, large folio performance improvements have been mentioned in previous
>> series posted on it [2],[4], so have not added those. Below is a simple
>> microbenchmark to measure the time needed *for* zswpin of 1G memory (along
>> with memory integrity check).
>>
>>                                 |  no mTHP (ms) | 1M mTHP enabled (ms)
>> Base kernel                     |   1165        |    1163
>> Kernel with mTHP zswpin series  |   1203        |     738
> 
> Hi Usama,
> Do you know where this minor regression for non-mTHP comes from?
> As you even have skipped swapcache for small folios in zswap in patch1,
> that part should have some gain? is it because of zswap_present_test()?
> 

Hi Barry,

The microbenchmark does a sequential read of 1G of memory, so it probably
isn't very representative of real world usecases. It also means that
swap_vma_readahead is able to accurately read ahead all pages in its window.
With this patch series, when doing 4K swapin, you get 1G/4K calls of fast
do_swap_page. Without this patch, you get 1G/(4K * readahead window) slower
do_swap_page calls. I had added some prints and was seeing 8 pages being
read ahead per do_swap_page. The larger number of calls causes the slight
regression (even though each call is quite fast). I think in a realistic
scenario, where the readahead window won't be as large, there won't be a
regression. The cost of zswap_present_test in the whole call stack of
swapping in a page is very low and I think can be ignored.
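
To put rough numbers on that (using the 8-page readahead window I was
seeing in the prints):

  1G / 4K        = 262144 fast do_swap_page calls with this series
  1G / (4K * 8)  =  32768 slower do_swap_page calls on the base kernel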

I think the more interesting thing is what Kanchana pointed out in
https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
I am curious, did you see this when testing large folio swapin with
compression at 4K granularity? It looks like swap thrashing, so I think it
would be common to zswap and zram. I don't have larger-granularity zswap
compression done, which is why I think there is a regression in time taken.
(It could also be because it was tested on Intel.)

Thanks,
Usama


>>
>> The time measured was pretty consistent between runs (~1-2% variation).
>> There is 36% improvement in zswapin time with 1M folios. The percentage
>> improvement is likely to be more if the memcmp is removed.
>>
>> diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
>> index 40de679248b8..77068c577c86 100644
>> --- a/tools/testing/selftests/cgroup/test_zswap.c
>> +++ b/tools/testing/selftests/cgroup/test_zswap.c
>> @@ -9,6 +9,8 @@
>>  #include <string.h>
>>  #include <sys/wait.h>
>>  #include <sys/mman.h>
>> +#include <sys/time.h>
>> +#include <malloc.h>
>>
>>  #include "../kselftest.h"
>>  #include "cgroup_util.h"
>> @@ -407,6 +409,74 @@ static int test_zswap_writeback_disabled(const char *root)
>>         return test_zswap_writeback(root, false);
>>  }
>>
>> +static int zswapin_perf(const char *cgroup, void *arg)
>> +{
>> +       long pagesize = sysconf(_SC_PAGESIZE);
>> +       size_t memsize = MB(1*1024);
>> +       char buf[pagesize];
>> +       int ret = -1;
>> +       char *mem;
>> +       struct timeval start, end;
>> +
>> +       mem = (char *)memalign(2*1024*1024, memsize);
>> +       if (!mem)
>> +               return ret;
>> +
>> +       /*
>> +        * Fill half of each page with increasing data, and keep other
>> +        * half empty, this will result in data that is still compressible
>> +        * and ends up in zswap, with material zswap usage.
>> +        */
>> +       for (int i = 0; i < pagesize; i++)
>> +               buf[i] = i < pagesize/2 ? (char) i : 0;
>> +
>> +       for (int i = 0; i < memsize; i += pagesize)
>> +               memcpy(&mem[i], buf, pagesize);
>> +
>> +       /* Try and reclaim allocated memory */
>> +       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
>> +               ksft_print_msg("Failed to reclaim all of the requested memory\n");
>> +               goto out;
>> +       }
>> +
>> +       gettimeofday(&start, NULL);
>> +       /* zswpin */
>> +       for (int i = 0; i < memsize; i += pagesize) {
>> +               if (memcmp(&mem[i], buf, pagesize)) {
>> +                       ksft_print_msg("invalid memory\n");
>> +                       goto out;
>> +               }
>> +       }
>> +       gettimeofday(&end, NULL);
>> +       printf ("zswapin took %fms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
>> +       ret = 0;
>> +out:
>> +       free(mem);
>> +       return ret;
>> +}
>> +
>> +static int test_zswapin_perf(const char *root)
>> +{
>> +       int ret = KSFT_FAIL;
>> +       char *test_group;
>> +
>> +       test_group = cg_name(root, "zswapin_perf_test");
>> +       if (!test_group)
>> +               goto out;
>> +       if (cg_create(test_group))
>> +               goto out;
>> +
>> +       if (cg_run(test_group, zswapin_perf, NULL))
>> +               goto out;
>> +
>> +       ret = KSFT_PASS;
>> +out:
>> +       cg_destroy(test_group);
>> +       free(test_group);
>> +       return ret;
>> +}
>> +
>>  /*
>>   * When trying to store a memcg page in zswap, if the memcg hits its memory
>>   * limit in zswap, writeback should affect only the zswapped pages of that
>> @@ -584,6 +654,7 @@ struct zswap_test {
>>         T(test_zswapin),
>>         T(test_zswap_writeback_enabled),
>>         T(test_zswap_writeback_disabled),
>> +       T(test_zswapin_perf),
>>         T(test_no_kmem_bypass),
>>         T(test_no_invasive_cgroup_shrink),
>>  };
>>
>> [1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
>> [2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
>> [3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
>> [4] https://lwn.net/Articles/955575/
>>
>> Usama Arif (4):
>>   mm/zswap: skip swapcache for swapping in zswap pages
>>   mm/zswap: modify zswap_decompress to accept page instead of folio
>>   mm/zswap: add support for large folio zswapin
>>   mm/zswap: count successful large folio zswap loads
>>
>>  Documentation/admin-guide/mm/transhuge.rst |   3 +
>>  include/linux/huge_mm.h                    |   1 +
>>  include/linux/zswap.h                      |   6 ++
>>  mm/huge_memory.c                           |   3 +
>>  mm/memory.c                                |  16 +--
>>  mm/page_io.c                               |   2 +-
>>  mm/zswap.c                                 | 120 ++++++++++++++-------
>>  7 files changed, 99 insertions(+), 52 deletions(-)
>>
>> --
>> 2.43.5
>>
> 
> Thanks
> barry

Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Usama Arif 1 month ago

On 21/10/2024 11:40, Usama Arif wrote:
> 
> 
> On 21/10/2024 06:09, Barry Song wrote:
>> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>
>>> After large folio zswapout support added in [1], this patch adds
>>> support for zswapin of large folios to bring it on par with zram.
>>> This series makes sure that the benefits of large folios (fewer
>>> page faults, batched PTE and rmap manipulation, reduced lru list,
>>> TLB coalescing (for arm64 and amd)) are not lost at swap out when
>>> using zswap.
>>>
>>> It builds on top of [2] which added large folio swapin support for
>>> zram and provides the same level of large folio swapin support as
>>> zram, i.e. only supporting swap count == 1.
>>>
>>> Patch 1 skips swapcache for swapping in zswap pages, this should improve
>>> no readahead swapin performance [3], and also allows us to build on large
>>> folio swapin support added in [2], hence is a prerequisite for patch 3.
>>>
>>> Patch 3 adds support for large folio zswapin. This patch does not add
>>> support for hybrid backends (i.e. folios partly present swap and zswap).
>>>
>>> The main performance benefit comes from maintaining large folios *after*
>>> swapin, large folio performance improvements have been mentioned in previous
>>> series posted on it [2],[4], so have not added those. Below is a simple
>>> microbenchmark to measure the time needed *for* zswpin of 1G memory (along
>>> with memory integrity check).
>>>
>>>                                 |  no mTHP (ms) | 1M mTHP enabled (ms)
>>> Base kernel                     |   1165        |    1163
>>> Kernel with mTHP zswpin series  |   1203        |     738
>>
>> Hi Usama,
>> Do you know where this minor regression for non-mTHP comes from?
>> As you even have skipped swapcache for small folios in zswap in patch1,
>> that part should have some gain? is it because of zswap_present_test()?
>>
> 
> Hi Barry,
> 
> The microbenchmark does a sequential read of 1G of memory, so it probably
> isnt very representative of real world usecases. This also means that
> swap_vma_readahead is able to readahead accurately all pages in its window.
> With this patch series, if doing 4K swapin, you get 1G/4K calls of fast
> do_swap_page. Without this patch, you get 1G/(4K*readahead window) of slow
> do_swap_page calls. I had added some prints and I was seeing 8 pages being
> readahead in 1 do_swap_page. The larger number of calls causes the slight
> regression (eventhough they are quite fast). I think in a realistic scenario,
> where readahead window wont be as large, there wont be a regression.
> The cost of zswap_present_test in the whole call stack of swapping page is
> very low and I think can be ignored.
> 
> I think the more interesting thing is what Kanchana pointed out in
> https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
> I am curious, did you see this when testing large folio swapin and compression
> at 4K granuality? Its looks like swap thrashing so I think it would be common
> between zswap and zram. I dont have larger granuality zswap compression done,
> which is why I think there is a regression in time taken. (It could be because
> its tested on intel as well).
> 
> Thanks,
> Usama
> 

Hi,

So I have been doing some benchmarking after Kanchana pointed out in [1] a
performance regression when swapping in large folios. I would love to get
thoughts from the zram folks on this, as that's where large folio swapin was
first added [2]. As far as I can see, the current support in zram does large
folio swapin with compression at 4K granularity. The large-granularity
compression in [3], which was posted in March, is not merged, so I am
currently comparing upstream zram with this series.

With the microbenchmark below, which times 1G of swapin, this series gives a
very large performance improvement. I think similar numbers would be seen in zram.

But when doing a kernel build test, Kanchana saw a regression in [1]. I believe
it's because of swap thrashing (causing heavy zswap activity) due to larger folio
swapin. The part of the code that decides to swap in a large folio is the same
for zswap and zram, so I believe this would be observed in zram as well.

My initial thought was that this might be because it was tested on Intel, where
you don't have the advantage of TLB coalescing, so I tested on AMD and ARM as
well. The regression is there on AMD and ARM too, though a bit smaller (numbers
added below).

The numbers show that zswap activity increases and page faults decrease.
Overall this results in sys time increasing and real time increasing slightly,
likely because the cost of the increased zswap activity outweighs the benefit
of fewer page faults.
I can see in [3] that page faults were reduced in zram as well.

Large folio swapin shows good numbers in microbenchmarks that only target
reduced page faults and sequential swapin, but not in the kernel build test.
Is a similar regression observed with zram when enabling large folio swapin
for a kernel build test? Maybe large folio swapin makes more sense for
workloads where mappings are kept around for a longer time?


Kernel build numbers in cgroup with memory.max=4G to trigger zswap
Command for AMD: make defconfig; time make -j$(nproc) bzImage
Command for ARM: make defconfig; time make -j$(nproc) Image
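
A rough sketch of the setup (the cgroup name is arbitrary, and the mTHP sizes
were enabled per the rows below; cgroup v2 assumed):

  # create the test cgroup and cap it at 4G to force (z)swap activity
  mkdir /sys/fs/cgroup/kernelbuild
  echo 4G > /sys/fs/cgroup/kernelbuild/memory.max
  # enable the mTHP sizes under test, e.g. 16K and 32K
  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-16kB/enabled
  echo always > /sys/kernel/mm/transparent_hugepage/hugepages-32kB/enabled
  # run the build from inside the cgroup
  echo $$ > /sys/fs/cgroup/kernelbuild/cgroup.procs
  make defconfig && time make -j$(nproc) bzImage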


AMD 16K+32K THP=always
metric         mm-unstable      mm-unstable + large folio zswapin series
real           1m23.038s        1m23.050s
user           53m57.210s       53m53.437s
sys            7m24.592s        7m48.843s
zswpin         612070           999244
zswpout        2226403          2347979
pgfault        20667366         20481728
pgmajfault     385887           269117

AMD 16K+32K+64K THP=always
metric         mm-unstable      mm-unstable + large folio zswapin series
real           1m22.975s        1m23.266s
user           53m51.302s       53m51.069s
sys            7m40.168s        7m57.104s
zswpin         676492           1258573
zswpout        2449839          2714767
pgfault        17540746         17296555
pgmajfault     429629           307495
--------------------------
ARM 16K+32K THP=always
metric         mm-unstable      mm-unstable + large folio zswapin series
real           0m51.168s        0m52.086s
user           25m14.715s       25m15.765s
sys            17m18.856s       18m8.031s
zswpin         3904129          7339245
zswpout        11171295         13473461
pgfault        37313345         36011338
pgmajfault     2726253          1932642


ARM 16K+32K+64K THP=always
metric         mm-unstable      mm-unstable + large folio zswapin series
real           0m52.017s        0m53.828s
user           25m2.742s        25m0.046s
sys            18m24.525s       20m26.207s
zswpin         4853571          8908664
zswpout        12297199         15768764
pgfault        32158152         30425519
pgmajfault     3320717          2237015


Thanks!
Usama


[1] https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
[2] https://lore.kernel.org/all/20240821074541.516249-3-hanchuanhua@oppo.com/
[3] https://lore.kernel.org/all/20240327214816.31191-1-21cnbao@gmail.com/

> 
>>>
>>> The time measured was pretty consistent between runs (~1-2% variation).
>>> There is 36% improvement in zswapin time with 1M folios. The percentage
>>> improvement is likely to be more if the memcmp is removed.
>>>
>>> diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
>>> index 40de679248b8..77068c577c86 100644
>>> --- a/tools/testing/selftests/cgroup/test_zswap.c
>>> +++ b/tools/testing/selftests/cgroup/test_zswap.c
>>> @@ -9,6 +9,8 @@
>>>  #include <string.h>
>>>  #include <sys/wait.h>
>>>  #include <sys/mman.h>
>>> +#include <sys/time.h>
>>> +#include <malloc.h>
>>>
>>>  #include "../kselftest.h"
>>>  #include "cgroup_util.h"
>>> @@ -407,6 +409,74 @@ static int test_zswap_writeback_disabled(const char *root)
>>>         return test_zswap_writeback(root, false);
>>>  }
>>>
>>> +static int zswapin_perf(const char *cgroup, void *arg)
>>> +{
>>> +       long pagesize = sysconf(_SC_PAGESIZE);
>>> +       size_t memsize = MB(1*1024);
>>> +       char buf[pagesize];
>>> +       int ret = -1;
>>> +       char *mem;
>>> +       struct timeval start, end;
>>> +
>>> +       mem = (char *)memalign(2*1024*1024, memsize);
>>> +       if (!mem)
>>> +               return ret;
>>> +
>>> +       /*
>>> +        * Fill half of each page with increasing data, and keep other
>>> +        * half empty, this will result in data that is still compressible
>>> +        * and ends up in zswap, with material zswap usage.
>>> +        */
>>> +       for (int i = 0; i < pagesize; i++)
>>> +               buf[i] = i < pagesize/2 ? (char) i : 0;
>>> +
>>> +       for (int i = 0; i < memsize; i += pagesize)
>>> +               memcpy(&mem[i], buf, pagesize);
>>> +
>>> +       /* Try and reclaim allocated memory */
>>> +       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
>>> +               ksft_print_msg("Failed to reclaim all of the requested memory\n");
>>> +               goto out;
>>> +       }
>>> +
>>> +       gettimeofday(&start, NULL);
>>> +       /* zswpin */
>>> +       for (int i = 0; i < memsize; i += pagesize) {
>>> +               if (memcmp(&mem[i], buf, pagesize)) {
>>> +                       ksft_print_msg("invalid memory\n");
>>> +                       goto out;
>>> +               }
>>> +       }
>>> +       gettimeofday(&end, NULL);
>>> +       printf ("zswapin took %fms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
>>> +       ret = 0;
>>> +out:
>>> +       free(mem);
>>> +       return ret;
>>> +}
>>> +
>>> +static int test_zswapin_perf(const char *root)
>>> +{
>>> +       int ret = KSFT_FAIL;
>>> +       char *test_group;
>>> +
>>> +       test_group = cg_name(root, "zswapin_perf_test");
>>> +       if (!test_group)
>>> +               goto out;
>>> +       if (cg_create(test_group))
>>> +               goto out;
>>> +
>>> +       if (cg_run(test_group, zswapin_perf, NULL))
>>> +               goto out;
>>> +
>>> +       ret = KSFT_PASS;
>>> +out:
>>> +       cg_destroy(test_group);
>>> +       free(test_group);
>>> +       return ret;
>>> +}
>>> +
>>>  /*
>>>   * When trying to store a memcg page in zswap, if the memcg hits its memory
>>>   * limit in zswap, writeback should affect only the zswapped pages of that
>>> @@ -584,6 +654,7 @@ struct zswap_test {
>>>         T(test_zswapin),
>>>         T(test_zswap_writeback_enabled),
>>>         T(test_zswap_writeback_disabled),
>>> +       T(test_zswapin_perf),
>>>         T(test_no_kmem_bypass),
>>>         T(test_no_invasive_cgroup_shrink),
>>>  };
>>>
>>> [1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
>>> [2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
>>> [3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
>>> [4] https://lwn.net/Articles/955575/
>>>
>>> Usama Arif (4):
>>>   mm/zswap: skip swapcache for swapping in zswap pages
>>>   mm/zswap: modify zswap_decompress to accept page instead of folio
>>>   mm/zswap: add support for large folio zswapin
>>>   mm/zswap: count successful large folio zswap loads
>>>
>>>  Documentation/admin-guide/mm/transhuge.rst |   3 +
>>>  include/linux/huge_mm.h                    |   1 +
>>>  include/linux/zswap.h                      |   6 ++
>>>  mm/huge_memory.c                           |   3 +
>>>  mm/memory.c                                |  16 +--
>>>  mm/page_io.c                               |   2 +-
>>>  mm/zswap.c                                 | 120 ++++++++++++++-------
>>>  7 files changed, 99 insertions(+), 52 deletions(-)
>>>
>>> --
>>> 2.43.5
>>>
>>
>> Thanks
>> barry
> 

Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Barry Song 1 month ago
On Wed, Oct 23, 2024 at 4:26 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 21/10/2024 11:40, Usama Arif wrote:
> >
> >
> > On 21/10/2024 06:09, Barry Song wrote:
> >> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>
> >>> After large folio zswapout support added in [1], this patch adds
> >>> support for zswapin of large folios to bring it on par with zram.
> >>> This series makes sure that the benefits of large folios (fewer
> >>> page faults, batched PTE and rmap manipulation, reduced lru list,
> >>> TLB coalescing (for arm64 and amd)) are not lost at swap out when
> >>> using zswap.
> >>>
> >>> It builds on top of [2] which added large folio swapin support for
> >>> zram and provides the same level of large folio swapin support as
> >>> zram, i.e. only supporting swap count == 1.
> >>>
> >>> Patch 1 skips swapcache for swapping in zswap pages, this should improve
> >>> no readahead swapin performance [3], and also allows us to build on large
> >>> folio swapin support added in [2], hence is a prerequisite for patch 3.
> >>>
> >>> Patch 3 adds support for large folio zswapin. This patch does not add
> >>> support for hybrid backends (i.e. folios partly present swap and zswap).
> >>>
> >>> The main performance benefit comes from maintaining large folios *after*
> >>> swapin, large folio performance improvements have been mentioned in previous
> >>> series posted on it [2],[4], so have not added those. Below is a simple
> >>> microbenchmark to measure the time needed *for* zswpin of 1G memory (along
> >>> with memory integrity check).
> >>>
> >>>                                 |  no mTHP (ms) | 1M mTHP enabled (ms)
> >>> Base kernel                     |   1165        |    1163
> >>> Kernel with mTHP zswpin series  |   1203        |     738
> >>
> >> Hi Usama,
> >> Do you know where this minor regression for non-mTHP comes from?
> >> As you even have skipped swapcache for small folios in zswap in patch1,
> >> that part should have some gain? is it because of zswap_present_test()?
> >>
> >
> > Hi Barry,
> >
> > The microbenchmark does a sequential read of 1G of memory, so it probably
> > isnt very representative of real world usecases. This also means that
> > swap_vma_readahead is able to readahead accurately all pages in its window.
> > With this patch series, if doing 4K swapin, you get 1G/4K calls of fast
> > do_swap_page. Without this patch, you get 1G/(4K*readahead window) of slow
> > do_swap_page calls. I had added some prints and I was seeing 8 pages being
> > readahead in 1 do_swap_page. The larger number of calls causes the slight
> > regression (eventhough they are quite fast). I think in a realistic scenario,
> > where readahead window wont be as large, there wont be a regression.
> > The cost of zswap_present_test in the whole call stack of swapping page is
> > very low and I think can be ignored.
> >
> > I think the more interesting thing is what Kanchana pointed out in
> > https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
> > I am curious, did you see this when testing large folio swapin and compression
> > at 4K granuality? Its looks like swap thrashing so I think it would be common
> > between zswap and zram. I dont have larger granuality zswap compression done,
> > which is why I think there is a regression in time taken. (It could be because
> > its tested on intel as well).
> >
> > Thanks,
> > Usama
> >
>
> Hi,
>
> So I have been doing some benchmarking after Kanchana pointed out a performance
> regression in [1] of swapping in large folio. I would love to get thoughts from
> zram folks on this, as thats where large folio swapin was first added [2].
> As far as I can see, the current support in zram is doing large folio swapin
> at 4K granuality. The large granuality compression in [3] which was posted
> in March is not merged, so I am currently comparing upstream zram with this series.
>
> With the microbenchmark below of timing 1G swapin, there was a very large improvement
> in performance by using this series. I think similar numbers would be seen in zram.

Imagine running several apps on a phone and switching
between them: A → B → C → D → E … → A → B … The app
currently on the screen retains its memory, while the ones
sent to the background are swapped out. When we bring
those apps back to the foreground, their memory is restored.
This behavior is quite similar to what you're seeing with
your microbenchmark.

>
> But when doing kernel build test, Kanchana saw a regression in [1]. I believe
> its because of swap thrashing (causing large zswap activity), due to larger page swapin.
> The part of the code that decides large folio swapin is the same between zswap and zram,
> so I believe this would be observed in zram as well.

Is this an extreme case where the workload's working set far
exceeds the available memory due to the memcg limit? I doubt mTHP
would provide any real benefit from the start if the workload is bound to
experience swap thrashing. What if we disable mTHP entirely?
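
For a quick A/B check, mTHP can be turned off entirely via the usual sysfs
knobs, e.g. (assuming the standard transparent_hugepage sysfs interface):

  # global policy; per-size controls set to "inherit" follow this one
  echo never > /sys/kernel/mm/transparent_hugepage/enabled
  # per-size controls not set to "inherit" need to be disabled individually
  echo never > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled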

>
> My initial thought was this might be because its intel, where you dont have the advantage
> of TLB coalescing, so tested on AMD and ARM, but the regression is there on AMD
> and ARM as well, though a bit less (have added the numbers below).
>
> The numbers show that the zswap activity increases and page faults decrease.
> Overall this does result in sys time increasing and real time slightly increases,
> likely because the cost of increased zswap activity is more than the benefit of
> lower page faults.
> I can see in [3] that pagefaults reduced in zram as well.
>
> Large folio swapin shows good numbers in microbenchmarks that just target reduce page
> faults and sequential swapin only, but not in kernel build test. Is a similar regression
> observed with zram when enabling large folio swapin on kernel build test? Maybe large
> folio swapin makes more sense on workloads where mappings are kept for a longer time?
>

I suspect this is because mTHP doesn't always benefit workloads
when available memory is quite limited compared to the working set.
In that case, mTHP swap-in might introduce more features that
exacerbate the problem. We used to have an extra control "swapin_enabled"
for swap-in, but it never gained much traction:
https://lore.kernel.org/linux-mm/20240726094618.401593-5-21cnbao@gmail.com/
We can reconsider whether to include the knob, but if it's better
to disable mTHP entirely for these cases, we can still adhere to
the policy of "enabled".

Using large block compression and decompression in zRAM will
significantly reduce CPU usage, likely making the issue unnoticeable.
However, the default minimum size for large block support is currently
set to 64KB (ZSMALLOC_MULTI_PAGES_ORDER = 4).

>
> Kernel build numbers in cgroup with memory.max=4G to trigger zswap
> Command for AMD: make defconfig; time make -j$(nproc) bzImage
> Command for ARM: make defconfig; time make -j$(nproc) Image
>
>
> AMD 16K+32K THP=always
> metric         mm-unstable      mm-unstable + large folio zswapin series
> real           1m23.038s        1m23.050s
> user           53m57.210s       53m53.437s
> sys            7m24.592s        7m48.843s
> zswpin         612070           999244
> zswpout        2226403          2347979
> pgfault        20667366         20481728
> pgmajfault     385887           269117
>
> AMD 16K+32K+64K THP=always
> metric         mm-unstable      mm-unstable + large folio zswapin series
> real           1m22.975s        1m23.266s
> user           53m51.302s       53m51.069s
> sys            7m40.168s        7m57.104s
> zswpin         676492           1258573
> zswpout        2449839          2714767
> pgfault        17540746         17296555
> pgmajfault     429629           307495
> --------------------------
> ARM 16K+32K THP=always
> metric         mm-unstable      mm-unstable + large folio zswapin series
> real           0m51.168s        0m52.086s
> user           25m14.715s       25m15.765s
> sys            17m18.856s       18m8.031s
> zswpin         3904129          7339245
> zswpout        11171295         13473461
> pgfault        37313345         36011338
> pgmajfault     2726253          1932642
>
>
> ARM 16K+32K+64K THP=always
> metric         mm-unstable      mm-unstable + large folio zswapin series
> real           0m52.017s        0m53.828s
> user           25m2.742s        25m0.046s
> sys            18m24.525s       20m26.207s
> zswpin         4853571          8908664
> zswpout        12297199         15768764
> pgfault        32158152         30425519
> pgmajfault     3320717          2237015
>
>
> Thanks!
> Usama
>
>
> [1] https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
> [2] https://lore.kernel.org/all/20240821074541.516249-3-hanchuanhua@oppo.com/
> [3] https://lore.kernel.org/all/20240327214816.31191-1-21cnbao@gmail.com/
>
> >
> >>>
> >>> The time measured was pretty consistent between runs (~1-2% variation).
> >>> There is 36% improvement in zswapin time with 1M folios. The percentage
> >>> improvement is likely to be more if the memcmp is removed.
> >>>
> >>> diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
> >>> index 40de679248b8..77068c577c86 100644
> >>> --- a/tools/testing/selftests/cgroup/test_zswap.c
> >>> +++ b/tools/testing/selftests/cgroup/test_zswap.c
> >>> @@ -9,6 +9,8 @@
> >>>  #include <string.h>
> >>>  #include <sys/wait.h>
> >>>  #include <sys/mman.h>
> >>> +#include <sys/time.h>
> >>> +#include <malloc.h>
> >>>
> >>>  #include "../kselftest.h"
> >>>  #include "cgroup_util.h"
> >>> @@ -407,6 +409,74 @@ static int test_zswap_writeback_disabled(const char *root)
> >>>         return test_zswap_writeback(root, false);
> >>>  }
> >>>
> >>> +static int zswapin_perf(const char *cgroup, void *arg)
> >>> +{
> >>> +       long pagesize = sysconf(_SC_PAGESIZE);
> >>> +       size_t memsize = MB(1*1024);
> >>> +       char buf[pagesize];
> >>> +       int ret = -1;
> >>> +       char *mem;
> >>> +       struct timeval start, end;
> >>> +
> >>> +       mem = (char *)memalign(2*1024*1024, memsize);
> >>> +       if (!mem)
> >>> +               return ret;
> >>> +
> >>> +       /*
> >>> +        * Fill half of each page with increasing data, and keep other
> >>> +        * half empty, this will result in data that is still compressible
> >>> +        * and ends up in zswap, with material zswap usage.
> >>> +        */
> >>> +       for (int i = 0; i < pagesize; i++)
> >>> +               buf[i] = i < pagesize/2 ? (char) i : 0;
> >>> +
> >>> +       for (int i = 0; i < memsize; i += pagesize)
> >>> +               memcpy(&mem[i], buf, pagesize);
> >>> +
> >>> +       /* Try and reclaim allocated memory */
> >>> +       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
> >>> +               ksft_print_msg("Failed to reclaim all of the requested memory\n");
> >>> +               goto out;
> >>> +       }
> >>> +
> >>> +       gettimeofday(&start, NULL);
> >>> +       /* zswpin */
> >>> +       for (int i = 0; i < memsize; i += pagesize) {
> >>> +               if (memcmp(&mem[i], buf, pagesize)) {
> >>> +                       ksft_print_msg("invalid memory\n");
> >>> +                       goto out;
> >>> +               }
> >>> +       }
> >>> +       gettimeofday(&end, NULL);
> >>> +       printf ("zswapin took %fms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
> >>> +       ret = 0;
> >>> +out:
> >>> +       free(mem);
> >>> +       return ret;
> >>> +}
> >>> +
> >>> +static int test_zswapin_perf(const char *root)
> >>> +{
> >>> +       int ret = KSFT_FAIL;
> >>> +       char *test_group;
> >>> +
> >>> +       test_group = cg_name(root, "zswapin_perf_test");
> >>> +       if (!test_group)
> >>> +               goto out;
> >>> +       if (cg_create(test_group))
> >>> +               goto out;
> >>> +
> >>> +       if (cg_run(test_group, zswapin_perf, NULL))
> >>> +               goto out;
> >>> +
> >>> +       ret = KSFT_PASS;
> >>> +out:
> >>> +       cg_destroy(test_group);
> >>> +       free(test_group);
> >>> +       return ret;
> >>> +}
> >>> +
> >>>  /*
> >>>   * When trying to store a memcg page in zswap, if the memcg hits its memory
> >>>   * limit in zswap, writeback should affect only the zswapped pages of that
> >>> @@ -584,6 +654,7 @@ struct zswap_test {
> >>>         T(test_zswapin),
> >>>         T(test_zswap_writeback_enabled),
> >>>         T(test_zswap_writeback_disabled),
> >>> +       T(test_zswapin_perf),
> >>>         T(test_no_kmem_bypass),
> >>>         T(test_no_invasive_cgroup_shrink),
> >>>  };
> >>>
> >>> [1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
> >>> [2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
> >>> [3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
> >>> [4] https://lwn.net/Articles/955575/
> >>>
> >>> Usama Arif (4):
> >>>   mm/zswap: skip swapcache for swapping in zswap pages
> >>>   mm/zswap: modify zswap_decompress to accept page instead of folio
> >>>   mm/zswap: add support for large folio zswapin
> >>>   mm/zswap: count successful large folio zswap loads
> >>>
> >>>  Documentation/admin-guide/mm/transhuge.rst |   3 +
> >>>  include/linux/huge_mm.h                    |   1 +
> >>>  include/linux/zswap.h                      |   6 ++
> >>>  mm/huge_memory.c                           |   3 +
> >>>  mm/memory.c                                |  16 +--
> >>>  mm/page_io.c                               |   2 +-
> >>>  mm/zswap.c                                 | 120 ++++++++++++++-------
> >>>  7 files changed, 99 insertions(+), 52 deletions(-)
> >>>
> >>> --
> >>> 2.43.5
> >>>
> >>

Thanks
Barry
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Usama Arif 1 month ago

On 22/10/2024 21:46, Barry Song wrote:
> On Wed, Oct 23, 2024 at 4:26 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>>
>>
>> On 21/10/2024 11:40, Usama Arif wrote:
>>>
>>>
>>> On 21/10/2024 06:09, Barry Song wrote:
>>>> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>>
>>>>> After large folio zswapout support added in [1], this patch adds
>>>>> support for zswapin of large folios to bring it on par with zram.
>>>>> This series makes sure that the benefits of large folios (fewer
>>>>> page faults, batched PTE and rmap manipulation, reduced lru list,
>>>>> TLB coalescing (for arm64 and amd)) are not lost at swap out when
>>>>> using zswap.
>>>>>
>>>>> It builds on top of [2] which added large folio swapin support for
>>>>> zram and provides the same level of large folio swapin support as
>>>>> zram, i.e. only supporting swap count == 1.
>>>>>
>>>>> Patch 1 skips swapcache for swapping in zswap pages, this should improve
>>>>> no readahead swapin performance [3], and also allows us to build on large
>>>>> folio swapin support added in [2], hence is a prerequisite for patch 3.
>>>>>
>>>>> Patch 3 adds support for large folio zswapin. This patch does not add
>>>>> support for hybrid backends (i.e. folios partly present swap and zswap).
>>>>>
>>>>> The main performance benefit comes from maintaining large folios *after*
>>>>> swapin, large folio performance improvements have been mentioned in previous
>>>>> series posted on it [2],[4], so have not added those. Below is a simple
>>>>> microbenchmark to measure the time needed *for* zswpin of 1G memory (along
>>>>> with memory integrity check).
>>>>>
>>>>>                                 |  no mTHP (ms) | 1M mTHP enabled (ms)
>>>>> Base kernel                     |   1165        |    1163
>>>>> Kernel with mTHP zswpin series  |   1203        |     738
>>>>
>>>> Hi Usama,
>>>> Do you know where this minor regression for non-mTHP comes from?
>>>> As you even have skipped swapcache for small folios in zswap in patch1,
>>>> that part should have some gain? is it because of zswap_present_test()?
>>>>
>>>
>>> Hi Barry,
>>>
>>> The microbenchmark does a sequential read of 1G of memory, so it probably
>>> isnt very representative of real world usecases. This also means that
>>> swap_vma_readahead is able to readahead accurately all pages in its window.
>>> With this patch series, if doing 4K swapin, you get 1G/4K calls of fast
>>> do_swap_page. Without this patch, you get 1G/(4K*readahead window) of slow
>>> do_swap_page calls. I had added some prints and I was seeing 8 pages being
>>> readahead in 1 do_swap_page. The larger number of calls causes the slight
>>> regression (eventhough they are quite fast). I think in a realistic scenario,
>>> where readahead window wont be as large, there wont be a regression.
>>> The cost of zswap_present_test in the whole call stack of swapping page is
>>> very low and I think can be ignored.
>>>
>>> I think the more interesting thing is what Kanchana pointed out in
>>> https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
>>> I am curious, did you see this when testing large folio swapin and compression
>>> at 4K granuality? Its looks like swap thrashing so I think it would be common
>>> between zswap and zram. I dont have larger granuality zswap compression done,
>>> which is why I think there is a regression in time taken. (It could be because
>>> its tested on intel as well).
>>>
>>> Thanks,
>>> Usama
>>>
>>
>> Hi,
>>
>> So I have been doing some benchmarking after Kanchana pointed out a performance
>> regression in [1] of swapping in large folio. I would love to get thoughts from
>> zram folks on this, as thats where large folio swapin was first added [2].
>> As far as I can see, the current support in zram is doing large folio swapin
>> at 4K granuality. The large granuality compression in [3] which was posted
>> in March is not merged, so I am currently comparing upstream zram with this series.
>>
>> With the microbenchmark below of timing 1G swapin, there was a very large improvement
>> in performance by using this series. I think similar numbers would be seen in zram.
> 
> Imagine running several apps on a phone and switching
> between them: A → B → C → D → E … → A → B … The app
> currently on the screen retains its memory, while the ones
> sent to the background are swapped out. When we bring
> those apps back to the foreground, their memory is restored.
> This behavior is quite similar to what you're seeing with
> your microbenchmark.
> 

Hi Barry,

Thanks for explaining this! Do you know if there is some open source benchmark
we could use to show an improvement in app switching with large folios?

Also I guess swap thrashing can happen when apps are brought back to the foreground?

>>
>> But when doing kernel build test, Kanchana saw a regression in [1]. I believe
>> its because of swap thrashing (causing large zswap activity), due to larger page swapin.
>> The part of the code that decides large folio swapin is the same between zswap and zram,
>> so I believe this would be observed in zram as well.
> 
> Is this an extreme case where the workload's working set far
> exceeds the available memory by memcg limitation? I doubt mTHP
> would provide any real benefit from the start if the workload is bound to
> experience swap thrashing. What if we disable mTHP entirely?
> 

I would agree, this is an extreme case. I wanted (z)swap activity to happen,
so I limited memory.max to 4G.

mTHP is beneficial in the kernel build benchmark when going from no mTHP to 16K:

ARM make defconfig; time make -j$(nproc) Image, cgroup memory.max=4G
metric         no mTHP         16K mTHP=always
real           1m0.613s         0m52.008s                    
user           25m23.028s       25m19.488s                      
sys            25m45.466s       18m11.640s                      
zswpin         1911194          3108438                   
zswpout        6880815          9374628                   
pgfault        120430166        48976658                     
pgmajfault     1580674          2327086     
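
(Roughly: going from no mTHP to 16K cuts real time from ~60.6s to ~52.0s
(~14%) and sys time from ~25m45s to ~18m12s (~29%), at the cost of higher
zswap activity.)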




>>
>> My initial thought was this might be because its intel, where you dont have the advantage
>> of TLB coalescing, so tested on AMD and ARM, but the regression is there on AMD
>> and ARM as well, though a bit less (have added the numbers below).
>>
>> The numbers show that the zswap activity increases and page faults decrease.
>> Overall this does result in sys time increasing and real time slightly increases,
>> likely because the cost of increased zswap activity is more than the benefit of
>> lower page faults.
>> I can see in [3] that pagefaults reduced in zram as well.
>>
>> Large folio swapin shows good numbers in microbenchmarks that just target reduce page
>> faults and sequential swapin only, but not in kernel build test. Is a similar regression
>> observed with zram when enabling large folio swapin on kernel build test? Maybe large
>> folio swapin makes more sense on workloads where mappings are kept for a longer time?
>>
> 
> I suspect this is because mTHP doesn't always benefit workloads
> when available memory is quite limited compared to the working set.
> In that case, mTHP swap-in might introduce more features that
> exacerbate the problem. We used to have an extra control "swapin_enabled"
> for swap-in, but it never gained much traction:
> https://lore.kernel.org/linux-mm/20240726094618.401593-5-21cnbao@gmail.com/
> We can reconsider whether to include the knob, but if it's better
> to disable mTHP entirely for these cases, we can still adhere to
> the policy of "enabled".
> 
Yes, I think this makes sense to have. The only thing is, it's a lot of knobs!
I personally think it's already difficult to decide up to which mTHP size we
should enable (and I think this changes per workload). Adding swapin_enabled
on top of that can make things more difficult.

> Using large block compression and decompression in zRAM will
> significantly reduce CPU usage, likely making the issue unnoticeable.
> However, the default minimum size for large block support is currently
> set to 64KB(ZSMALLOC_MULTI_PAGES_ORDER = 4).
> 

I saw that the patch was sent in March, and there weren't any updates after
that. Maybe I can try to cherry-pick it and see if we can develop
large-granularity compression for zswap.

>>
>> Kernel build numbers in cgroup with memory.max=4G to trigger zswap
>> Command for AMD: make defconfig; time make -j$(nproc) bzImage
>> Command for ARM: make defconfig; time make -j$(nproc) Image
>>
>>
>> AMD 16K+32K THP=always
>> metric         mm-unstable      mm-unstable + large folio zswapin series
>> real           1m23.038s        1m23.050s
>> user           53m57.210s       53m53.437s
>> sys            7m24.592s        7m48.843s
>> zswpin         612070           999244
>> zswpout        2226403          2347979
>> pgfault        20667366         20481728
>> pgmajfault     385887           269117
>>
>> AMD 16K+32K+64K THP=always
>> metric         mm-unstable      mm-unstable + large folio zswapin series
>> real           1m22.975s        1m23.266s
>> user           53m51.302s       53m51.069s
>> sys            7m40.168s        7m57.104s
>> zswpin         676492           1258573
>> zswpout        2449839          2714767
>> pgfault        17540746         17296555
>> pgmajfault     429629           307495
>> --------------------------
>> ARM 16K+32K THP=always
>> metric         mm-unstable      mm-unstable + large folio zswapin series
>> real           0m51.168s        0m52.086s
>> user           25m14.715s       25m15.765s
>> sys            17m18.856s       18m8.031s
>> zswpin         3904129          7339245
>> zswpout        11171295         13473461
>> pgfault        37313345         36011338
>> pgmajfault     2726253          1932642
>>
>>
>> ARM 16K+32K+64K THP=always
>> metric         mm-unstable      mm-unstable + large folio zswapin series
>> real           0m52.017s        0m53.828s
>> user           25m2.742s        25m0.046s
>> sys            18m24.525s       20m26.207s
>> zswpin         4853571          8908664
>> zswpout        12297199         15768764
>> pgfault        32158152         30425519
>> pgmajfault     3320717          2237015
>>
>>
>> Thanks!
>> Usama
>>
>>
>> [1] https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
>> [2] https://lore.kernel.org/all/20240821074541.516249-3-hanchuanhua@oppo.com/
>> [3] https://lore.kernel.org/all/20240327214816.31191-1-21cnbao@gmail.com/
>>
>>>
>>>>>
>>>>> The time measured was pretty consistent between runs (~1-2% variation).
>>>>> There is 36% improvement in zswapin time with 1M folios. The percentage
>>>>> improvement is likely to be more if the memcmp is removed.
>>>>>
>>>>> diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
>>>>> index 40de679248b8..77068c577c86 100644
>>>>> --- a/tools/testing/selftests/cgroup/test_zswap.c
>>>>> +++ b/tools/testing/selftests/cgroup/test_zswap.c
>>>>> @@ -9,6 +9,8 @@
>>>>>  #include <string.h>
>>>>>  #include <sys/wait.h>
>>>>>  #include <sys/mman.h>
>>>>> +#include <sys/time.h>
>>>>> +#include <malloc.h>
>>>>>
>>>>>  #include "../kselftest.h"
>>>>>  #include "cgroup_util.h"
>>>>> @@ -407,6 +409,74 @@ static int test_zswap_writeback_disabled(const char *root)
>>>>>         return test_zswap_writeback(root, false);
>>>>>  }
>>>>>
>>>>> +static int zswapin_perf(const char *cgroup, void *arg)
>>>>> +{
>>>>> +       long pagesize = sysconf(_SC_PAGESIZE);
>>>>> +       size_t memsize = MB(1*1024);
>>>>> +       char buf[pagesize];
>>>>> +       int ret = -1;
>>>>> +       char *mem;
>>>>> +       struct timeval start, end;
>>>>> +
>>>>> +       mem = (char *)memalign(2*1024*1024, memsize);
>>>>> +       if (!mem)
>>>>> +               return ret;
>>>>> +
>>>>> +       /*
>>>>> +        * Fill half of each page with increasing data, and keep other
>>>>> +        * half empty, this will result in data that is still compressible
>>>>> +        * and ends up in zswap, with material zswap usage.
>>>>> +        */
>>>>> +       for (int i = 0; i < pagesize; i++)
>>>>> +               buf[i] = i < pagesize/2 ? (char) i : 0;
>>>>> +
>>>>> +       for (int i = 0; i < memsize; i += pagesize)
>>>>> +               memcpy(&mem[i], buf, pagesize);
>>>>> +
>>>>> +       /* Try and reclaim allocated memory */
>>>>> +       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
>>>>> +               ksft_print_msg("Failed to reclaim all of the requested memory\n");
>>>>> +               goto out;
>>>>> +       }
>>>>> +
>>>>> +       gettimeofday(&start, NULL);
>>>>> +       /* zswpin */
>>>>> +       for (int i = 0; i < memsize; i += pagesize) {
>>>>> +               if (memcmp(&mem[i], buf, pagesize)) {
>>>>> +                       ksft_print_msg("invalid memory\n");
>>>>> +                       goto out;
>>>>> +               }
>>>>> +       }
>>>>> +       gettimeofday(&end, NULL);
>>>>> +       printf ("zswapin took %fms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
>>>>> +       ret = 0;
>>>>> +out:
>>>>> +       free(mem);
>>>>> +       return ret;
>>>>> +}
>>>>> +
>>>>> +static int test_zswapin_perf(const char *root)
>>>>> +{
>>>>> +       int ret = KSFT_FAIL;
>>>>> +       char *test_group;
>>>>> +
>>>>> +       test_group = cg_name(root, "zswapin_perf_test");
>>>>> +       if (!test_group)
>>>>> +               goto out;
>>>>> +       if (cg_create(test_group))
>>>>> +               goto out;
>>>>> +
>>>>> +       if (cg_run(test_group, zswapin_perf, NULL))
>>>>> +               goto out;
>>>>> +
>>>>> +       ret = KSFT_PASS;
>>>>> +out:
>>>>> +       cg_destroy(test_group);
>>>>> +       free(test_group);
>>>>> +       return ret;
>>>>> +}
>>>>> +
>>>>>  /*
>>>>>   * When trying to store a memcg page in zswap, if the memcg hits its memory
>>>>>   * limit in zswap, writeback should affect only the zswapped pages of that
>>>>> @@ -584,6 +654,7 @@ struct zswap_test {
>>>>>         T(test_zswapin),
>>>>>         T(test_zswap_writeback_enabled),
>>>>>         T(test_zswap_writeback_disabled),
>>>>> +       T(test_zswapin_perf),
>>>>>         T(test_no_kmem_bypass),
>>>>>         T(test_no_invasive_cgroup_shrink),
>>>>>  };
>>>>>
>>>>> [1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
>>>>> [2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
>>>>> [3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
>>>>> [4] https://lwn.net/Articles/955575/
>>>>>
>>>>> Usama Arif (4):
>>>>>   mm/zswap: skip swapcache for swapping in zswap pages
>>>>>   mm/zswap: modify zswap_decompress to accept page instead of folio
>>>>>   mm/zswap: add support for large folio zswapin
>>>>>   mm/zswap: count successful large folio zswap loads
>>>>>
>>>>>  Documentation/admin-guide/mm/transhuge.rst |   3 +
>>>>>  include/linux/huge_mm.h                    |   1 +
>>>>>  include/linux/zswap.h                      |   6 ++
>>>>>  mm/huge_memory.c                           |   3 +
>>>>>  mm/memory.c                                |  16 +--
>>>>>  mm/page_io.c                               |   2 +-
>>>>>  mm/zswap.c                                 | 120 ++++++++++++++-------
>>>>>  7 files changed, 99 insertions(+), 52 deletions(-)
>>>>>
>>>>> --
>>>>> 2.43.5
>>>>>
>>>>
> 
> Thanks
> Barry

Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Barry Song 1 month ago
On Wed, Oct 23, 2024 at 10:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 22/10/2024 21:46, Barry Song wrote:
> > On Wed, Oct 23, 2024 at 4:26 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >>
> >>
> >> On 21/10/2024 11:40, Usama Arif wrote:
> >>>
> >>>
> >>> On 21/10/2024 06:09, Barry Song wrote:
> >>>> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>>>
> >>>>> After large folio zswapout support added in [1], this patch adds
> >>>>> support for zswapin of large folios to bring it on par with zram.
> >>>>> This series makes sure that the benefits of large folios (fewer
> >>>>> page faults, batched PTE and rmap manipulation, reduced lru list,
> >>>>> TLB coalescing (for arm64 and amd)) are not lost at swap out when
> >>>>> using zswap.
> >>>>>
> >>>>> It builds on top of [2] which added large folio swapin support for
> >>>>> zram and provides the same level of large folio swapin support as
> >>>>> zram, i.e. only supporting swap count == 1.
> >>>>>
> >>>>> Patch 1 skips swapcache for swapping in zswap pages, this should improve
> >>>>> no readahead swapin performance [3], and also allows us to build on large
> >>>>> folio swapin support added in [2], hence is a prerequisite for patch 3.
> >>>>>
> >>>>> Patch 3 adds support for large folio zswapin. This patch does not add
> >>>>> support for hybrid backends (i.e. folios partly present swap and zswap).
> >>>>>
> >>>>> The main performance benefit comes from maintaining large folios *after*
> >>>>> swapin, large folio performance improvements have been mentioned in previous
> >>>>> series posted on it [2],[4], so have not added those. Below is a simple
> >>>>> microbenchmark to measure the time needed *for* zswpin of 1G memory (along
> >>>>> with memory integrity check).
> >>>>>
> >>>>>                                 |  no mTHP (ms) | 1M mTHP enabled (ms)
> >>>>> Base kernel                     |   1165        |    1163
> >>>>> Kernel with mTHP zswpin series  |   1203        |     738
> >>>>
> >>>> Hi Usama,
> >>>> Do you know where this minor regression for non-mTHP comes from?
> >>>> As you even have skipped swapcache for small folios in zswap in patch1,
> >>>> that part should have some gain? is it because of zswap_present_test()?
> >>>>
> >>>
> >>> Hi Barry,
> >>>
> >>> The microbenchmark does a sequential read of 1G of memory, so it probably
> >>> isnt very representative of real world usecases. This also means that
> >>> swap_vma_readahead is able to readahead accurately all pages in its window.
> >>> With this patch series, if doing 4K swapin, you get 1G/4K calls of fast
> >>> do_swap_page. Without this patch, you get 1G/(4K*readahead window) of slow
> >>> do_swap_page calls. I had added some prints and I was seeing 8 pages being
> >>> readahead in 1 do_swap_page. The larger number of calls causes the slight
> >>> regression (eventhough they are quite fast). I think in a realistic scenario,
> >>> where readahead window wont be as large, there wont be a regression.
> >>> The cost of zswap_present_test in the whole call stack of swapping page is
> >>> very low and I think can be ignored.
> >>>
> >>> I think the more interesting thing is what Kanchana pointed out in
> >>> https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
> >>> I am curious, did you see this when testing large folio swapin and compression
> >>> at 4K granuality? Its looks like swap thrashing so I think it would be common
> >>> between zswap and zram. I dont have larger granuality zswap compression done,
> >>> which is why I think there is a regression in time taken. (It could be because
> >>> its tested on intel as well).
> >>>
> >>> Thanks,
> >>> Usama
> >>>
> >>
> >> Hi,
> >>
> >> So I have been doing some benchmarking after Kanchana pointed out a performance
> >> regression in [1] of swapping in large folio. I would love to get thoughts from
> >> zram folks on this, as thats where large folio swapin was first added [2].
> >> As far as I can see, the current support in zram is doing large folio swapin
> >> at 4K granuality. The large granuality compression in [3] which was posted
> >> in March is not merged, so I am currently comparing upstream zram with this series.
> >>
> >> With the microbenchmark below of timing 1G swapin, there was a very large improvement
> >> in performance by using this series. I think similar numbers would be seen in zram.
> >
> > Imagine running several apps on a phone and switching
> > between them: A → B → C → D → E … → A → B … The app
> > currently on the screen retains its memory, while the ones
> > sent to the background are swapped out. When we bring
> > those apps back to the foreground, their memory is restored.
> > This behavior is quite similar to what you're seeing with
> > your microbenchmark.
> >
>
> Hi Barry,
>
> Thanks for explaining this! Do you know if there is some open source benchmark
> we could use to show an improvement in app switching with large folios?
>

I’m fairly certain the Android team has this benchmark, but it’s not
open source.

A straightforward way to simulate this is to use a script that
cyclically launches multiple applications, such as Chrome, Firefox,
Office, PDF, and others.

for example:

launch chrome;
launch firefox;
launch youtube;
....
launch chrome;
launch firefox;
....

On Android, we have "Android activity manager 'am' command" to do that.
https://gist.github.com/tsohr/5711945

Not quite sure if other window managers have similar tools.
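
For anyone wanting to script this on a development box, a minimal sketch of
such a cycling driver is below. It is not part of this series or the Android
benchmark; the component names, cycle count and sleep interval are placeholders
that would need adjusting per device.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Illustrative only: cycle through a few apps with "am start" to mimic
 * the A -> B -> C -> ... -> A switching pattern described above.
 * The component names are hypothetical examples, not real packages.
 */
int main(void)
{
	static const char * const apps[] = {
		"am start -n com.example.browser/.MainActivity",
		"am start -n com.example.office/.MainActivity",
		"am start -n com.example.video/.PlayerActivity",
	};
	const int napps = sizeof(apps) / sizeof(apps[0]);

	for (int cycle = 0; cycle < 10; cycle++) {
		for (int i = 0; i < napps; i++) {
			if (system(apps[i]) != 0)
				fprintf(stderr, "launch failed: %s\n", apps[i]);
			/* give the app time to fault its working set back in */
			sleep(10);
		}
	}
	return 0;
}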

> Also I guess swap thrashing can happen when apps are brought back to foreground?
>

Typically, the foreground app doesn't experience much swapping,
as it is the most recently or frequently used. However, this may
not hold for very low-end phones, where memory is significantly
less than the app's working set. For instance, we can't expect a
good user experience when playing a large game that requires 8GB
of memory on a 4GB phone! :-)
And for low-end phones, we never even enable mTHP.

> >>
> >> But when doing kernel build test, Kanchana saw a regression in [1]. I believe
> >> its because of swap thrashing (causing large zswap activity), due to larger page swapin.
> >> The part of the code that decides large folio swapin is the same between zswap and zram,
> >> so I believe this would be observed in zram as well.
> >
> > Is this an extreme case where the workload's working set far
> > exceeds the available memory by memcg limitation? I doubt mTHP
> > would provide any real benefit from the start if the workload is bound to
> > experience swap thrashing. What if we disable mTHP entirely?
> >
>
> I would agree, this is an extreme case. I wanted (z)swap activity to happen so limited
> memory.max to 4G.
>
> mTHP is beneficial in kernel test benchmarking going from no mTHP to 16K:
>
> ARM make defconfig; time make -j$(nproc) Image, cgroup memory.max=4G
> metric         no mTHP         16K mTHP=always
> real           1m0.613s         0m52.008s
> user           25m23.028s       25m19.488s
> sys            25m45.466s       18m11.640s
> zswpin         1911194          3108438
> zswpout        6880815          9374628
> pgfault        120430166        48976658
> pgmajfault     1580674          2327086
>
>

Interesting! We never use a phone to build the Linux kernel, but
let me see if I can find some other machines to reproduce your data.

>
>
> >>
> >> My initial thought was this might be because its intel, where you dont have the advantage
> >> of TLB coalescing, so tested on AMD and ARM, but the regression is there on AMD
> >> and ARM as well, though a bit less (have added the numbers below).
> >>
> >> The numbers show that the zswap activity increases and page faults decrease.
> >> Overall this does result in sys time increasing and real time slightly increases,
> >> likely because the cost of increased zswap activity is more than the benefit of
> >> lower page faults.
> >> I can see in [3] that pagefaults reduced in zram as well.
> >>
> >> Large folio swapin shows good numbers in microbenchmarks that just target reduce page
> >> faults and sequential swapin only, but not in kernel build test. Is a similar regression
> >> observed with zram when enabling large folio swapin on kernel build test? Maybe large
> >> folio swapin makes more sense on workloads where mappings are kept for a longer time?
> >>
> >
> > I suspect this is because mTHP doesn't always benefit workloads
> > when available memory is quite limited compared to the working set.
> > In that case, mTHP swap-in might introduce more features that
> > exacerbate the problem. We used to have an extra control "swapin_enabled"
> > for swap-in, but it never gained much traction:
> > https://lore.kernel.org/linux-mm/20240726094618.401593-5-21cnbao@gmail.com/
> > We can reconsider whether to include the knob, but if it's better
> > to disable mTHP entirely for these cases, we can still adhere to
> > the policy of "enabled".
> >
> Yes I think this makes sense to have. The only thing is, its too many knobs!
> I personally think its already difficult to decide upto which mTHP size we
> should enable (and I think this changes per workload). But if we add swapin_enabled
> on top of that it can make things more difficult.
>
> > Using large block compression and decompression in zRAM will
> > significantly reduce CPU usage, likely making the issue unnoticeable.
> > However, the default minimum size for large block support is currently
> > set to 64KB(ZSMALLOC_MULTI_PAGES_ORDER = 4).
> >
>
> I saw that the patch was sent in March, and there werent any updates after?
> Maybe I can try and cherry-pick that and see if we can develop large
> granularity compression for zswap.

I will provide an updated version next week.

>
> >>
> >> Kernel build numbers in cgroup with memory.max=4G to trigger zswap
> >> Command for AMD: make defconfig; time make -j$(nproc) bzImage
> >> Command for ARM: make defconfig; time make -j$(nproc) Image
> >>
> >>
> >> AMD 16K+32K THP=always
> >> metric         mm-unstable      mm-unstable + large folio zswapin series
> >> real           1m23.038s        1m23.050s
> >> user           53m57.210s       53m53.437s
> >> sys            7m24.592s        7m48.843s
> >> zswpin         612070           999244
> >> zswpout        2226403          2347979
> >> pgfault        20667366         20481728
> >> pgmajfault     385887           269117
> >>
> >> AMD 16K+32K+64K THP=always
> >> metric         mm-unstable      mm-unstable + large folio zswapin series
> >> real           1m22.975s        1m23.266s
> >> user           53m51.302s       53m51.069s
> >> sys            7m40.168s        7m57.104s
> >> zswpin         676492           1258573
> >> zswpout        2449839          2714767
> >> pgfault        17540746         17296555
> >> pgmajfault     429629           307495
> >> --------------------------
> >> ARM 16K+32K THP=always
> >> metric         mm-unstable      mm-unstable + large folio zswapin series
> >> real           0m51.168s        0m52.086s
> >> user           25m14.715s       25m15.765s
> >> sys            17m18.856s       18m8.031s
> >> zswpin         3904129          7339245
> >> zswpout        11171295         13473461
> >> pgfault        37313345         36011338
> >> pgmajfault     2726253          1932642
> >>
> >>
> >> ARM 16K+32K+64K THP=always
> >> metric         mm-unstable      mm-unstable + large folio zswapin series
> >> real           0m52.017s        0m53.828s
> >> user           25m2.742s        25m0.046s
> >> sys            18m24.525s       20m26.207s
> >> zswpin         4853571          8908664
> >> zswpout        12297199         15768764
> >> pgfault        32158152         30425519
> >> pgmajfault     3320717          2237015
> >>
> >>
> >> Thanks!
> >> Usama
> >>
> >>
> >> [1] https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
> >> [2] https://lore.kernel.org/all/20240821074541.516249-3-hanchuanhua@oppo.com/
> >> [3] https://lore.kernel.org/all/20240327214816.31191-1-21cnbao@gmail.com/
> >>
> >>>
> >>>>>
> >>>>> The time measured was pretty consistent between runs (~1-2% variation).
> >>>>> There is 36% improvement in zswapin time with 1M folios. The percentage
> >>>>> improvement is likely to be more if the memcmp is removed.
> >>>>>
> >>>>> diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
> >>>>> index 40de679248b8..77068c577c86 100644
> >>>>> --- a/tools/testing/selftests/cgroup/test_zswap.c
> >>>>> +++ b/tools/testing/selftests/cgroup/test_zswap.c
> >>>>> @@ -9,6 +9,8 @@
> >>>>>  #include <string.h>
> >>>>>  #include <sys/wait.h>
> >>>>>  #include <sys/mman.h>
> >>>>> +#include <sys/time.h>
> >>>>> +#include <malloc.h>
> >>>>>
> >>>>>  #include "../kselftest.h"
> >>>>>  #include "cgroup_util.h"
> >>>>> @@ -407,6 +409,74 @@ static int test_zswap_writeback_disabled(const char *root)
> >>>>>         return test_zswap_writeback(root, false);
> >>>>>  }
> >>>>>
> >>>>> +static int zswapin_perf(const char *cgroup, void *arg)
> >>>>> +{
> >>>>> +       long pagesize = sysconf(_SC_PAGESIZE);
> >>>>> +       size_t memsize = MB(1*1024);
> >>>>> +       char buf[pagesize];
> >>>>> +       int ret = -1;
> >>>>> +       char *mem;
> >>>>> +       struct timeval start, end;
> >>>>> +
> >>>>> +       mem = (char *)memalign(2*1024*1024, memsize);
> >>>>> +       if (!mem)
> >>>>> +               return ret;
> >>>>> +
> >>>>> +       /*
> >>>>> +        * Fill half of each page with increasing data, and keep other
> >>>>> +        * half empty, this will result in data that is still compressible
> >>>>> +        * and ends up in zswap, with material zswap usage.
> >>>>> +        */
> >>>>> +       for (int i = 0; i < pagesize; i++)
> >>>>> +               buf[i] = i < pagesize/2 ? (char) i : 0;
> >>>>> +
> >>>>> +       for (int i = 0; i < memsize; i += pagesize)
> >>>>> +               memcpy(&mem[i], buf, pagesize);
> >>>>> +
> >>>>> +       /* Try and reclaim allocated memory */
> >>>>> +       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
> >>>>> +               ksft_print_msg("Failed to reclaim all of the requested memory\n");
> >>>>> +               goto out;
> >>>>> +       }
> >>>>> +
> >>>>> +       gettimeofday(&start, NULL);
> >>>>> +       /* zswpin */
> >>>>> +       for (int i = 0; i < memsize; i += pagesize) {
> >>>>> +               if (memcmp(&mem[i], buf, pagesize)) {
> >>>>> +                       ksft_print_msg("invalid memory\n");
> >>>>> +                       goto out;
> >>>>> +               }
> >>>>> +       }
> >>>>> +       gettimeofday(&end, NULL);
> >>>>> +       printf ("zswapin took %fms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
> >>>>> +       ret = 0;
> >>>>> +out:
> >>>>> +       free(mem);
> >>>>> +       return ret;
> >>>>> +}
> >>>>> +
> >>>>> +static int test_zswapin_perf(const char *root)
> >>>>> +{
> >>>>> +       int ret = KSFT_FAIL;
> >>>>> +       char *test_group;
> >>>>> +
> >>>>> +       test_group = cg_name(root, "zswapin_perf_test");
> >>>>> +       if (!test_group)
> >>>>> +               goto out;
> >>>>> +       if (cg_create(test_group))
> >>>>> +               goto out;
> >>>>> +
> >>>>> +       if (cg_run(test_group, zswapin_perf, NULL))
> >>>>> +               goto out;
> >>>>> +
> >>>>> +       ret = KSFT_PASS;
> >>>>> +out:
> >>>>> +       cg_destroy(test_group);
> >>>>> +       free(test_group);
> >>>>> +       return ret;
> >>>>> +}
> >>>>> +
> >>>>>  /*
> >>>>>   * When trying to store a memcg page in zswap, if the memcg hits its memory
> >>>>>   * limit in zswap, writeback should affect only the zswapped pages of that
> >>>>> @@ -584,6 +654,7 @@ struct zswap_test {
> >>>>>         T(test_zswapin),
> >>>>>         T(test_zswap_writeback_enabled),
> >>>>>         T(test_zswap_writeback_disabled),
> >>>>> +       T(test_zswapin_perf),
> >>>>>         T(test_no_kmem_bypass),
> >>>>>         T(test_no_invasive_cgroup_shrink),
> >>>>>  };
> >>>>>
> >>>>> [1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
> >>>>> [2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
> >>>>> [3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
> >>>>> [4] https://lwn.net/Articles/955575/
> >>>>>
> >>>>> Usama Arif (4):
> >>>>>   mm/zswap: skip swapcache for swapping in zswap pages
> >>>>>   mm/zswap: modify zswap_decompress to accept page instead of folio
> >>>>>   mm/zswap: add support for large folio zswapin
> >>>>>   mm/zswap: count successful large folio zswap loads
> >>>>>
> >>>>>  Documentation/admin-guide/mm/transhuge.rst |   3 +
> >>>>>  include/linux/huge_mm.h                    |   1 +
> >>>>>  include/linux/zswap.h                      |   6 ++
> >>>>>  mm/huge_memory.c                           |   3 +
> >>>>>  mm/memory.c                                |  16 +--
> >>>>>  mm/page_io.c                               |   2 +-
> >>>>>  mm/zswap.c                                 | 120 ++++++++++++++-------
> >>>>>  7 files changed, 99 insertions(+), 52 deletions(-)
> >>>>>
> >>>>> --
> >>>>> 2.43.5
> >>>>>
> >>>>
> >

Thanks
Barry
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Barry Song 1 month ago
On Wed, Oct 23, 2024 at 11:07 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, Oct 23, 2024 at 10:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >
> >
> >
> > On 22/10/2024 21:46, Barry Song wrote:
> > > On Wed, Oct 23, 2024 at 4:26 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > >>
> > >>
> > >>
> > >> On 21/10/2024 11:40, Usama Arif wrote:
> > >>>
> > >>>
> > >>> On 21/10/2024 06:09, Barry Song wrote:
> > >>>> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
> > >>>>>
> > >>>>> After large folio zswapout support added in [1], this patch adds
> > >>>>> support for zswapin of large folios to bring it on par with zram.
> > >>>>> This series makes sure that the benefits of large folios (fewer
> > >>>>> page faults, batched PTE and rmap manipulation, reduced lru list,
> > >>>>> TLB coalescing (for arm64 and amd)) are not lost at swap out when
> > >>>>> using zswap.
> > >>>>>
> > >>>>> It builds on top of [2] which added large folio swapin support for
> > >>>>> zram and provides the same level of large folio swapin support as
> > >>>>> zram, i.e. only supporting swap count == 1.
> > >>>>>
> > >>>>> Patch 1 skips swapcache for swapping in zswap pages, this should improve
> > >>>>> no readahead swapin performance [3], and also allows us to build on large
> > >>>>> folio swapin support added in [2], hence is a prerequisite for patch 3.
> > >>>>>
> > >>>>> Patch 3 adds support for large folio zswapin. This patch does not add
> > >>>>> support for hybrid backends (i.e. folios partly present swap and zswap).
> > >>>>>
> > >>>>> The main performance benefit comes from maintaining large folios *after*
> > >>>>> swapin, large folio performance improvements have been mentioned in previous
> > >>>>> series posted on it [2],[4], so have not added those. Below is a simple
> > >>>>> microbenchmark to measure the time needed *for* zswpin of 1G memory (along
> > >>>>> with memory integrity check).
> > >>>>>
> > >>>>>                                 |  no mTHP (ms) | 1M mTHP enabled (ms)
> > >>>>> Base kernel                     |   1165        |    1163
> > >>>>> Kernel with mTHP zswpin series  |   1203        |     738
> > >>>>
> > >>>> Hi Usama,
> > >>>> Do you know where this minor regression for non-mTHP comes from?
> > >>>> As you even have skipped swapcache for small folios in zswap in patch1,
> > >>>> that part should have some gain? is it because of zswap_present_test()?
> > >>>>
> > >>>
> > >>> Hi Barry,
> > >>>
> > >>> The microbenchmark does a sequential read of 1G of memory, so it probably
> > >>> isnt very representative of real world usecases. This also means that
> > >>> swap_vma_readahead is able to readahead accurately all pages in its window.
> > >>> With this patch series, if doing 4K swapin, you get 1G/4K calls of fast
> > >>> do_swap_page. Without this patch, you get 1G/(4K*readahead window) of slow
> > >>> do_swap_page calls. I had added some prints and I was seeing 8 pages being
> > >>> readahead in 1 do_swap_page. The larger number of calls causes the slight
> > >>> regression (eventhough they are quite fast). I think in a realistic scenario,
> > >>> where readahead window wont be as large, there wont be a regression.
> > >>> The cost of zswap_present_test in the whole call stack of swapping page is
> > >>> very low and I think can be ignored.
> > >>>
> > >>> I think the more interesting thing is what Kanchana pointed out in
> > >>> https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
> > >>> I am curious, did you see this when testing large folio swapin and compression
> > >>> at 4K granuality? Its looks like swap thrashing so I think it would be common
> > >>> between zswap and zram. I dont have larger granuality zswap compression done,
> > >>> which is why I think there is a regression in time taken. (It could be because
> > >>> its tested on intel as well).
> > >>>
> > >>> Thanks,
> > >>> Usama
> > >>>
> > >>
> > >> Hi,
> > >>
> > >> So I have been doing some benchmarking after Kanchana pointed out a performance
> > >> regression in [1] of swapping in large folio. I would love to get thoughts from
> > >> zram folks on this, as thats where large folio swapin was first added [2].
> > >> As far as I can see, the current support in zram is doing large folio swapin
> > >> at 4K granuality. The large granuality compression in [3] which was posted
> > >> in March is not merged, so I am currently comparing upstream zram with this series.
> > >>
> > >> With the microbenchmark below of timing 1G swapin, there was a very large improvement
> > >> in performance by using this series. I think similar numbers would be seen in zram.
> > >
> > > Imagine running several apps on a phone and switching
> > > between them: A → B → C → D → E … → A → B … The app
> > > currently on the screen retains its memory, while the ones
> > > sent to the background are swapped out. When we bring
> > > those apps back to the foreground, their memory is restored.
> > > This behavior is quite similar to what you're seeing with
> > > your microbenchmark.
> > >
> >
> > Hi Barry,
> >
> > Thanks for explaining this! Do you know if there is some open source benchmark
> > we could use to show an improvement in app switching with large folios?
> >
>
> I’m fairly certain the Android team has this benchmark, but it’s not
> open source.
>
> A straightforward way to simulate this is to use a script that
> cyclically launches multiple applications, such as Chrome, Firefox,
> Office, PDF, and others.
>
> for example:
>
> launch chrome;
> launch firefox;
> launch youtube;
> ....
> launch chrome;
> launch firefox;
> ....
>
> On Android, we have "Android activity manager 'am' command" to do that.
> https://gist.github.com/tsohr/5711945
>
> Not quite sure if other windows managers have similar tools.
>
> > Also I guess swap thrashing can happen when apps are brought back to foreground?
> >
>
> Typically, the foreground app doesn't experience much swapping,
> as it is the most recently or frequently used. However, this may
> not hold for very low-end phones, where memory is significantly
> less than the app's working set. For instance, we can't expect a
> good user experience when playing a large game that requires 8GB
> of memory on a 4GB phone! :-)
> And for low-end phones, we never even enable mTHP.
>
> > >>
> > >> But when doing kernel build test, Kanchana saw a regression in [1]. I believe
> > >> its because of swap thrashing (causing large zswap activity), due to larger page swapin.
> > >> The part of the code that decides large folio swapin is the same between zswap and zram,
> > >> so I believe this would be observed in zram as well.
> > >
> > > Is this an extreme case where the workload's working set far
> > > exceeds the available memory by memcg limitation? I doubt mTHP
> > > would provide any real benefit from the start if the workload is bound to
> > > experience swap thrashing. What if we disable mTHP entirely?
> > >
> >
> > I would agree, this is an extreme case. I wanted (z)swap activity to happen so limited
> > memory.max to 4G.
> >
> > mTHP is beneficial in kernel test benchmarking going from no mTHP to 16K:
> >
> > ARM make defconfig; time make -j$(nproc) Image, cgroup memory.max=4G
> > metric         no mTHP         16K mTHP=always
> > real           1m0.613s         0m52.008s
> > user           25m23.028s       25m19.488s
> > sys            25m45.466s       18m11.640s
> > zswpin         1911194          3108438
> > zswpout        6880815          9374628
> > pgfault        120430166        48976658
> > pgmajfault     1580674          2327086
> >
> >
>
> Interesting! We never use a phone to build the Linux kernel, but
> let me see if I can find some other machines to reproduce your data.

Hi Usama,

I suspect the regression occurs because you're running an edge case
where the memory cgroup stays nearly full most of the time (this isn't
an inherent issue with large folio swap-in). As a result, swapping in
mTHP quickly triggers a memcg overflow, causing a swap-out. The
next swap-in then recreates the overflow, leading to a repeating
cycle.

We need a way to stop the cup from repeatedly filling to the brim and
overflowing. While not a definitive fix, the following change might help
improve the situation:

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 17af08367c68..f2fa0eeb2d9a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
 		memcg = get_mem_cgroup_from_mm(mm);
 	rcu_read_unlock();
 
-	ret = charge_memcg(folio, memcg, gfp);
+	if (folio_test_large(folio) && mem_cgroup_margin(memcg) < MEMCG_CHARGE_BATCH)
+		ret = -ENOMEM;
+	else
+		ret = charge_memcg(folio, memcg, gfp);
 
 	css_put(&memcg->css);
 	return ret;
 }

Please confirm whether it makes the kernel build with the memcg limitation
faster. If so, let's work together to figure out an official patch :-)
The above code doesn't consider the parent memcg's overflow, so it isn't
an ideal fix.
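
As a rough illustration of how that parent gap could be closed, the margin
check could walk up the hierarchy. The sketch below is untested and the helper
name is invented; mem_cgroup_margin(), parent_mem_cgroup() and
MEMCG_CHARGE_BATCH are the existing memcontrol internals already used in the
diff above.

/*
 * Untested sketch only: fall back to 4K swap-in when any level of the
 * memcg hierarchy is close to its limit, not just the leaf memcg.
 */
static bool swapin_margin_low(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/* parent_mem_cgroup() returns NULL above the root, ending the walk */
	for (iter = memcg; iter; iter = parent_mem_cgroup(iter)) {
		if (mem_cgroup_margin(iter) < MEMCG_CHARGE_BATCH)
			return true;
	}
	return false;
}

and then in mem_cgroup_swapin_charge_folio():

	if (folio_test_large(folio) && swapin_margin_low(memcg))
		ret = -ENOMEM;
	else
		ret = charge_memcg(folio, memcg, gfp);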

>
> >
> >
> > >>
> > >> My initial thought was this might be because its intel, where you dont have the advantage
> > >> of TLB coalescing, so tested on AMD and ARM, but the regression is there on AMD
> > >> and ARM as well, though a bit less (have added the numbers below).
> > >>
> > >> The numbers show that the zswap activity increases and page faults decrease.
> > >> Overall this does result in sys time increasing and real time slightly increases,
> > >> likely because the cost of increased zswap activity is more than the benefit of
> > >> lower page faults.
> > >> I can see in [3] that pagefaults reduced in zram as well.
> > >>
> > >> Large folio swapin shows good numbers in microbenchmarks that just target reduce page
> > >> faults and sequential swapin only, but not in kernel build test. Is a similar regression
> > >> observed with zram when enabling large folio swapin on kernel build test? Maybe large
> > >> folio swapin makes more sense on workloads where mappings are kept for a longer time?
> > >>
> > >
> > > I suspect this is because mTHP doesn't always benefit workloads
> > > when available memory is quite limited compared to the working set.
> > > In that case, mTHP swap-in might introduce more features that
> > > exacerbate the problem. We used to have an extra control "swapin_enabled"
> > > for swap-in, but it never gained much traction:
> > > https://lore.kernel.org/linux-mm/20240726094618.401593-5-21cnbao@gmail.com/
> > > We can reconsider whether to include the knob, but if it's better
> > > to disable mTHP entirely for these cases, we can still adhere to
> > > the policy of "enabled".
> > >
> > Yes I think this makes sense to have. The only thing is, its too many knobs!
> > I personally think its already difficult to decide upto which mTHP size we
> > should enable (and I think this changes per workload). But if we add swapin_enabled
> > on top of that it can make things more difficult.
> >
> > > Using large block compression and decompression in zRAM will
> > > significantly reduce CPU usage, likely making the issue unnoticeable.
> > > However, the default minimum size for large block support is currently
> > > set to 64KB(ZSMALLOC_MULTI_PAGES_ORDER = 4).
> > >
> >
> > I saw that the patch was sent in March, and there werent any updates after?
> > Maybe I can try and cherry-pick that and see if we can develop large
> > granularity compression for zswap.
>
> will provide an updated version next week.
>
> >
> > >>
> > >> Kernel build numbers in cgroup with memory.max=4G to trigger zswap
> > >> Command for AMD: make defconfig; time make -j$(nproc) bzImage
> > >> Command for ARM: make defconfig; time make -j$(nproc) Image
> > >>
> > >>
> > >> AMD 16K+32K THP=always
> > >> metric         mm-unstable      mm-unstable + large folio zswapin series
> > >> real           1m23.038s        1m23.050s
> > >> user           53m57.210s       53m53.437s
> > >> sys            7m24.592s        7m48.843s
> > >> zswpin         612070           999244
> > >> zswpout        2226403          2347979
> > >> pgfault        20667366         20481728
> > >> pgmajfault     385887           269117
> > >>
> > >> AMD 16K+32K+64K THP=always
> > >> metric         mm-unstable      mm-unstable + large folio zswapin series
> > >> real           1m22.975s        1m23.266s
> > >> user           53m51.302s       53m51.069s
> > >> sys            7m40.168s        7m57.104s
> > >> zswpin         676492           1258573
> > >> zswpout        2449839          2714767
> > >> pgfault        17540746         17296555
> > >> pgmajfault     429629           307495
> > >> --------------------------
> > >> ARM 16K+32K THP=always
> > >> metric         mm-unstable      mm-unstable + large folio zswapin series
> > >> real           0m51.168s        0m52.086s
> > >> user           25m14.715s       25m15.765s
> > >> sys            17m18.856s       18m8.031s
> > >> zswpin         3904129          7339245
> > >> zswpout        11171295         13473461
> > >> pgfault        37313345         36011338
> > >> pgmajfault     2726253          1932642
> > >>
> > >>
> > >> ARM 16K+32K+64K THP=always
> > >> metric         mm-unstable      mm-unstable + large folio zswapin series
> > >> real           0m52.017s        0m53.828s
> > >> user           25m2.742s        25m0.046s
> > >> sys            18m24.525s       20m26.207s
> > >> zswpin         4853571          8908664
> > >> zswpout        12297199         15768764
> > >> pgfault        32158152         30425519
> > >> pgmajfault     3320717          2237015
> > >>
> > >>
> > >> Thanks!
> > >> Usama
> > >>
> > >>
> > >> [1] https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
> > >> [2] https://lore.kernel.org/all/20240821074541.516249-3-hanchuanhua@oppo.com/
> > >> [3] https://lore.kernel.org/all/20240327214816.31191-1-21cnbao@gmail.com/
> > >>
> > >>>
> > >>>>>
> > >>>>> The time measured was pretty consistent between runs (~1-2% variation).
> > >>>>> There is 36% improvement in zswapin time with 1M folios. The percentage
> > >>>>> improvement is likely to be more if the memcmp is removed.
> > >>>>>
> > >>>>> diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
> > >>>>> index 40de679248b8..77068c577c86 100644
> > >>>>> --- a/tools/testing/selftests/cgroup/test_zswap.c
> > >>>>> +++ b/tools/testing/selftests/cgroup/test_zswap.c
> > >>>>> @@ -9,6 +9,8 @@
> > >>>>>  #include <string.h>
> > >>>>>  #include <sys/wait.h>
> > >>>>>  #include <sys/mman.h>
> > >>>>> +#include <sys/time.h>
> > >>>>> +#include <malloc.h>
> > >>>>>
> > >>>>>  #include "../kselftest.h"
> > >>>>>  #include "cgroup_util.h"
> > >>>>> @@ -407,6 +409,74 @@ static int test_zswap_writeback_disabled(const char *root)
> > >>>>>         return test_zswap_writeback(root, false);
> > >>>>>  }
> > >>>>>
> > >>>>> +static int zswapin_perf(const char *cgroup, void *arg)
> > >>>>> +{
> > >>>>> +       long pagesize = sysconf(_SC_PAGESIZE);
> > >>>>> +       size_t memsize = MB(1*1024);
> > >>>>> +       char buf[pagesize];
> > >>>>> +       int ret = -1;
> > >>>>> +       char *mem;
> > >>>>> +       struct timeval start, end;
> > >>>>> +
> > >>>>> +       mem = (char *)memalign(2*1024*1024, memsize);
> > >>>>> +       if (!mem)
> > >>>>> +               return ret;
> > >>>>> +
> > >>>>> +       /*
> > >>>>> +        * Fill half of each page with increasing data, and keep other
> > >>>>> +        * half empty, this will result in data that is still compressible
> > >>>>> +        * and ends up in zswap, with material zswap usage.
> > >>>>> +        */
> > >>>>> +       for (int i = 0; i < pagesize; i++)
> > >>>>> +               buf[i] = i < pagesize/2 ? (char) i : 0;
> > >>>>> +
> > >>>>> +       for (int i = 0; i < memsize; i += pagesize)
> > >>>>> +               memcpy(&mem[i], buf, pagesize);
> > >>>>> +
> > >>>>> +       /* Try and reclaim allocated memory */
> > >>>>> +       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
> > >>>>> +               ksft_print_msg("Failed to reclaim all of the requested memory\n");
> > >>>>> +               goto out;
> > >>>>> +       }
> > >>>>> +
> > >>>>> +       gettimeofday(&start, NULL);
> > >>>>> +       /* zswpin */
> > >>>>> +       for (int i = 0; i < memsize; i += pagesize) {
> > >>>>> +               if (memcmp(&mem[i], buf, pagesize)) {
> > >>>>> +                       ksft_print_msg("invalid memory\n");
> > >>>>> +                       goto out;
> > >>>>> +               }
> > >>>>> +       }
> > >>>>> +       gettimeofday(&end, NULL);
> > >>>>> +       printf ("zswapin took %fms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
> > >>>>> +       ret = 0;
> > >>>>> +out:
> > >>>>> +       free(mem);
> > >>>>> +       return ret;
> > >>>>> +}
> > >>>>> +
> > >>>>> +static int test_zswapin_perf(const char *root)
> > >>>>> +{
> > >>>>> +       int ret = KSFT_FAIL;
> > >>>>> +       char *test_group;
> > >>>>> +
> > >>>>> +       test_group = cg_name(root, "zswapin_perf_test");
> > >>>>> +       if (!test_group)
> > >>>>> +               goto out;
> > >>>>> +       if (cg_create(test_group))
> > >>>>> +               goto out;
> > >>>>> +
> > >>>>> +       if (cg_run(test_group, zswapin_perf, NULL))
> > >>>>> +               goto out;
> > >>>>> +
> > >>>>> +       ret = KSFT_PASS;
> > >>>>> +out:
> > >>>>> +       cg_destroy(test_group);
> > >>>>> +       free(test_group);
> > >>>>> +       return ret;
> > >>>>> +}
> > >>>>> +
> > >>>>>  /*
> > >>>>>   * When trying to store a memcg page in zswap, if the memcg hits its memory
> > >>>>>   * limit in zswap, writeback should affect only the zswapped pages of that
> > >>>>> @@ -584,6 +654,7 @@ struct zswap_test {
> > >>>>>         T(test_zswapin),
> > >>>>>         T(test_zswap_writeback_enabled),
> > >>>>>         T(test_zswap_writeback_disabled),
> > >>>>> +       T(test_zswapin_perf),
> > >>>>>         T(test_no_kmem_bypass),
> > >>>>>         T(test_no_invasive_cgroup_shrink),
> > >>>>>  };
> > >>>>>
> > >>>>> [1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
> > >>>>> [2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
> > >>>>> [3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
> > >>>>> [4] https://lwn.net/Articles/955575/
> > >>>>>
> > >>>>> Usama Arif (4):
> > >>>>>   mm/zswap: skip swapcache for swapping in zswap pages
> > >>>>>   mm/zswap: modify zswap_decompress to accept page instead of folio
> > >>>>>   mm/zswap: add support for large folio zswapin
> > >>>>>   mm/zswap: count successful large folio zswap loads
> > >>>>>
> > >>>>>  Documentation/admin-guide/mm/transhuge.rst |   3 +
> > >>>>>  include/linux/huge_mm.h                    |   1 +
> > >>>>>  include/linux/zswap.h                      |   6 ++
> > >>>>>  mm/huge_memory.c                           |   3 +
> > >>>>>  mm/memory.c                                |  16 +--
> > >>>>>  mm/page_io.c                               |   2 +-
> > >>>>>  mm/zswap.c                                 | 120 ++++++++++++++-------
> > >>>>>  7 files changed, 99 insertions(+), 52 deletions(-)
> > >>>>>
> > >>>>> --
> > >>>>> 2.43.5
> > >>>>>
> > >>>>
> > >
>

Thanks
Barry
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Usama Arif 1 month ago

On 23/10/2024 11:26, Barry Song wrote:
> On Wed, Oct 23, 2024 at 11:07 AM Barry Song <21cnbao@gmail.com> wrote:
>>
>> On Wed, Oct 23, 2024 at 10:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>>
>>>
>>>
>>> On 22/10/2024 21:46, Barry Song wrote:
>>>> On Wed, Oct 23, 2024 at 4:26 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 21/10/2024 11:40, Usama Arif wrote:
>>>>>>
>>>>>>
>>>>>> On 21/10/2024 06:09, Barry Song wrote:
>>>>>>> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>>>>>
>>>>>>>> After large folio zswapout support added in [1], this patch adds
>>>>>>>> support for zswapin of large folios to bring it on par with zram.
>>>>>>>> This series makes sure that the benefits of large folios (fewer
>>>>>>>> page faults, batched PTE and rmap manipulation, reduced lru list,
>>>>>>>> TLB coalescing (for arm64 and amd)) are not lost at swap out when
>>>>>>>> using zswap.
>>>>>>>>
>>>>>>>> It builds on top of [2] which added large folio swapin support for
>>>>>>>> zram and provides the same level of large folio swapin support as
>>>>>>>> zram, i.e. only supporting swap count == 1.
>>>>>>>>
>>>>>>>> Patch 1 skips swapcache for swapping in zswap pages, this should improve
>>>>>>>> no readahead swapin performance [3], and also allows us to build on large
>>>>>>>> folio swapin support added in [2], hence is a prerequisite for patch 3.
>>>>>>>>
>>>>>>>> Patch 3 adds support for large folio zswapin. This patch does not add
>>>>>>>> support for hybrid backends (i.e. folios partly present swap and zswap).
>>>>>>>>
>>>>>>>> The main performance benefit comes from maintaining large folios *after*
>>>>>>>> swapin, large folio performance improvements have been mentioned in previous
>>>>>>>> series posted on it [2],[4], so have not added those. Below is a simple
>>>>>>>> microbenchmark to measure the time needed *for* zswpin of 1G memory (along
>>>>>>>> with memory integrity check).
>>>>>>>>
>>>>>>>>                                 |  no mTHP (ms) | 1M mTHP enabled (ms)
>>>>>>>> Base kernel                     |   1165        |    1163
>>>>>>>> Kernel with mTHP zswpin series  |   1203        |     738
>>>>>>>
>>>>>>> Hi Usama,
>>>>>>> Do you know where this minor regression for non-mTHP comes from?
>>>>>>> As you even have skipped swapcache for small folios in zswap in patch1,
>>>>>>> that part should have some gain? is it because of zswap_present_test()?
>>>>>>>
>>>>>>
>>>>>> Hi Barry,
>>>>>>
>>>>>> The microbenchmark does a sequential read of 1G of memory, so it probably
>>>>>> isnt very representative of real world usecases. This also means that
>>>>>> swap_vma_readahead is able to readahead accurately all pages in its window.
>>>>>> With this patch series, if doing 4K swapin, you get 1G/4K calls of fast
>>>>>> do_swap_page. Without this patch, you get 1G/(4K*readahead window) of slow
>>>>>> do_swap_page calls. I had added some prints and I was seeing 8 pages being
>>>>>> readahead in 1 do_swap_page. The larger number of calls causes the slight
>>>>>> regression (eventhough they are quite fast). I think in a realistic scenario,
>>>>>> where readahead window wont be as large, there wont be a regression.
>>>>>> The cost of zswap_present_test in the whole call stack of swapping page is
>>>>>> very low and I think can be ignored.
>>>>>>
>>>>>> I think the more interesting thing is what Kanchana pointed out in
>>>>>> https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
>>>>>> I am curious, did you see this when testing large folio swapin and compression
>>>>>> at 4K granuality? Its looks like swap thrashing so I think it would be common
>>>>>> between zswap and zram. I dont have larger granuality zswap compression done,
>>>>>> which is why I think there is a regression in time taken. (It could be because
>>>>>> its tested on intel as well).
>>>>>>
>>>>>> Thanks,
>>>>>> Usama
>>>>>>
>>>>>
>>>>> Hi,
>>>>>
>>>>> So I have been doing some benchmarking after Kanchana pointed out a performance
>>>>> regression in [1] of swapping in large folio. I would love to get thoughts from
>>>>> zram folks on this, as thats where large folio swapin was first added [2].
>>>>> As far as I can see, the current support in zram is doing large folio swapin
>>>>> at 4K granuality. The large granuality compression in [3] which was posted
>>>>> in March is not merged, so I am currently comparing upstream zram with this series.
>>>>>
>>>>> With the microbenchmark below of timing 1G swapin, there was a very large improvement
>>>>> in performance by using this series. I think similar numbers would be seen in zram.
>>>>
>>>> Imagine running several apps on a phone and switching
>>>> between them: A → B → C → D → E … → A → B … The app
>>>> currently on the screen retains its memory, while the ones
>>>> sent to the background are swapped out. When we bring
>>>> those apps back to the foreground, their memory is restored.
>>>> This behavior is quite similar to what you're seeing with
>>>> your microbenchmark.
>>>>
>>>
>>> Hi Barry,
>>>
>>> Thanks for explaining this! Do you know if there is some open source benchmark
>>> we could use to show an improvement in app switching with large folios?
>>>
>>
>> I’m fairly certain the Android team has this benchmark, but it’s not
>> open source.
>>
>> A straightforward way to simulate this is to use a script that
>> cyclically launches multiple applications, such as Chrome, Firefox,
>> Office, PDF, and others.
>>
>> for example:
>>
>> launch chrome;
>> launch firefox;
>> launch youtube;
>> ....
>> launch chrome;
>> launch firefox;
>> ....
>>
>> On Android, we have "Android activity manager 'am' command" to do that.
>> https://gist.github.com/tsohr/5711945
>>
>> Not quite sure if other windows managers have similar tools.
>>
>>> Also I guess swap thrashing can happen when apps are brought back to foreground?
>>>
>>
>> Typically, the foreground app doesn't experience much swapping,
>> as it is the most recently or frequently used. However, this may
>> not hold for very low-end phones, where memory is significantly
>> less than the app's working set. For instance, we can't expect a
>> good user experience when playing a large game that requires 8GB
>> of memory on a 4GB phone! :-)
>> And for low-end phones, we never even enable mTHP.
>>
>>>>>
>>>>> But when doing kernel build test, Kanchana saw a regression in [1]. I believe
>>>>> its because of swap thrashing (causing large zswap activity), due to larger page swapin.
>>>>> The part of the code that decides large folio swapin is the same between zswap and zram,
>>>>> so I believe this would be observed in zram as well.
>>>>
>>>> Is this an extreme case where the workload's working set far
>>>> exceeds the available memory by memcg limitation? I doubt mTHP
>>>> would provide any real benefit from the start if the workload is bound to
>>>> experience swap thrashing. What if we disable mTHP entirely?
>>>>
>>>
>>> I would agree, this is an extreme case. I wanted (z)swap activity to happen so limited
>>> memory.max to 4G.
>>>
>>> mTHP is beneficial in kernel test benchmarking going from no mTHP to 16K:
>>>
>>> ARM make defconfig; time make -j$(nproc) Image, cgroup memory.max=4G
>>> metric         no mTHP         16K mTHP=always
>>> real           1m0.613s         0m52.008s
>>> user           25m23.028s       25m19.488s
>>> sys            25m45.466s       18m11.640s
>>> zswpin         1911194          3108438
>>> zswpout        6880815          9374628
>>> pgfault        120430166        48976658
>>> pgmajfault     1580674          2327086
>>>
>>>
>>
>> Interesting! We never use a phone to build the Linux kernel, but
>> let me see if I can find some other machines to reproduce your data.
> 
> Hi Usama,
> 
> I suspect the regression occurs because you're running an edge case
> where the memory cgroup stays nearly full most of the time (this isn't
> an inherent issue with large folio swap-in). As a result, swapping in
> mTHP quickly triggers a memcg overflow, causing a swap-out. The
> next swap-in then recreates the overflow, leading to a repeating
> cycle.
> 

Yes, agreed! Looking at the swap counters, I think this is what is going
on as well.

> We need a way to stop the cup from repeatedly filling to the brim and
> overflowing. While not a definitive fix, the following change might help
> improve the situation:
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 17af08367c68..f2fa0eeb2d9a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
>  		memcg = get_mem_cgroup_from_mm(mm);
>  	rcu_read_unlock();
>  
> -	ret = charge_memcg(folio, memcg, gfp);
> +	if (folio_test_large(folio) && mem_cgroup_margin(memcg) < MEMCG_CHARGE_BATCH)
> +		ret = -ENOMEM;
> +	else
> +		ret = charge_memcg(folio, memcg, gfp);
>  
>  	css_put(&memcg->css);
>  	return ret;
>  }
> 

The diff makes sense to me. Let me test later today and get back to you.

Thanks!

> Please confirm if it makes the kernel build with memcg limitation
> faster. If so, let's
> work together to figure out an official patch :-) The above code hasn't consider
> the parent memcg's overflow, so not an ideal fix.
> 
>>
>>>
>>>
>>>>>
>>>>> My initial thought was this might be because its intel, where you dont have the advantage
>>>>> of TLB coalescing, so tested on AMD and ARM, but the regression is there on AMD
>>>>> and ARM as well, though a bit less (have added the numbers below).
>>>>>
>>>>> The numbers show that the zswap activity increases and page faults decrease.
>>>>> Overall this does result in sys time increasing and real time slightly increases,
>>>>> likely because the cost of increased zswap activity is more than the benefit of
>>>>> lower page faults.
>>>>> I can see in [3] that pagefaults reduced in zram as well.
>>>>>
>>>>> Large folio swapin shows good numbers in microbenchmarks that just target reduce page
>>>>> faults and sequential swapin only, but not in kernel build test. Is a similar regression
>>>>> observed with zram when enabling large folio swapin on kernel build test? Maybe large
>>>>> folio swapin makes more sense on workloads where mappings are kept for a longer time?
>>>>>
>>>>
>>>> I suspect this is because mTHP doesn't always benefit workloads
>>>> when available memory is quite limited compared to the working set.
>>>> In that case, mTHP swap-in might introduce more features that
>>>> exacerbate the problem. We used to have an extra control "swapin_enabled"
>>>> for swap-in, but it never gained much traction:
>>>> https://lore.kernel.org/linux-mm/20240726094618.401593-5-21cnbao@gmail.com/
>>>> We can reconsider whether to include the knob, but if it's better
>>>> to disable mTHP entirely for these cases, we can still adhere to
>>>> the policy of "enabled".
>>>>
>>> Yes I think this makes sense to have. The only thing is, its too many knobs!
>>> I personally think its already difficult to decide upto which mTHP size we
>>> should enable (and I think this changes per workload). But if we add swapin_enabled
>>> on top of that it can make things more difficult.
>>>
>>>> Using large block compression and decompression in zRAM will
>>>> significantly reduce CPU usage, likely making the issue unnoticeable.
>>>> However, the default minimum size for large block support is currently
>>>> set to 64KB(ZSMALLOC_MULTI_PAGES_ORDER = 4).
>>>>
>>>
>>> I saw that the patch was sent in March, and there werent any updates after?
>>> Maybe I can try and cherry-pick that and see if we can develop large
>>> granularity compression for zswap.
>>
>> will provide an updated version next week.
>>
>>>
>>>>>
>>>>> Kernel build numbers in cgroup with memory.max=4G to trigger zswap
>>>>> Command for AMD: make defconfig; time make -j$(nproc) bzImage
>>>>> Command for ARM: make defconfig; time make -j$(nproc) Image
>>>>>
>>>>>
>>>>> AMD 16K+32K THP=always
>>>>> metric         mm-unstable      mm-unstable + large folio zswapin series
>>>>> real           1m23.038s        1m23.050s
>>>>> user           53m57.210s       53m53.437s
>>>>> sys            7m24.592s        7m48.843s
>>>>> zswpin         612070           999244
>>>>> zswpout        2226403          2347979
>>>>> pgfault        20667366         20481728
>>>>> pgmajfault     385887           269117
>>>>>
>>>>> AMD 16K+32K+64K THP=always
>>>>> metric         mm-unstable      mm-unstable + large folio zswapin series
>>>>> real           1m22.975s        1m23.266s
>>>>> user           53m51.302s       53m51.069s
>>>>> sys            7m40.168s        7m57.104s
>>>>> zswpin         676492           1258573
>>>>> zswpout        2449839          2714767
>>>>> pgfault        17540746         17296555
>>>>> pgmajfault     429629           307495
>>>>> --------------------------
>>>>> ARM 16K+32K THP=always
>>>>> metric         mm-unstable      mm-unstable + large folio zswapin series
>>>>> real           0m51.168s        0m52.086s
>>>>> user           25m14.715s       25m15.765s
>>>>> sys            17m18.856s       18m8.031s
>>>>> zswpin         3904129          7339245
>>>>> zswpout        11171295         13473461
>>>>> pgfault        37313345         36011338
>>>>> pgmajfault     2726253          1932642
>>>>>
>>>>>
>>>>> ARM 16K+32K+64K THP=always
>>>>> metric         mm-unstable      mm-unstable + large folio zswapin series
>>>>> real           0m52.017s        0m53.828s
>>>>> user           25m2.742s        25m0.046s
>>>>> sys            18m24.525s       20m26.207s
>>>>> zswpin         4853571          8908664
>>>>> zswpout        12297199         15768764
>>>>> pgfault        32158152         30425519
>>>>> pgmajfault     3320717          2237015
>>>>>
>>>>>
>>>>> Thanks!
>>>>> Usama
>>>>>
>>>>>
>>>>> [1] https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
>>>>> [2] https://lore.kernel.org/all/20240821074541.516249-3-hanchuanhua@oppo.com/
>>>>> [3] https://lore.kernel.org/all/20240327214816.31191-1-21cnbao@gmail.com/
>>>>>
>>>>>>
>>>>>>>>
>>>>>>>> The time measured was pretty consistent between runs (~1-2% variation).
>>>>>>>> There is 36% improvement in zswapin time with 1M folios. The percentage
>>>>>>>> improvement is likely to be more if the memcmp is removed.
>>>>>>>>
>>>>>>>> diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
>>>>>>>> index 40de679248b8..77068c577c86 100644
>>>>>>>> --- a/tools/testing/selftests/cgroup/test_zswap.c
>>>>>>>> +++ b/tools/testing/selftests/cgroup/test_zswap.c
>>>>>>>> @@ -9,6 +9,8 @@
>>>>>>>>  #include <string.h>
>>>>>>>>  #include <sys/wait.h>
>>>>>>>>  #include <sys/mman.h>
>>>>>>>> +#include <sys/time.h>
>>>>>>>> +#include <malloc.h>
>>>>>>>>
>>>>>>>>  #include "../kselftest.h"
>>>>>>>>  #include "cgroup_util.h"
>>>>>>>> @@ -407,6 +409,74 @@ static int test_zswap_writeback_disabled(const char *root)
>>>>>>>>         return test_zswap_writeback(root, false);
>>>>>>>>  }
>>>>>>>>
>>>>>>>> +static int zswapin_perf(const char *cgroup, void *arg)
>>>>>>>> +{
>>>>>>>> +       long pagesize = sysconf(_SC_PAGESIZE);
>>>>>>>> +       size_t memsize = MB(1*1024);
>>>>>>>> +       char buf[pagesize];
>>>>>>>> +       int ret = -1;
>>>>>>>> +       char *mem;
>>>>>>>> +       struct timeval start, end;
>>>>>>>> +
>>>>>>>> +       mem = (char *)memalign(2*1024*1024, memsize);
>>>>>>>> +       if (!mem)
>>>>>>>> +               return ret;
>>>>>>>> +
>>>>>>>> +       /*
>>>>>>>> +        * Fill half of each page with increasing data, and keep other
>>>>>>>> +        * half empty, this will result in data that is still compressible
>>>>>>>> +        * and ends up in zswap, with material zswap usage.
>>>>>>>> +        */
>>>>>>>> +       for (int i = 0; i < pagesize; i++)
>>>>>>>> +               buf[i] = i < pagesize/2 ? (char) i : 0;
>>>>>>>> +
>>>>>>>> +       for (int i = 0; i < memsize; i += pagesize)
>>>>>>>> +               memcpy(&mem[i], buf, pagesize);
>>>>>>>> +
>>>>>>>> +       /* Try and reclaim allocated memory */
>>>>>>>> +       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
>>>>>>>> +               ksft_print_msg("Failed to reclaim all of the requested memory\n");
>>>>>>>> +               goto out;
>>>>>>>> +       }
>>>>>>>> +
>>>>>>>> +       gettimeofday(&start, NULL);
>>>>>>>> +       /* zswpin */
>>>>>>>> +       for (int i = 0; i < memsize; i += pagesize) {
>>>>>>>> +               if (memcmp(&mem[i], buf, pagesize)) {
>>>>>>>> +                       ksft_print_msg("invalid memory\n");
>>>>>>>> +                       goto out;
>>>>>>>> +               }
>>>>>>>> +       }
>>>>>>>> +       gettimeofday(&end, NULL);
>>>>>>>> +       printf ("zswapin took %fms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
>>>>>>>> +       ret = 0;
>>>>>>>> +out:
>>>>>>>> +       free(mem);
>>>>>>>> +       return ret;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int test_zswapin_perf(const char *root)
>>>>>>>> +{
>>>>>>>> +       int ret = KSFT_FAIL;
>>>>>>>> +       char *test_group;
>>>>>>>> +
>>>>>>>> +       test_group = cg_name(root, "zswapin_perf_test");
>>>>>>>> +       if (!test_group)
>>>>>>>> +               goto out;
>>>>>>>> +       if (cg_create(test_group))
>>>>>>>> +               goto out;
>>>>>>>> +
>>>>>>>> +       if (cg_run(test_group, zswapin_perf, NULL))
>>>>>>>> +               goto out;
>>>>>>>> +
>>>>>>>> +       ret = KSFT_PASS;
>>>>>>>> +out:
>>>>>>>> +       cg_destroy(test_group);
>>>>>>>> +       free(test_group);
>>>>>>>> +       return ret;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>  /*
>>>>>>>>   * When trying to store a memcg page in zswap, if the memcg hits its memory
>>>>>>>>   * limit in zswap, writeback should affect only the zswapped pages of that
>>>>>>>> @@ -584,6 +654,7 @@ struct zswap_test {
>>>>>>>>         T(test_zswapin),
>>>>>>>>         T(test_zswap_writeback_enabled),
>>>>>>>>         T(test_zswap_writeback_disabled),
>>>>>>>> +       T(test_zswapin_perf),
>>>>>>>>         T(test_no_kmem_bypass),
>>>>>>>>         T(test_no_invasive_cgroup_shrink),
>>>>>>>>  };
>>>>>>>>
>>>>>>>> [1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
>>>>>>>> [2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
>>>>>>>> [3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
>>>>>>>> [4] https://lwn.net/Articles/955575/
>>>>>>>>
>>>>>>>> Usama Arif (4):
>>>>>>>>   mm/zswap: skip swapcache for swapping in zswap pages
>>>>>>>>   mm/zswap: modify zswap_decompress to accept page instead of folio
>>>>>>>>   mm/zswap: add support for large folio zswapin
>>>>>>>>   mm/zswap: count successful large folio zswap loads
>>>>>>>>
>>>>>>>>  Documentation/admin-guide/mm/transhuge.rst |   3 +
>>>>>>>>  include/linux/huge_mm.h                    |   1 +
>>>>>>>>  include/linux/zswap.h                      |   6 ++
>>>>>>>>  mm/huge_memory.c                           |   3 +
>>>>>>>>  mm/memory.c                                |  16 +--
>>>>>>>>  mm/page_io.c                               |   2 +-
>>>>>>>>  mm/zswap.c                                 | 120 ++++++++++++++-------
>>>>>>>>  7 files changed, 99 insertions(+), 52 deletions(-)
>>>>>>>>
>>>>>>>> --
>>>>>>>> 2.43.5
>>>>>>>>
>>>>>>>
>>>>
>>
> 
> Thanks
> Barry

Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Usama Arif 1 month ago

On 23/10/2024 11:48, Usama Arif wrote:
> 
> 
> On 23/10/2024 11:26, Barry Song wrote:
>> On Wed, Oct 23, 2024 at 11:07 AM Barry Song <21cnbao@gmail.com> wrote:
>>>
>>> On Wed, Oct 23, 2024 at 10:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>
>>>>
>>>>
>>>> On 22/10/2024 21:46, Barry Song wrote:
>>>>> On Wed, Oct 23, 2024 at 4:26 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 21/10/2024 11:40, Usama Arif wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 21/10/2024 06:09, Barry Song wrote:
>>>>>>>> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>> After large folio zswapout support added in [1], this patch adds
>>>>>>>>> support for zswapin of large folios to bring it on par with zram.
>>>>>>>>> This series makes sure that the benefits of large folios (fewer
>>>>>>>>> page faults, batched PTE and rmap manipulation, reduced lru list,
>>>>>>>>> TLB coalescing (for arm64 and amd)) are not lost at swap out when
>>>>>>>>> using zswap.
>>>>>>>>>
>>>>>>>>> It builds on top of [2] which added large folio swapin support for
>>>>>>>>> zram and provides the same level of large folio swapin support as
>>>>>>>>> zram, i.e. only supporting swap count == 1.
>>>>>>>>>
>>>>>>>>> Patch 1 skips swapcache for swapping in zswap pages, this should improve
>>>>>>>>> no readahead swapin performance [3], and also allows us to build on large
>>>>>>>>> folio swapin support added in [2], hence is a prerequisite for patch 3.
>>>>>>>>>
>>>>>>>>> Patch 3 adds support for large folio zswapin. This patch does not add
>>>>>>>>> support for hybrid backends (i.e. folios partly present swap and zswap).
>>>>>>>>>
>>>>>>>>> The main performance benefit comes from maintaining large folios *after*
>>>>>>>>> swapin, large folio performance improvements have been mentioned in previous
>>>>>>>>> series posted on it [2],[4], so have not added those. Below is a simple
>>>>>>>>> microbenchmark to measure the time needed *for* zswpin of 1G memory (along
>>>>>>>>> with memory integrity check).
>>>>>>>>>
>>>>>>>>>                                 |  no mTHP (ms) | 1M mTHP enabled (ms)
>>>>>>>>> Base kernel                     |   1165        |    1163
>>>>>>>>> Kernel with mTHP zswpin series  |   1203        |     738
>>>>>>>>
>>>>>>>> Hi Usama,
>>>>>>>> Do you know where this minor regression for non-mTHP comes from?
>>>>>>>> As you even have skipped swapcache for small folios in zswap in patch1,
>>>>>>>> that part should have some gain? is it because of zswap_present_test()?
>>>>>>>>
>>>>>>>
>>>>>>> Hi Barry,
>>>>>>>
>>>>>>> The microbenchmark does a sequential read of 1G of memory, so it probably
>>>>>>> isnt very representative of real world usecases. This also means that
>>>>>>> swap_vma_readahead is able to readahead accurately all pages in its window.
>>>>>>> With this patch series, if doing 4K swapin, you get 1G/4K calls of fast
>>>>>>> do_swap_page. Without this patch, you get 1G/(4K*readahead window) of slow
>>>>>>> do_swap_page calls. I had added some prints and I was seeing 8 pages being
>>>>>>> readahead in 1 do_swap_page. The larger number of calls causes the slight
>>>>>>> regression (eventhough they are quite fast). I think in a realistic scenario,
>>>>>>> where readahead window wont be as large, there wont be a regression.
>>>>>>> The cost of zswap_present_test in the whole call stack of swapping page is
>>>>>>> very low and I think can be ignored.
>>>>>>>
>>>>>>> I think the more interesting thing is what Kanchana pointed out in
>>>>>>> https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
>>>>>>> I am curious, did you see this when testing large folio swapin and compression
>>>>>>> at 4K granuality? Its looks like swap thrashing so I think it would be common
>>>>>>> between zswap and zram. I dont have larger granuality zswap compression done,
>>>>>>> which is why I think there is a regression in time taken. (It could be because
>>>>>>> its tested on intel as well).
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Usama
>>>>>>>
>>>>>>
>>>>>> Hi,
>>>>>>
>>>>>> So I have been doing some benchmarking after Kanchana pointed out a performance
>>>>>> regression in [1] of swapping in large folio. I would love to get thoughts from
>>>>>> zram folks on this, as thats where large folio swapin was first added [2].
>>>>>> As far as I can see, the current support in zram is doing large folio swapin
>>>>>> at 4K granuality. The large granuality compression in [3] which was posted
>>>>>> in March is not merged, so I am currently comparing upstream zram with this series.
>>>>>>
>>>>>> With the microbenchmark below of timing 1G swapin, there was a very large improvement
>>>>>> in performance by using this series. I think similar numbers would be seen in zram.
>>>>>
>>>>> Imagine running several apps on a phone and switching
>>>>> between them: A → B → C → D → E … → A → B … The app
>>>>> currently on the screen retains its memory, while the ones
>>>>> sent to the background are swapped out. When we bring
>>>>> those apps back to the foreground, their memory is restored.
>>>>> This behavior is quite similar to what you're seeing with
>>>>> your microbenchmark.
>>>>>
>>>>
>>>> Hi Barry,
>>>>
>>>> Thanks for explaining this! Do you know if there is some open source benchmark
>>>> we could use to show an improvement in app switching with large folios?
>>>>
>>>
>>> I’m fairly certain the Android team has this benchmark, but it’s not
>>> open source.
>>>
>>> A straightforward way to simulate this is to use a script that
>>> cyclically launches multiple applications, such as Chrome, Firefox,
>>> Office, PDF, and others.
>>>
>>> for example:
>>>
>>> launch chrome;
>>> launch firefox;
>>> launch youtube;
>>> ....
>>> launch chrome;
>>> launch firefox;
>>> ....
>>>
>>> On Android, we have "Android activity manager 'am' command" to do that.
>>> https://gist.github.com/tsohr/5711945
>>>
>>> Not quite sure if other windows managers have similar tools.
>>>
>>>> Also I guess swap thrashing can happen when apps are brought back to foreground?
>>>>
>>>
>>> Typically, the foreground app doesn't experience much swapping,
>>> as it is the most recently or frequently used. However, this may
>>> not hold for very low-end phones, where memory is significantly
>>> less than the app's working set. For instance, we can't expect a
>>> good user experience when playing a large game that requires 8GB
>>> of memory on a 4GB phone! :-)
>>> And for low-end phones, we never even enable mTHP.
>>>
>>>>>>
>>>>>> But when doing kernel build test, Kanchana saw a regression in [1]. I believe
>>>>>> its because of swap thrashing (causing large zswap activity), due to larger page swapin.
>>>>>> The part of the code that decides large folio swapin is the same between zswap and zram,
>>>>>> so I believe this would be observed in zram as well.
>>>>>
>>>>> Is this an extreme case where the workload's working set far
>>>>> exceeds the available memory by memcg limitation? I doubt mTHP
>>>>> would provide any real benefit from the start if the workload is bound to
>>>>> experience swap thrashing. What if we disable mTHP entirely?
>>>>>
>>>>
>>>> I would agree, this is an extreme case. I wanted (z)swap activity to happen so limited
>>>> memory.max to 4G.
>>>>
>>>> mTHP is beneficial in kernel test benchmarking going from no mTHP to 16K:
>>>>
>>>> ARM make defconfig; time make -j$(nproc) Image, cgroup memory.max=4G
>>>> metric         no mTHP         16K mTHP=always
>>>> real           1m0.613s         0m52.008s
>>>> user           25m23.028s       25m19.488s
>>>> sys            25m45.466s       18m11.640s
>>>> zswpin         1911194          3108438
>>>> zswpout        6880815          9374628
>>>> pgfault        120430166        48976658
>>>> pgmajfault     1580674          2327086
>>>>
>>>>
>>>
>>> Interesting! We never use a phone to build the Linux kernel, but
>>> let me see if I can find some other machines to reproduce your data.
>>
>> Hi Usama,
>>
>> I suspect the regression occurs because you're running an edge case
>> where the memory cgroup stays nearly full most of the time (this isn't
>> an inherent issue with large folio swap-in). As a result, swapping in
>> mTHP quickly triggers a memcg overflow, causing a swap-out. The
>> next swap-in then recreates the overflow, leading to a repeating
>> cycle.
>>
> 
> Yes, agreed! Looking at the swap counters, I think this is what is going
> on as well.
> 
>> We need a way to stop the cup from repeatedly filling to the brim and
>> overflowing. While not a definitive fix, the following change might help
>> improve the situation:
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>
>> index 17af08367c68..f2fa0eeb2d9a 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>>
>> @@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio
>> *folio, struct mm_struct *mm,
>>                 memcg = get_mem_cgroup_from_mm(mm);
>>         rcu_read_unlock();
>>
>> -       ret = charge_memcg(folio, memcg, gfp);
>> +       if (folio_test_large(folio) && mem_cgroup_margin(memcg) <
>> MEMCG_CHARGE_BATCH)
>> +               ret = -ENOMEM;
>> +       else
>> +               ret = charge_memcg(folio, memcg, gfp);
>>
>>         css_put(&memcg->css);
>>         return ret;
>> }
>>
> 
> The diff makes sense to me. Let me test later today and get back to you.
> 
> Thanks!
> 
>> Please confirm if it makes the kernel build with memcg limitation
>> faster. If so, let's
>> work together to figure out an official patch :-) The above code hasn't consider
>> the parent memcg's overflow, so not an ideal fix.
>>

Thanks Barry, I think this fixes the regression, and even gives an improvement!
I think the below might be better to do:

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c098fd7f5c5e..0a1ec55cc079 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
                memcg = get_mem_cgroup_from_mm(mm);
        rcu_read_unlock();
 
-       ret = charge_memcg(folio, memcg, gfp);
+       if (folio_test_large(folio) &&
+           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
+               ret = -ENOMEM;
+       else
+               ret = charge_memcg(folio, memcg, gfp);
 
        css_put(&memcg->css);
        return ret;


AMD 16K+32K THP=always
metric         mm-unstable      mm-unstable + large folio zswapin series    mm-unstable + large folio zswapin + no swap thrashing fix
real           1m23.038s        1m23.050s                                   1m22.704s
user           53m57.210s       53m53.437s                                  53m52.577s
sys            7m24.592s        7m48.843s                                   7m22.519s
zswpin         612070           999244                                      815934
zswpout        2226403          2347979                                     2054980
pgfault        20667366         20481728                                    20478690
pgmajfault     385887           269117                                      309702

AMD 16K+32K+64K THP=always
metric         mm-unstable      mm-unstable + large folio zswapin series   mm-unstable + large folio zswapin + no swap thrashing fix
real           1m22.975s        1m23.266s                                  1m22.549s
user           53m51.302s       53m51.069s                                 53m46.471s
sys            7m40.168s        7m57.104s                                  7m25.012s
zswpin         676492           1258573                                    1225703
zswpout        2449839          2714767                                    2899178
pgfault        17540746         17296555                                   17234663
pgmajfault     429629           307495                                     287859

>>>
>>>>
>>>>
>>>>>>
>>>>>> My initial thought was this might be because its intel, where you dont have the advantage
>>>>>> of TLB coalescing, so tested on AMD and ARM, but the regression is there on AMD
>>>>>> and ARM as well, though a bit less (have added the numbers below).
>>>>>>
>>>>>> The numbers show that the zswap activity increases and page faults decrease.
>>>>>> Overall this does result in sys time increasing and real time slightly increases,
>>>>>> likely because the cost of increased zswap activity is more than the benefit of
>>>>>> lower page faults.
>>>>>> I can see in [3] that pagefaults reduced in zram as well.
>>>>>>
>>>>>> Large folio swapin shows good numbers in microbenchmarks that just target reduce page
>>>>>> faults and sequential swapin only, but not in kernel build test. Is a similar regression
>>>>>> observed with zram when enabling large folio swapin on kernel build test? Maybe large
>>>>>> folio swapin makes more sense on workloads where mappings are kept for a longer time?
>>>>>>
>>>>>
>>>>> I suspect this is because mTHP doesn't always benefit workloads
>>>>> when available memory is quite limited compared to the working set.
>>>>> In that case, mTHP swap-in might introduce more features that
>>>>> exacerbate the problem. We used to have an extra control "swapin_enabled"
>>>>> for swap-in, but it never gained much traction:
>>>>> https://lore.kernel.org/linux-mm/20240726094618.401593-5-21cnbao@gmail.com/
>>>>> We can reconsider whether to include the knob, but if it's better
>>>>> to disable mTHP entirely for these cases, we can still adhere to
>>>>> the policy of "enabled".
>>>>>
>>>> Yes I think this makes sense to have. The only thing is, its too many knobs!
>>>> I personally think its already difficult to decide upto which mTHP size we
>>>> should enable (and I think this changes per workload). But if we add swapin_enabled
>>>> on top of that it can make things more difficult.
>>>>
>>>>> Using large block compression and decompression in zRAM will
>>>>> significantly reduce CPU usage, likely making the issue unnoticeable.
>>>>> However, the default minimum size for large block support is currently
>>>>> set to 64KB(ZSMALLOC_MULTI_PAGES_ORDER = 4).
>>>>>
>>>>
>>>> I saw that the patch was sent in March, and there werent any updates after?
>>>> Maybe I can try and cherry-pick that and see if we can develop large
>>>> granularity compression for zswap.
>>>
>>> will provide an updated version next week.
>>>
>>>>
>>>>>>
>>>>>> Kernel build numbers in cgroup with memory.max=4G to trigger zswap
>>>>>> Command for AMD: make defconfig; time make -j$(nproc) bzImage
>>>>>> Command for ARM: make defconfig; time make -j$(nproc) Image
>>>>>>
>>>>>>
>>>>>> AMD 16K+32K THP=always
>>>>>> metric         mm-unstable      mm-unstable + large folio zswapin series
>>>>>> real           1m23.038s        1m23.050s
>>>>>> user           53m57.210s       53m53.437s
>>>>>> sys            7m24.592s        7m48.843s
>>>>>> zswpin         612070           999244
>>>>>> zswpout        2226403          2347979
>>>>>> pgfault        20667366         20481728
>>>>>> pgmajfault     385887           269117
>>>>>>
>>>>>> AMD 16K+32K+64K THP=always
>>>>>> metric         mm-unstable      mm-unstable + large folio zswapin series
>>>>>> real           1m22.975s        1m23.266s
>>>>>> user           53m51.302s       53m51.069s
>>>>>> sys            7m40.168s        7m57.104s
>>>>>> zswpin         676492           1258573
>>>>>> zswpout        2449839          2714767
>>>>>> pgfault        17540746         17296555
>>>>>> pgmajfault     429629           307495
>>>>>> --------------------------
>>>>>> ARM 16K+32K THP=always
>>>>>> metric         mm-unstable      mm-unstable + large folio zswapin series
>>>>>> real           0m51.168s        0m52.086s
>>>>>> user           25m14.715s       25m15.765s
>>>>>> sys            17m18.856s       18m8.031s
>>>>>> zswpin         3904129          7339245
>>>>>> zswpout        11171295         13473461
>>>>>> pgfault        37313345         36011338
>>>>>> pgmajfault     2726253          1932642
>>>>>>
>>>>>>
>>>>>> ARM 16K+32K+64K THP=always
>>>>>> metric         mm-unstable      mm-unstable + large folio zswapin series
>>>>>> real           0m52.017s        0m53.828s
>>>>>> user           25m2.742s        25m0.046s
>>>>>> sys            18m24.525s       20m26.207s
>>>>>> zswpin         4853571          8908664
>>>>>> zswpout        12297199         15768764
>>>>>> pgfault        32158152         30425519
>>>>>> pgmajfault     3320717          2237015
>>>>>>
>>>>>>
>>>>>> Thanks!
>>>>>> Usama
>>>>>>
>>>>>>
>>>>>> [1] https://lore.kernel.org/all/f2f2053f-ec5f-46a4-800d-50a3d2e61bff@gmail.com/
>>>>>> [2] https://lore.kernel.org/all/20240821074541.516249-3-hanchuanhua@oppo.com/
>>>>>> [3] https://lore.kernel.org/all/20240327214816.31191-1-21cnbao@gmail.com/
>>>>>>
>>>>>>>
>>>>>>>>>
>>>>>>>>> The time measured was pretty consistent between runs (~1-2% variation).
>>>>>>>>> There is 36% improvement in zswapin time with 1M folios. The percentage
>>>>>>>>> improvement is likely to be more if the memcmp is removed.
>>>>>>>>>
>>>>>>>>> diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
>>>>>>>>> index 40de679248b8..77068c577c86 100644
>>>>>>>>> --- a/tools/testing/selftests/cgroup/test_zswap.c
>>>>>>>>> +++ b/tools/testing/selftests/cgroup/test_zswap.c
>>>>>>>>> @@ -9,6 +9,8 @@
>>>>>>>>>  #include <string.h>
>>>>>>>>>  #include <sys/wait.h>
>>>>>>>>>  #include <sys/mman.h>
>>>>>>>>> +#include <sys/time.h>
>>>>>>>>> +#include <malloc.h>
>>>>>>>>>
>>>>>>>>>  #include "../kselftest.h"
>>>>>>>>>  #include "cgroup_util.h"
>>>>>>>>> @@ -407,6 +409,74 @@ static int test_zswap_writeback_disabled(const char *root)
>>>>>>>>>         return test_zswap_writeback(root, false);
>>>>>>>>>  }
>>>>>>>>>
>>>>>>>>> +static int zswapin_perf(const char *cgroup, void *arg)
>>>>>>>>> +{
>>>>>>>>> +       long pagesize = sysconf(_SC_PAGESIZE);
>>>>>>>>> +       size_t memsize = MB(1*1024);
>>>>>>>>> +       char buf[pagesize];
>>>>>>>>> +       int ret = -1;
>>>>>>>>> +       char *mem;
>>>>>>>>> +       struct timeval start, end;
>>>>>>>>> +
>>>>>>>>> +       mem = (char *)memalign(2*1024*1024, memsize);
>>>>>>>>> +       if (!mem)
>>>>>>>>> +               return ret;
>>>>>>>>> +
>>>>>>>>> +       /*
>>>>>>>>> +        * Fill half of each page with increasing data, and keep other
>>>>>>>>> +        * half empty, this will result in data that is still compressible
>>>>>>>>> +        * and ends up in zswap, with material zswap usage.
>>>>>>>>> +        */
>>>>>>>>> +       for (int i = 0; i < pagesize; i++)
>>>>>>>>> +               buf[i] = i < pagesize/2 ? (char) i : 0;
>>>>>>>>> +
>>>>>>>>> +       for (int i = 0; i < memsize; i += pagesize)
>>>>>>>>> +               memcpy(&mem[i], buf, pagesize);
>>>>>>>>> +
>>>>>>>>> +       /* Try and reclaim allocated memory */
>>>>>>>>> +       if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
>>>>>>>>> +               ksft_print_msg("Failed to reclaim all of the requested memory\n");
>>>>>>>>> +               goto out;
>>>>>>>>> +       }
>>>>>>>>> +
>>>>>>>>> +       gettimeofday(&start, NULL);
>>>>>>>>> +       /* zswpin */
>>>>>>>>> +       for (int i = 0; i < memsize; i += pagesize) {
>>>>>>>>> +               if (memcmp(&mem[i], buf, pagesize)) {
>>>>>>>>> +                       ksft_print_msg("invalid memory\n");
>>>>>>>>> +                       goto out;
>>>>>>>>> +               }
>>>>>>>>> +       }
>>>>>>>>> +       gettimeofday(&end, NULL);
>>>>>>>>> +       printf ("zswapin took %fms to run.\n", (end.tv_sec - start.tv_sec)*1000 + (double)(end.tv_usec - start.tv_usec) / 1000);
>>>>>>>>> +       ret = 0;
>>>>>>>>> +out:
>>>>>>>>> +       free(mem);
>>>>>>>>> +       return ret;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +static int test_zswapin_perf(const char *root)
>>>>>>>>> +{
>>>>>>>>> +       int ret = KSFT_FAIL;
>>>>>>>>> +       char *test_group;
>>>>>>>>> +
>>>>>>>>> +       test_group = cg_name(root, "zswapin_perf_test");
>>>>>>>>> +       if (!test_group)
>>>>>>>>> +               goto out;
>>>>>>>>> +       if (cg_create(test_group))
>>>>>>>>> +               goto out;
>>>>>>>>> +
>>>>>>>>> +       if (cg_run(test_group, zswapin_perf, NULL))
>>>>>>>>> +               goto out;
>>>>>>>>> +
>>>>>>>>> +       ret = KSFT_PASS;
>>>>>>>>> +out:
>>>>>>>>> +       cg_destroy(test_group);
>>>>>>>>> +       free(test_group);
>>>>>>>>> +       return ret;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>>  /*
>>>>>>>>>   * When trying to store a memcg page in zswap, if the memcg hits its memory
>>>>>>>>>   * limit in zswap, writeback should affect only the zswapped pages of that
>>>>>>>>> @@ -584,6 +654,7 @@ struct zswap_test {
>>>>>>>>>         T(test_zswapin),
>>>>>>>>>         T(test_zswap_writeback_enabled),
>>>>>>>>>         T(test_zswap_writeback_disabled),
>>>>>>>>> +       T(test_zswapin_perf),
>>>>>>>>>         T(test_no_kmem_bypass),
>>>>>>>>>         T(test_no_invasive_cgroup_shrink),
>>>>>>>>>  };
>>>>>>>>>
>>>>>>>>> [1] https://lore.kernel.org/all/20241001053222.6944-1-kanchana.p.sridhar@intel.com/
>>>>>>>>> [2] https://lore.kernel.org/all/20240821074541.516249-1-hanchuanhua@oppo.com/
>>>>>>>>> [3] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#u
>>>>>>>>> [4] https://lwn.net/Articles/955575/
>>>>>>>>>
>>>>>>>>> Usama Arif (4):
>>>>>>>>>   mm/zswap: skip swapcache for swapping in zswap pages
>>>>>>>>>   mm/zswap: modify zswap_decompress to accept page instead of folio
>>>>>>>>>   mm/zswap: add support for large folio zswapin
>>>>>>>>>   mm/zswap: count successful large folio zswap loads
>>>>>>>>>
>>>>>>>>>  Documentation/admin-guide/mm/transhuge.rst |   3 +
>>>>>>>>>  include/linux/huge_mm.h                    |   1 +
>>>>>>>>>  include/linux/zswap.h                      |   6 ++
>>>>>>>>>  mm/huge_memory.c                           |   3 +
>>>>>>>>>  mm/memory.c                                |  16 +--
>>>>>>>>>  mm/page_io.c                               |   2 +-
>>>>>>>>>  mm/zswap.c                                 | 120 ++++++++++++++-------
>>>>>>>>>  7 files changed, 99 insertions(+), 52 deletions(-)
>>>>>>>>>
>>>>>>>>> --
>>>>>>>>> 2.43.5
>>>>>>>>>
>>>>>>>>
>>>>>
>>>
>>
>> Thanks
>> Barry
> 

Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Yosry Ahmed 1 month ago
[..]
> >> I suspect the regression occurs because you're running an edge case
> >> where the memory cgroup stays nearly full most of the time (this isn't
> >> an inherent issue with large folio swap-in). As a result, swapping in
> >> mTHP quickly triggers a memcg overflow, causing a swap-out. The
> >> next swap-in then recreates the overflow, leading to a repeating
> >> cycle.
> >>
> >
> > Yes, agreed! Looking at the swap counters, I think this is what is going
> > on as well.
> >
> >> We need a way to stop the cup from repeatedly filling to the brim and
> >> overflowing. While not a definitive fix, the following change might help
> >> improve the situation:
> >>
> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >>
> >> index 17af08367c68..f2fa0eeb2d9a 100644
> >> --- a/mm/memcontrol.c
> >> +++ b/mm/memcontrol.c
> >>
> >> @@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio
> >> *folio, struct mm_struct *mm,
> >>                 memcg = get_mem_cgroup_from_mm(mm);
> >>         rcu_read_unlock();
> >>
> >> -       ret = charge_memcg(folio, memcg, gfp);
> >> +       if (folio_test_large(folio) && mem_cgroup_margin(memcg) <
> >> MEMCG_CHARGE_BATCH)
> >> +               ret = -ENOMEM;
> >> +       else
> >> +               ret = charge_memcg(folio, memcg, gfp);
> >>
> >>         css_put(&memcg->css);
> >>         return ret;
> >> }
> >>
> >
> > The diff makes sense to me. Let me test later today and get back to you.
> >
> > Thanks!
> >
> >> Please confirm if it makes the kernel build with memcg limitation
> >> faster. If so, let's
> >> work together to figure out an official patch :-) The above code hasn't consider
> >> the parent memcg's overflow, so not an ideal fix.
> >>
>
> Thanks Barry, I think this fixes the regression, and even gives an improvement!
> I think the below might be better to do:
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c098fd7f5c5e..0a1ec55cc079 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
>                 memcg = get_mem_cgroup_from_mm(mm);
>         rcu_read_unlock();
>
> -       ret = charge_memcg(folio, memcg, gfp);
> +       if (folio_test_large(folio) &&
> +           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
> +               ret = -ENOMEM;
> +       else
> +               ret = charge_memcg(folio, memcg, gfp);
>
>         css_put(&memcg->css);
>         return ret;
>
>
> AMD 16K+32K THP=always
> metric         mm-unstable      mm-unstable + large folio zswapin series    mm-unstable + large folio zswapin + no swap thrashing fix
> real           1m23.038s        1m23.050s                                   1m22.704s
> user           53m57.210s       53m53.437s                                  53m52.577s
> sys            7m24.592s        7m48.843s                                   7m22.519s
> zswpin         612070           999244                                      815934
> zswpout        2226403          2347979                                     2054980
> pgfault        20667366         20481728                                    20478690
> pgmajfault     385887           269117                                      309702
>
> AMD 16K+32K+64K THP=always
> metric         mm-unstable      mm-unstable + large folio zswapin series   mm-unstable + large folio zswapin + no swap thrashing fix
> real           1m22.975s        1m23.266s                                  1m22.549s
> user           53m51.302s       53m51.069s                                 53m46.471s
> sys            7m40.168s        7m57.104s                                  7m25.012s
> zswpin         676492           1258573                                    1225703
> zswpout        2449839          2714767                                    2899178
> pgfault        17540746         17296555                                   17234663
> pgmajfault     429629           307495                                     287859
>

Thanks Usama and Barry for looking into this. It seems like this would
fix a regression with large folio swapin regardless of zswap. Can the
same result be reproduced on zram without this series?
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Usama Arif 1 month ago

On 23/10/2024 19:02, Yosry Ahmed wrote:
> [..]
>>>> I suspect the regression occurs because you're running an edge case
>>>> where the memory cgroup stays nearly full most of the time (this isn't
>>>> an inherent issue with large folio swap-in). As a result, swapping in
>>>> mTHP quickly triggers a memcg overflow, causing a swap-out. The
>>>> next swap-in then recreates the overflow, leading to a repeating
>>>> cycle.
>>>>
>>>
>>> Yes, agreed! Looking at the swap counters, I think this is what is going
>>> on as well.
>>>
>>>> We need a way to stop the cup from repeatedly filling to the brim and
>>>> overflowing. While not a definitive fix, the following change might help
>>>> improve the situation:
>>>>
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>>
>>>> index 17af08367c68..f2fa0eeb2d9a 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>>
>>>> @@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio
>>>> *folio, struct mm_struct *mm,
>>>>                 memcg = get_mem_cgroup_from_mm(mm);
>>>>         rcu_read_unlock();
>>>>
>>>> -       ret = charge_memcg(folio, memcg, gfp);
>>>> +       if (folio_test_large(folio) && mem_cgroup_margin(memcg) <
>>>> MEMCG_CHARGE_BATCH)
>>>> +               ret = -ENOMEM;
>>>> +       else
>>>> +               ret = charge_memcg(folio, memcg, gfp);
>>>>
>>>>         css_put(&memcg->css);
>>>>         return ret;
>>>> }
>>>>
>>>
>>> The diff makes sense to me. Let me test later today and get back to you.
>>>
>>> Thanks!
>>>
>>>> Please confirm if it makes the kernel build with memcg limitation
>>>> faster. If so, let's
>>>> work together to figure out an official patch :-) The above code hasn't consider
>>>> the parent memcg's overflow, so not an ideal fix.
>>>>
>>
>> Thanks Barry, I think this fixes the regression, and even gives an improvement!
>> I think the below might be better to do:
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index c098fd7f5c5e..0a1ec55cc079 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
>>                 memcg = get_mem_cgroup_from_mm(mm);
>>         rcu_read_unlock();
>>
>> -       ret = charge_memcg(folio, memcg, gfp);
>> +       if (folio_test_large(folio) &&
>> +           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
>> +               ret = -ENOMEM;
>> +       else
>> +               ret = charge_memcg(folio, memcg, gfp);
>>
>>         css_put(&memcg->css);
>>         return ret;
>>
>>
>> AMD 16K+32K THP=always
>> metric         mm-unstable      mm-unstable + large folio zswapin series    mm-unstable + large folio zswapin + no swap thrashing fix
>> real           1m23.038s        1m23.050s                                   1m22.704s
>> user           53m57.210s       53m53.437s                                  53m52.577s
>> sys            7m24.592s        7m48.843s                                   7m22.519s
>> zswpin         612070           999244                                      815934
>> zswpout        2226403          2347979                                     2054980
>> pgfault        20667366         20481728                                    20478690
>> pgmajfault     385887           269117                                      309702
>>
>> AMD 16K+32K+64K THP=always
>> metric         mm-unstable      mm-unstable + large folio zswapin series   mm-unstable + large folio zswapin + no swap thrashing fix
>> real           1m22.975s        1m23.266s                                  1m22.549s
>> user           53m51.302s       53m51.069s                                 53m46.471s
>> sys            7m40.168s        7m57.104s                                  7m25.012s
>> zswpin         676492           1258573                                    1225703
>> zswpout        2449839          2714767                                    2899178
>> pgfault        17540746         17296555                                   17234663
>> pgmajfault     429629           307495                                     287859
>>
> 
> Thanks Usama and Barry for looking into this. It seems like this would
> fix a regression with large folio swapin regardless of zswap. Can the
> same result be reproduced on zram without this series?


Yes, it's a regression in large folio swapin support regardless of zswap/zram.

Need to do 3 tests: one with (probably) the below diff to remove large folio support,
one with current upstream, and one with upstream + the swap thrashing fix.

We only use zswap and don't have a zram setup (and I am a bit lazy to create one :)).
Any zram volunteers to try this?

diff --git a/mm/memory.c b/mm/memory.c
index fecdd044bc0b..62f6b087beb3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4124,6 +4124,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
        gfp_t gfp;
        int order;
 
+       goto fallback;
+
        /*
         * If uffd is active for the vma we need per-page fault fidelity to
         * maintain the uffd semantics.
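
For anyone volunteering to do the zram runs, a minimal zram-as-swap setup sketch
(the size, compressor and swap priority below are placeholders, adjust as needed;
zswap should stay disabled so that the zram path is what actually gets exercised):

modprobe zram num_devices=1                   # creates /dev/zram0
echo lz4 > /sys/block/zram0/comp_algorithm    # pick a compressor before setting disksize
echo 8G > /sys/block/zram0/disksize           # uncompressed size of the device
mkswap /dev/zram0
swapon -p 100 /dev/zram0                      # prefer zram over any disk swap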
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Barry Song 1 month ago
On Thu, Oct 24, 2024 at 7:31 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 23/10/2024 19:02, Yosry Ahmed wrote:
> > [..]
> >>>> I suspect the regression occurs because you're running an edge case
> >>>> where the memory cgroup stays nearly full most of the time (this isn't
> >>>> an inherent issue with large folio swap-in). As a result, swapping in
> >>>> mTHP quickly triggers a memcg overflow, causing a swap-out. The
> >>>> next swap-in then recreates the overflow, leading to a repeating
> >>>> cycle.
> >>>>
> >>>
> >>> Yes, agreed! Looking at the swap counters, I think this is what is going
> >>> on as well.
> >>>
> >>>> We need a way to stop the cup from repeatedly filling to the brim and
> >>>> overflowing. While not a definitive fix, the following change might help
> >>>> improve the situation:
> >>>>
> >>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >>>>
> >>>> index 17af08367c68..f2fa0eeb2d9a 100644
> >>>> --- a/mm/memcontrol.c
> >>>> +++ b/mm/memcontrol.c
> >>>>
> >>>> @@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio
> >>>> *folio, struct mm_struct *mm,
> >>>>                 memcg = get_mem_cgroup_from_mm(mm);
> >>>>         rcu_read_unlock();
> >>>>
> >>>> -       ret = charge_memcg(folio, memcg, gfp);
> >>>> +       if (folio_test_large(folio) && mem_cgroup_margin(memcg) <
> >>>> MEMCG_CHARGE_BATCH)
> >>>> +               ret = -ENOMEM;
> >>>> +       else
> >>>> +               ret = charge_memcg(folio, memcg, gfp);
> >>>>
> >>>>         css_put(&memcg->css);
> >>>>         return ret;
> >>>> }
> >>>>
> >>>
> >>> The diff makes sense to me. Let me test later today and get back to you.
> >>>
> >>> Thanks!
> >>>
> >>>> Please confirm if it makes the kernel build with memcg limitation
> >>>> faster. If so, let's
> >>>> work together to figure out an official patch :-) The above code hasn't consider
> >>>> the parent memcg's overflow, so not an ideal fix.
> >>>>
> >>
> >> Thanks Barry, I think this fixes the regression, and even gives an improvement!
> >> I think the below might be better to do:
> >>
> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >> index c098fd7f5c5e..0a1ec55cc079 100644
> >> --- a/mm/memcontrol.c
> >> +++ b/mm/memcontrol.c
> >> @@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> >>                 memcg = get_mem_cgroup_from_mm(mm);
> >>         rcu_read_unlock();
> >>
> >> -       ret = charge_memcg(folio, memcg, gfp);
> >> +       if (folio_test_large(folio) &&
> >> +           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
> >> +               ret = -ENOMEM;
> >> +       else
> >> +               ret = charge_memcg(folio, memcg, gfp);
> >>
> >>         css_put(&memcg->css);
> >>         return ret;
> >>
> >>
> >> AMD 16K+32K THP=always
> >> metric         mm-unstable      mm-unstable + large folio zswapin series    mm-unstable + large folio zswapin + no swap thrashing fix
> >> real           1m23.038s        1m23.050s                                   1m22.704s
> >> user           53m57.210s       53m53.437s                                  53m52.577s
> >> sys            7m24.592s        7m48.843s                                   7m22.519s
> >> zswpin         612070           999244                                      815934
> >> zswpout        2226403          2347979                                     2054980
> >> pgfault        20667366         20481728                                    20478690
> >> pgmajfault     385887           269117                                      309702
> >>
> >> AMD 16K+32K+64K THP=always
> >> metric         mm-unstable      mm-unstable + large folio zswapin series   mm-unstable + large folio zswapin + no swap thrashing fix
> >> real           1m22.975s        1m23.266s                                  1m22.549s
> >> user           53m51.302s       53m51.069s                                 53m46.471s
> >> sys            7m40.168s        7m57.104s                                  7m25.012s
> >> zswpin         676492           1258573                                    1225703
> >> zswpout        2449839          2714767                                    2899178
> >> pgfault        17540746         17296555                                   17234663
> >> pgmajfault     429629           307495                                     287859
> >>
> >
> > Thanks Usama and Barry for looking into this. It seems like this would
> > fix a regression with large folio swapin regardless of zswap. Can the
> > same result be reproduced on zram without this series?
>
>
> Yes, its a regression in large folio swapin support regardless of zswap/zram.
>
> Need to do 3 tests, one with probably the below diff to remove large folio support,
> one with current upstream and one with upstream + swap thrashing fix.
>
> We only use zswap and dont have a zram setup (and I am a bit lazy to create one :)).
> Any zram volunteers to try this?

Hi Usama,

I tried a quick experiment:

echo 1 > /sys/module/zswap/parameters/enabled
echo 0 > /sys/module/zswap/parameters/enabled

This was to test the zRAM scenario. Enabling zswap even
once disables mTHP swap-in. :)

I noticed a similar regression with zRAM alone, but the change resolved
the issue and even sped up the kernel build compared to the setup without
mTHP swap-in.

However, I’m still working on a proper patch to address this. The current
approach:

mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio))

isn’t sufficient, as it doesn’t cover cases where group A contains group B, and
we’re operating within group B. The problem occurs not at the boundary of
group B but at the boundary of group A.

I believe there’s still room for improvement. For example, if a 64KB charge
attempt fails, there’s no need to waste time trying 32KB or 16KB. We can
directly fall back to 4KB, as 32KB and 16KB will also fail based on our
margin detection logic.

>
> diff --git a/mm/memory.c b/mm/memory.c
> index fecdd044bc0b..62f6b087beb3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4124,6 +4124,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>         gfp_t gfp;
>         int order;
>
> +       goto fallback;
> +
>         /*
>          * If uffd is active for the vma we need per-page fault fidelity to
>          * maintain the uffd semantics.

Thanks
Barry
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Usama Arif 1 month ago

On 23/10/2024 19:52, Barry Song wrote:
> On Thu, Oct 24, 2024 at 7:31 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>>
>>
>> On 23/10/2024 19:02, Yosry Ahmed wrote:
>>> [..]
>>>>>> I suspect the regression occurs because you're running an edge case
>>>>>> where the memory cgroup stays nearly full most of the time (this isn't
>>>>>> an inherent issue with large folio swap-in). As a result, swapping in
>>>>>> mTHP quickly triggers a memcg overflow, causing a swap-out. The
>>>>>> next swap-in then recreates the overflow, leading to a repeating
>>>>>> cycle.
>>>>>>
>>>>>
>>>>> Yes, agreed! Looking at the swap counters, I think this is what is going
>>>>> on as well.
>>>>>
>>>>>> We need a way to stop the cup from repeatedly filling to the brim and
>>>>>> overflowing. While not a definitive fix, the following change might help
>>>>>> improve the situation:
>>>>>>
>>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>>>>
>>>>>> index 17af08367c68..f2fa0eeb2d9a 100644
>>>>>> --- a/mm/memcontrol.c
>>>>>> +++ b/mm/memcontrol.c
>>>>>>
>>>>>> @@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio
>>>>>> *folio, struct mm_struct *mm,
>>>>>>                 memcg = get_mem_cgroup_from_mm(mm);
>>>>>>         rcu_read_unlock();
>>>>>>
>>>>>> -       ret = charge_memcg(folio, memcg, gfp);
>>>>>> +       if (folio_test_large(folio) && mem_cgroup_margin(memcg) <
>>>>>> MEMCG_CHARGE_BATCH)
>>>>>> +               ret = -ENOMEM;
>>>>>> +       else
>>>>>> +               ret = charge_memcg(folio, memcg, gfp);
>>>>>>
>>>>>>         css_put(&memcg->css);
>>>>>>         return ret;
>>>>>> }
>>>>>>
>>>>>
>>>>> The diff makes sense to me. Let me test later today and get back to you.
>>>>>
>>>>> Thanks!
>>>>>
>>>>>> Please confirm if it makes the kernel build with memcg limitation
>>>>>> faster. If so, let's
>>>>>> work together to figure out an official patch :-) The above code hasn't consider
>>>>>> the parent memcg's overflow, so not an ideal fix.
>>>>>>
>>>>
>>>> Thanks Barry, I think this fixes the regression, and even gives an improvement!
>>>> I think the below might be better to do:
>>>>
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>> index c098fd7f5c5e..0a1ec55cc079 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>> @@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
>>>>                 memcg = get_mem_cgroup_from_mm(mm);
>>>>         rcu_read_unlock();
>>>>
>>>> -       ret = charge_memcg(folio, memcg, gfp);
>>>> +       if (folio_test_large(folio) &&
>>>> +           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
>>>> +               ret = -ENOMEM;
>>>> +       else
>>>> +               ret = charge_memcg(folio, memcg, gfp);
>>>>
>>>>         css_put(&memcg->css);
>>>>         return ret;
>>>>
>>>>
>>>> AMD 16K+32K THP=always
>>>> metric         mm-unstable      mm-unstable + large folio zswapin series    mm-unstable + large folio zswapin + no swap thrashing fix
>>>> real           1m23.038s        1m23.050s                                   1m22.704s
>>>> user           53m57.210s       53m53.437s                                  53m52.577s
>>>> sys            7m24.592s        7m48.843s                                   7m22.519s
>>>> zswpin         612070           999244                                      815934
>>>> zswpout        2226403          2347979                                     2054980
>>>> pgfault        20667366         20481728                                    20478690
>>>> pgmajfault     385887           269117                                      309702
>>>>
>>>> AMD 16K+32K+64K THP=always
>>>> metric         mm-unstable      mm-unstable + large folio zswapin series   mm-unstable + large folio zswapin + no swap thrashing fix
>>>> real           1m22.975s        1m23.266s                                  1m22.549s
>>>> user           53m51.302s       53m51.069s                                 53m46.471s
>>>> sys            7m40.168s        7m57.104s                                  7m25.012s
>>>> zswpin         676492           1258573                                    1225703
>>>> zswpout        2449839          2714767                                    2899178
>>>> pgfault        17540746         17296555                                   17234663
>>>> pgmajfault     429629           307495                                     287859
>>>>
>>>
>>> Thanks Usama and Barry for looking into this. It seems like this would
>>> fix a regression with large folio swapin regardless of zswap. Can the
>>> same result be reproduced on zram without this series?
>>
>>
>> Yes, its a regression in large folio swapin support regardless of zswap/zram.
>>
>> Need to do 3 tests, one with probably the below diff to remove large folio support,
>> one with current upstream and one with upstream + swap thrashing fix.
>>
>> We only use zswap and dont have a zram setup (and I am a bit lazy to create one :)).
>> Any zram volunteers to try this?
> 
> Hi Usama,
> 
> I tried a quick experiment:
> 
> echo 1 > /sys/module/zswap/parameters/enabled
> echo 0 > /sys/module/zswap/parameters/enabled
> 
> This was to test the zRAM scenario. Enabling zswap even
> once disables mTHP swap-in. :)
> 
> I noticed a similar regression with zRAM alone, but the change resolved
> the issue and even sped up the kernel build compared to the setup without
> mTHP swap-in.

Thanks for trying, this is amazing!
> 
> However, I’m still working on a proper patch to address this. The current
> approach:
> 
> mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio))
> 
> isn’t sufficient, as it doesn’t cover cases where group A contains group B, and
> we’re operating within group B. The problem occurs not at the boundary of
> group B but at the boundary of group A.

I am not sure I completely followed this. As MEMCG_CHARGE_BATCH = 64, if we are
trying to swap in a 16kB folio, we basically check if at least 64/4 = 16 such folios can be
charged to the cgroup, which is reasonable. If we try to swap in a 1M folio, we just
check if we can charge at least 1 folio. Are you saying that checking just 1 folio
is not enough in this case and can still cause thrashing, i.e. we should check more?

If we want to maintain consistency for all folios, another option is
mem_cgroup_margin(memcg) < MEMCG_CHARGE_BATCH * folio_nr_pages(folio)
but I think this is too extreme: we would be checking if 64M can be charged to the
cgroup just to swap in 1M.
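
A quick worked example of the two checks (my arithmetic, assuming 4K base pages
and MEMCG_CHARGE_BATCH == 64):

- 16K folio: folio_nr_pages() == 4, so max(64, 4) == 64 pages, i.e. the margin must
  have room for 16 such folios before we attempt the large swapin.
- 1M folio: folio_nr_pages() == 256, so max(64, 256) == 256 pages, i.e. room for
  exactly 1 folio.
- With the MEMCG_CHARGE_BATCH * folio_nr_pages(folio) variant, the 1M case would
  instead need 64 * 256 pages == 64M of free margin, which is the "too extreme"
  case above.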

> 
> I believe there’s still room for improvement. For example, if a 64KB charge
> attempt fails, there’s no need to waste time trying 32KB or 16KB. We can
> directly fall back to 4KB, as 32KB and 16KB will also fail based on our
> margin detection logic.
> 

Yes, that makes sense. Would something like the below work to fix that:

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c098fd7f5c5e..0a1ec55cc079 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
                memcg = get_mem_cgroup_from_mm(mm);
        rcu_read_unlock();
 
-       ret = charge_memcg(folio, memcg, gfp);
+       if (folio_test_large(folio) &&
+           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
+               ret = -ENOMEM;
+       else
+               ret = charge_memcg(folio, memcg, gfp);
 
        css_put(&memcg->css);
        return ret;
diff --git a/mm/memory.c b/mm/memory.c
index fecdd044bc0b..b6ce6605dc63 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4123,6 +4123,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
        pte_t *pte;
        gfp_t gfp;
        int order;
+       int ret;
 
        /*
         * If uffd is active for the vma we need per-page fault fidelity to
@@ -4170,9 +4171,13 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
                addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                folio = vma_alloc_folio(gfp, order, vma, addr, true);
                if (folio) {
-                       if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
-                                                           gfp, entry))
+                       ret = mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry);
+                       if (!ret) {
                                return folio;
+                       } else if (ret == -ENOMEM) {
+                               folio_put(folio);
+                               goto fallback;
+                       }
                        folio_put(folio);
                }
                order = next_order(&orders, order);

Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Barry Song 1 month ago
On Thu, Oct 24, 2024 at 9:36 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Thu, Oct 24, 2024 at 8:47 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >
> >
> >
> > On 23/10/2024 19:52, Barry Song wrote:
> > > On Thu, Oct 24, 2024 at 7:31 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > >>
> > >>
> > >>
> > >> On 23/10/2024 19:02, Yosry Ahmed wrote:
> > >>> [..]
> > >>>>>> I suspect the regression occurs because you're running an edge case
> > >>>>>> where the memory cgroup stays nearly full most of the time (this isn't
> > >>>>>> an inherent issue with large folio swap-in). As a result, swapping in
> > >>>>>> mTHP quickly triggers a memcg overflow, causing a swap-out. The
> > >>>>>> next swap-in then recreates the overflow, leading to a repeating
> > >>>>>> cycle.
> > >>>>>>
> > >>>>>
> > >>>>> Yes, agreed! Looking at the swap counters, I think this is what is going
> > >>>>> on as well.
> > >>>>>
> > >>>>>> We need a way to stop the cup from repeatedly filling to the brim and
> > >>>>>> overflowing. While not a definitive fix, the following change might help
> > >>>>>> improve the situation:
> > >>>>>>
> > >>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > >>>>>>
> > >>>>>> index 17af08367c68..f2fa0eeb2d9a 100644
> > >>>>>> --- a/mm/memcontrol.c
> > >>>>>> +++ b/mm/memcontrol.c
> > >>>>>>
> > >>>>>> @@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio
> > >>>>>> *folio, struct mm_struct *mm,
> > >>>>>>                 memcg = get_mem_cgroup_from_mm(mm);
> > >>>>>>         rcu_read_unlock();
> > >>>>>>
> > >>>>>> -       ret = charge_memcg(folio, memcg, gfp);
> > >>>>>> +       if (folio_test_large(folio) && mem_cgroup_margin(memcg) <
> > >>>>>> MEMCG_CHARGE_BATCH)
> > >>>>>> +               ret = -ENOMEM;
> > >>>>>> +       else
> > >>>>>> +               ret = charge_memcg(folio, memcg, gfp);
> > >>>>>>
> > >>>>>>         css_put(&memcg->css);
> > >>>>>>         return ret;
> > >>>>>> }
> > >>>>>>
> > >>>>>
> > >>>>> The diff makes sense to me. Let me test later today and get back to you.
> > >>>>>
> > >>>>> Thanks!
> > >>>>>
> > >>>>>> Please confirm if it makes the kernel build with memcg limitation
> > >>>>>> faster. If so, let's
> > >>>>>> work together to figure out an official patch :-) The above code hasn't consider
> > >>>>>> the parent memcg's overflow, so not an ideal fix.
> > >>>>>>
> > >>>>
> > >>>> Thanks Barry, I think this fixes the regression, and even gives an improvement!
> > >>>> I think the below might be better to do:
> > >>>>
> > >>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > >>>> index c098fd7f5c5e..0a1ec55cc079 100644
> > >>>> --- a/mm/memcontrol.c
> > >>>> +++ b/mm/memcontrol.c
> > >>>> @@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> > >>>>                 memcg = get_mem_cgroup_from_mm(mm);
> > >>>>         rcu_read_unlock();
> > >>>>
> > >>>> -       ret = charge_memcg(folio, memcg, gfp);
> > >>>> +       if (folio_test_large(folio) &&
> > >>>> +           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
> > >>>> +               ret = -ENOMEM;
> > >>>> +       else
> > >>>> +               ret = charge_memcg(folio, memcg, gfp);
> > >>>>
> > >>>>         css_put(&memcg->css);
> > >>>>         return ret;
> > >>>>
> > >>>>
> > >>>> AMD 16K+32K THP=always
> > >>>> metric         mm-unstable      mm-unstable + large folio zswapin series    mm-unstable + large folio zswapin + no swap thrashing fix
> > >>>> real           1m23.038s        1m23.050s                                   1m22.704s
> > >>>> user           53m57.210s       53m53.437s                                  53m52.577s
> > >>>> sys            7m24.592s        7m48.843s                                   7m22.519s
> > >>>> zswpin         612070           999244                                      815934
> > >>>> zswpout        2226403          2347979                                     2054980
> > >>>> pgfault        20667366         20481728                                    20478690
> > >>>> pgmajfault     385887           269117                                      309702
> > >>>>
> > >>>> AMD 16K+32K+64K THP=always
> > >>>> metric         mm-unstable      mm-unstable + large folio zswapin series   mm-unstable + large folio zswapin + no swap thrashing fix
> > >>>> real           1m22.975s        1m23.266s                                  1m22.549s
> > >>>> user           53m51.302s       53m51.069s                                 53m46.471s
> > >>>> sys            7m40.168s        7m57.104s                                  7m25.012s
> > >>>> zswpin         676492           1258573                                    1225703
> > >>>> zswpout        2449839          2714767                                    2899178
> > >>>> pgfault        17540746         17296555                                   17234663
> > >>>> pgmajfault     429629           307495                                     287859
> > >>>>
> > >>>
> > >>> Thanks Usama and Barry for looking into this. It seems like this would
> > >>> fix a regression with large folio swapin regardless of zswap. Can the
> > >>> same result be reproduced on zram without this series?
> > >>
> > >>
> > >> Yes, its a regression in large folio swapin support regardless of zswap/zram.
> > >>
> > >> Need to do 3 tests, one with probably the below diff to remove large folio support,
> > >> one with current upstream and one with upstream + swap thrashing fix.
> > >>
> > >> We only use zswap and dont have a zram setup (and I am a bit lazy to create one :)).
> > >> Any zram volunteers to try this?
> > >
> > > Hi Usama,
> > >
> > > I tried a quick experiment:
> > >
> > > echo 1 > /sys/module/zswap/parameters/enabled
> > > echo 0 > /sys/module/zswap/parameters/enabled
> > >
> > > This was to test the zRAM scenario. Enabling zswap even
> > > once disables mTHP swap-in. :)
> > >
> > > I noticed a similar regression with zRAM alone, but the change resolved
> > > the issue and even sped up the kernel build compared to the setup without
> > > mTHP swap-in.
> >
> > Thanks for trying, this is amazing!
> > >
> > > However, I’m still working on a proper patch to address this. The current
> > > approach:
> > >
> > > mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio))
> > >
> > > isn’t sufficient, as it doesn’t cover cases where group A contains group B, and
> > > we’re operating within group B. The problem occurs not at the boundary of
> > > group B but at the boundary of group A.
> >
> > I am not sure I completely followed this. As MEMCG_CHARGE_BATCH=64, if we are
> > trying to swapin a 16kB page, we basically check if atleast 64/4 = 16 folios can be
> > charged to cgroup, which is reasonable. If we try to swapin a 1M folio, we just
> > check if we can charge atleast 1 folio. Are you saying that checking just 1 folio
> > is not enough in this case and can still cause thrashing, i.e we should check more?
>
> My understanding is that cgroups are hierarchical. Even if we don’t
> hit the memory
>  limit of the folio’s direct memcg, we could still reach the limit of
> one of its parent
> memcgs. Imagine a structure like:
>
> /sys/fs/cgroup/a/b/c/d
>
> If we’re compiling the kernel in d, there’s a chance that while d
> isn’t at its limit, its
> parents (c, b, or a) could be. Currently, the check only applies to d.

To clarify, I mean something like this:

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 17af08367c68..cc6d21848ee8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4530,6 +4530,29 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
 	return 0;
 }

+/*
+ * When the memory cgroup is nearly full, swapping in large folios can
+ * easily lead to swap thrashing, as the memcg operates on the edge of
+ * being full. We maintain a margin to allow for quick fallback to
+ * smaller folios during the swap-in process.
+ */
+static inline bool mem_cgroup_swapin_margin_protected(struct mem_cgroup *memcg,
+		struct folio *folio)
+{
+	unsigned int nr;
+
+	if (!folio_test_large(folio))
+		return false;
+
+	nr = max_t(unsigned int, folio_nr_pages(folio), MEMCG_CHARGE_BATCH);
+	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
+		if (mem_cgroup_margin(memcg) < nr)
+			return true;
+	}
+
+	return false;
+}
+
 /**
  * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
  * @folio: folio to charge.
@@ -4547,7 +4570,8 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
 {
 	struct mem_cgroup *memcg;
 	unsigned short id;
-	int ret;
+	int ret = -ENOMEM;
+	bool margin_prot;

 	if (mem_cgroup_disabled())
 		return 0;
@@ -4557,9 +4581,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
 	memcg = mem_cgroup_from_id(id);
 	if (!memcg || !css_tryget_online(&memcg->css))
 		memcg = get_mem_cgroup_from_mm(mm);
+	margin_prot = mem_cgroup_swapin_margin_protected(memcg, folio);
 	rcu_read_unlock();

-	ret = charge_memcg(folio, memcg, gfp);
+	if (!margin_prot)
+		ret = charge_memcg(folio, memcg, gfp);

 	css_put(&memcg->css);
 	return ret;

>
> >
> > If we want to maintain consitency for all folios another option is
> > mem_cgroup_margin(memcg) < MEMCG_CHARGE_BATCH * folio_nr_pages(folio)
> > but I think this is too extreme, we would be checking if 64M can be charged to
> > cgroup just to swapin 1M.
> >
> > >
> > > I believe there’s still room for improvement. For example, if a 64KB charge
> > > attempt fails, there’s no need to waste time trying 32KB or 16KB. We can
> > > directly fall back to 4KB, as 32KB and 16KB will also fail based on our
> > > margin detection logic.
> > >
> >
> > Yes that makes sense. Would something like below work to fix that:
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index c098fd7f5c5e..0a1ec55cc079 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> >                 memcg = get_mem_cgroup_from_mm(mm);
> >         rcu_read_unlock();
> >
> > -       ret = charge_memcg(folio, memcg, gfp);
> > +       if (folio_test_large(folio) &&
> > +           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
> > +               ret = -ENOMEM;
> > +       else
> > +               ret = charge_memcg(folio, memcg, gfp);
> >
> >         css_put(&memcg->css);
> >         return ret;
> > diff --git a/mm/memory.c b/mm/memory.c
> > index fecdd044bc0b..b6ce6605dc63 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4123,6 +4123,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >         pte_t *pte;
> >         gfp_t gfp;
> >         int order;
> > +       int ret;
> >
> >         /*
> >          * If uffd is active for the vma we need per-page fault fidelity to
> > @@ -4170,9 +4171,13 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >                 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> >                 folio = vma_alloc_folio(gfp, order, vma, addr, true);
> >                 if (folio) {
> > -                       if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> > -                                                           gfp, entry))
> > +                       ret = mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry);
> > +                       if (!ret) {
> >                                 return folio;
> > +                       } else if (ret == -ENOMEM) {
> > +                               folio_put(folio);
> > +                               goto fallback;
> > +                       }
> >                         folio_put(folio);
> >                 }
> >                 order = next_order(&orders, order);
> >
>
> Yes, does it make your kernel build even faster?

Thanks
Barry
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Johannes Weiner 1 month ago
On Thu, Oct 24, 2024 at 12:35:48PM +1300, Barry Song wrote:
> On Thu, Oct 24, 2024 at 9:36 AM Barry Song <21cnbao@gmail.com> wrote:
> >
> > On Thu, Oct 24, 2024 at 8:47 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > >
> > >
> > >
> > > On 23/10/2024 19:52, Barry Song wrote:
> > > > On Thu, Oct 24, 2024 at 7:31 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > > >>
> > > >>
> > > >>
> > > >> On 23/10/2024 19:02, Yosry Ahmed wrote:
> > > >>> [..]
> > > >>>>>> I suspect the regression occurs because you're running an edge case
> > > >>>>>> where the memory cgroup stays nearly full most of the time (this isn't
> > > >>>>>> an inherent issue with large folio swap-in). As a result, swapping in
> > > >>>>>> mTHP quickly triggers a memcg overflow, causing a swap-out. The
> > > >>>>>> next swap-in then recreates the overflow, leading to a repeating
> > > >>>>>> cycle.
> > > >>>>>>
> > > >>>>>
> > > >>>>> Yes, agreed! Looking at the swap counters, I think this is what is going
> > > >>>>> on as well.
> > > >>>>>
> > > >>>>>> We need a way to stop the cup from repeatedly filling to the brim and
> > > >>>>>> overflowing. While not a definitive fix, the following change might help
> > > >>>>>> improve the situation:
> > > >>>>>>
> > > >>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > >>>>>>
> > > >>>>>> index 17af08367c68..f2fa0eeb2d9a 100644
> > > >>>>>> --- a/mm/memcontrol.c
> > > >>>>>> +++ b/mm/memcontrol.c
> > > >>>>>>
> > > >>>>>> @@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio
> > > >>>>>> *folio, struct mm_struct *mm,
> > > >>>>>>                 memcg = get_mem_cgroup_from_mm(mm);
> > > >>>>>>         rcu_read_unlock();
> > > >>>>>>
> > > >>>>>> -       ret = charge_memcg(folio, memcg, gfp);
> > > >>>>>> +       if (folio_test_large(folio) && mem_cgroup_margin(memcg) <
> > > >>>>>> MEMCG_CHARGE_BATCH)
> > > >>>>>> +               ret = -ENOMEM;
> > > >>>>>> +       else
> > > >>>>>> +               ret = charge_memcg(folio, memcg, gfp);
> > > >>>>>>
> > > >>>>>>         css_put(&memcg->css);
> > > >>>>>>         return ret;
> > > >>>>>> }
> > > >>>>>>
> > > >>>>>
> > > >>>>> The diff makes sense to me. Let me test later today and get back to you.
> > > >>>>>
> > > >>>>> Thanks!
> > > >>>>>
> > > >>>>>> Please confirm if it makes the kernel build with memcg limitation
> > > >>>>>> faster. If so, let's
> > > >>>>>> work together to figure out an official patch :-) The above code hasn't consider
> > > >>>>>> the parent memcg's overflow, so not an ideal fix.
> > > >>>>>>
> > > >>>>
> > > >>>> Thanks Barry, I think this fixes the regression, and even gives an improvement!
> > > >>>> I think the below might be better to do:
> > > >>>>
> > > >>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > >>>> index c098fd7f5c5e..0a1ec55cc079 100644
> > > >>>> --- a/mm/memcontrol.c
> > > >>>> +++ b/mm/memcontrol.c
> > > >>>> @@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> > > >>>>                 memcg = get_mem_cgroup_from_mm(mm);
> > > >>>>         rcu_read_unlock();
> > > >>>>
> > > >>>> -       ret = charge_memcg(folio, memcg, gfp);
> > > >>>> +       if (folio_test_large(folio) &&
> > > >>>> +           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
> > > >>>> +               ret = -ENOMEM;
> > > >>>> +       else
> > > >>>> +               ret = charge_memcg(folio, memcg, gfp);
> > > >>>>
> > > >>>>         css_put(&memcg->css);
> > > >>>>         return ret;
> > > >>>>
> > > >>>>
> > > >>>> AMD 16K+32K THP=always
> > > >>>> metric         mm-unstable      mm-unstable + large folio zswapin series    mm-unstable + large folio zswapin + no swap thrashing fix
> > > >>>> real           1m23.038s        1m23.050s                                   1m22.704s
> > > >>>> user           53m57.210s       53m53.437s                                  53m52.577s
> > > >>>> sys            7m24.592s        7m48.843s                                   7m22.519s
> > > >>>> zswpin         612070           999244                                      815934
> > > >>>> zswpout        2226403          2347979                                     2054980
> > > >>>> pgfault        20667366         20481728                                    20478690
> > > >>>> pgmajfault     385887           269117                                      309702
> > > >>>>
> > > >>>> AMD 16K+32K+64K THP=always
> > > >>>> metric         mm-unstable      mm-unstable + large folio zswapin series   mm-unstable + large folio zswapin + no swap thrashing fix
> > > >>>> real           1m22.975s        1m23.266s                                  1m22.549s
> > > >>>> user           53m51.302s       53m51.069s                                 53m46.471s
> > > >>>> sys            7m40.168s        7m57.104s                                  7m25.012s
> > > >>>> zswpin         676492           1258573                                    1225703
> > > >>>> zswpout        2449839          2714767                                    2899178
> > > >>>> pgfault        17540746         17296555                                   17234663
> > > >>>> pgmajfault     429629           307495                                     287859
> > > >>>>
> > > >>>
> > > >>> Thanks Usama and Barry for looking into this. It seems like this would
> > > >>> fix a regression with large folio swapin regardless of zswap. Can the
> > > >>> same result be reproduced on zram without this series?
> > > >>
> > > >>
> > > >> Yes, its a regression in large folio swapin support regardless of zswap/zram.
> > > >>
> > > >> Need to do 3 tests, one with probably the below diff to remove large folio support,
> > > >> one with current upstream and one with upstream + swap thrashing fix.
> > > >>
> > > >> We only use zswap and dont have a zram setup (and I am a bit lazy to create one :)).
> > > >> Any zram volunteers to try this?
> > > >
> > > > Hi Usama,
> > > >
> > > > I tried a quick experiment:
> > > >
> > > > echo 1 > /sys/module/zswap/parameters/enabled
> > > > echo 0 > /sys/module/zswap/parameters/enabled
> > > >
> > > > This was to test the zRAM scenario. Enabling zswap even
> > > > once disables mTHP swap-in. :)
> > > >
> > > > I noticed a similar regression with zRAM alone, but the change resolved
> > > > the issue and even sped up the kernel build compared to the setup without
> > > > mTHP swap-in.
> > >
> > > Thanks for trying, this is amazing!
> > > >
> > > > However, I’m still working on a proper patch to address this. The current
> > > > approach:
> > > >
> > > > mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio))
> > > >
> > > > isn’t sufficient, as it doesn’t cover cases where group A contains group B, and
> > > > we’re operating within group B. The problem occurs not at the boundary of
> > > > group B but at the boundary of group A.
> > >
> > > I am not sure I completely followed this. As MEMCG_CHARGE_BATCH=64, if we are
> > > trying to swapin a 16kB page, we basically check if atleast 64/4 = 16 folios can be
> > > charged to cgroup, which is reasonable. If we try to swapin a 1M folio, we just
> > > check if we can charge atleast 1 folio. Are you saying that checking just 1 folio
> > > is not enough in this case and can still cause thrashing, i.e we should check more?
> >
> > My understanding is that cgroups are hierarchical. Even if we don’t
> > hit the memory
> >  limit of the folio’s direct memcg, we could still reach the limit of
> > one of its parent
> > memcgs. Imagine a structure like:
> >
> > /sys/fs/cgroup/a/b/c/d
> >
> > If we’re compiling the kernel in d, there’s a chance that while d
> > isn’t at its limit, its
> > parents (c, b, or a) could be. Currently, the check only applies to d.
> 
> To clarify, I mean something like this:
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 17af08367c68..cc6d21848ee8 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4530,6 +4530,29 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
>  	return 0;
>  }
> 
> +/*
> + * When the memory cgroup is nearly full, swapping in large folios can
> + * easily lead to swap thrashing, as the memcg operates on the edge of
> + * being full. We maintain a margin to allow for quick fallback to
> + * smaller folios during the swap-in process.
> + */
> +static inline bool mem_cgroup_swapin_margin_protected(struct mem_cgroup *memcg,
> +		struct folio *folio)
> +{
> +	unsigned int nr;
> +
> +	if (!folio_test_large(folio))
> +		return false;
> +
> +	nr = max_t(unsigned int, folio_nr_pages(folio), MEMCG_CHARGE_BATCH);
> +	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
> +		if (mem_cgroup_margin(memcg) < nr)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
>  /**
>   * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
>   * @folio: folio to charge.
> @@ -4547,7 +4570,8 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
>  {
>  	struct mem_cgroup *memcg;
>  	unsigned short id;
> -	int ret;
> +	int ret = -ENOMEM;
> +	bool margin_prot;
> 
>  	if (mem_cgroup_disabled())
>  		return 0;
> @@ -4557,9 +4581,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
>  	memcg = mem_cgroup_from_id(id);
>  	if (!memcg || !css_tryget_online(&memcg->css))
>  		memcg = get_mem_cgroup_from_mm(mm);
> +	margin_prot = mem_cgroup_swapin_margin_protected(memcg, folio);
>  	rcu_read_unlock();
> 
> -	ret = charge_memcg(folio, memcg, gfp);
> +	if (!margin_prot)
> +		ret = charge_memcg(folio, memcg, gfp);
> 
>  	css_put(&memcg->css);
>  	return ret;

I'm not quite following.

The charging code DOES the margin check. If you just want to avoid
reclaim, pass gfp without __GFP_DIRECT_RECLAIM, and it will return
-ENOMEM if there is no margin.

alloc_swap_folio() passes the THP mask, which should not include the
reclaim flag per default (GFP_TRANSHUGE_LIGHT). Unless you run with
defrag=always. Is that what's going on?
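
For reference, a minimal sketch of the mechanism being described (not a tested
patch; it reuses mem_cgroup_swapin_charge_folio() as it appears in this series):
with __GFP_DIRECT_RECLAIM cleared, the charge attempt returns -ENOMEM when there
is no margin, so the caller can fall back to a smaller order instead of reclaiming.

	/* sketch only: attempt the large folio charge without direct reclaim */
	if (folio_test_large(folio))
		ret = mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
				gfp & ~__GFP_DIRECT_RECLAIM, entry);
	else
		ret = mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
				gfp, entry);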
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Barry Song 1 month ago
On Fri, Oct 25, 2024 at 3:29 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Thu, Oct 24, 2024 at 12:35:48PM +1300, Barry Song wrote:
> > On Thu, Oct 24, 2024 at 9:36 AM Barry Song <21cnbao@gmail.com> wrote:
> > >
> > > On Thu, Oct 24, 2024 at 8:47 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > > >
> > > >
> > > >
> > > > On 23/10/2024 19:52, Barry Song wrote:
> > > > > On Thu, Oct 24, 2024 at 7:31 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > > > >>
> > > > >>
> > > > >>
> > > > >> On 23/10/2024 19:02, Yosry Ahmed wrote:
> > > > >>> [..]
> > > > >>>>>> I suspect the regression occurs because you're running an edge case
> > > > >>>>>> where the memory cgroup stays nearly full most of the time (this isn't
> > > > >>>>>> an inherent issue with large folio swap-in). As a result, swapping in
> > > > >>>>>> mTHP quickly triggers a memcg overflow, causing a swap-out. The
> > > > >>>>>> next swap-in then recreates the overflow, leading to a repeating
> > > > >>>>>> cycle.
> > > > >>>>>>
> > > > >>>>>
> > > > >>>>> Yes, agreed! Looking at the swap counters, I think this is what is going
> > > > >>>>> on as well.
> > > > >>>>>
> > > > >>>>>> We need a way to stop the cup from repeatedly filling to the brim and
> > > > >>>>>> overflowing. While not a definitive fix, the following change might help
> > > > >>>>>> improve the situation:
> > > > >>>>>>
> > > > >>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > >>>>>>
> > > > >>>>>> index 17af08367c68..f2fa0eeb2d9a 100644
> > > > >>>>>> --- a/mm/memcontrol.c
> > > > >>>>>> +++ b/mm/memcontrol.c
> > > > >>>>>>
> > > > >>>>>> @@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio
> > > > >>>>>> *folio, struct mm_struct *mm,
> > > > >>>>>>                 memcg = get_mem_cgroup_from_mm(mm);
> > > > >>>>>>         rcu_read_unlock();
> > > > >>>>>>
> > > > >>>>>> -       ret = charge_memcg(folio, memcg, gfp);
> > > > >>>>>> +       if (folio_test_large(folio) && mem_cgroup_margin(memcg) <
> > > > >>>>>> MEMCG_CHARGE_BATCH)
> > > > >>>>>> +               ret = -ENOMEM;
> > > > >>>>>> +       else
> > > > >>>>>> +               ret = charge_memcg(folio, memcg, gfp);
> > > > >>>>>>
> > > > >>>>>>         css_put(&memcg->css);
> > > > >>>>>>         return ret;
> > > > >>>>>> }
> > > > >>>>>>
> > > > >>>>>
> > > > >>>>> The diff makes sense to me. Let me test later today and get back to you.
> > > > >>>>>
> > > > >>>>> Thanks!
> > > > >>>>>
> > > > >>>>>> Please confirm if it makes the kernel build with memcg limitation
> > > > >>>>>> faster. If so, let's
> > > > >>>>>> work together to figure out an official patch :-) The above code hasn't consider
> > > > >>>>>> the parent memcg's overflow, so not an ideal fix.
> > > > >>>>>>
> > > > >>>>
> > > > >>>> Thanks Barry, I think this fixes the regression, and even gives an improvement!
> > > > >>>> I think the below might be better to do:
> > > > >>>>
> > > > >>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > >>>> index c098fd7f5c5e..0a1ec55cc079 100644
> > > > >>>> --- a/mm/memcontrol.c
> > > > >>>> +++ b/mm/memcontrol.c
> > > > >>>> @@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> > > > >>>>                 memcg = get_mem_cgroup_from_mm(mm);
> > > > >>>>         rcu_read_unlock();
> > > > >>>>
> > > > >>>> -       ret = charge_memcg(folio, memcg, gfp);
> > > > >>>> +       if (folio_test_large(folio) &&
> > > > >>>> +           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
> > > > >>>> +               ret = -ENOMEM;
> > > > >>>> +       else
> > > > >>>> +               ret = charge_memcg(folio, memcg, gfp);
> > > > >>>>
> > > > >>>>         css_put(&memcg->css);
> > > > >>>>         return ret;
> > > > >>>>
> > > > >>>>
> > > > >>>> AMD 16K+32K THP=always
> > > > >>>> metric         mm-unstable      mm-unstable + large folio zswapin series    mm-unstable + large folio zswapin + no swap thrashing fix
> > > > >>>> real           1m23.038s        1m23.050s                                   1m22.704s
> > > > >>>> user           53m57.210s       53m53.437s                                  53m52.577s
> > > > >>>> sys            7m24.592s        7m48.843s                                   7m22.519s
> > > > >>>> zswpin         612070           999244                                      815934
> > > > >>>> zswpout        2226403          2347979                                     2054980
> > > > >>>> pgfault        20667366         20481728                                    20478690
> > > > >>>> pgmajfault     385887           269117                                      309702
> > > > >>>>
> > > > >>>> AMD 16K+32K+64K THP=always
> > > > >>>> metric         mm-unstable      mm-unstable + large folio zswapin series   mm-unstable + large folio zswapin + no swap thrashing fix
> > > > >>>> real           1m22.975s        1m23.266s                                  1m22.549s
> > > > >>>> user           53m51.302s       53m51.069s                                 53m46.471s
> > > > >>>> sys            7m40.168s        7m57.104s                                  7m25.012s
> > > > >>>> zswpin         676492           1258573                                    1225703
> > > > >>>> zswpout        2449839          2714767                                    2899178
> > > > >>>> pgfault        17540746         17296555                                   17234663
> > > > >>>> pgmajfault     429629           307495                                     287859
> > > > >>>>
> > > > >>>
> > > > >>> Thanks Usama and Barry for looking into this. It seems like this would
> > > > >>> fix a regression with large folio swapin regardless of zswap. Can the
> > > > >>> same result be reproduced on zram without this series?
> > > > >>
> > > > >>
> > > > >> Yes, its a regression in large folio swapin support regardless of zswap/zram.
> > > > >>
> > > > >> Need to do 3 tests, one with probably the below diff to remove large folio support,
> > > > >> one with current upstream and one with upstream + swap thrashing fix.
> > > > >>
> > > > >> We only use zswap and dont have a zram setup (and I am a bit lazy to create one :)).
> > > > >> Any zram volunteers to try this?
> > > > >
> > > > > Hi Usama,
> > > > >
> > > > > I tried a quick experiment:
> > > > >
> > > > > echo 1 > /sys/module/zswap/parameters/enabled
> > > > > echo 0 > /sys/module/zswap/parameters/enabled
> > > > >
> > > > > This was to test the zRAM scenario. Enabling zswap even
> > > > > once disables mTHP swap-in. :)
> > > > >
> > > > > I noticed a similar regression with zRAM alone, but the change resolved
> > > > > the issue and even sped up the kernel build compared to the setup without
> > > > > mTHP swap-in.
> > > >
> > > > Thanks for trying, this is amazing!
> > > > >
> > > > > However, I’m still working on a proper patch to address this. The current
> > > > > approach:
> > > > >
> > > > > mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio))
> > > > >
> > > > > isn’t sufficient, as it doesn’t cover cases where group A contains group B, and
> > > > > we’re operating within group B. The problem occurs not at the boundary of
> > > > > group B but at the boundary of group A.
> > > >
> > > > I am not sure I completely followed this. As MEMCG_CHARGE_BATCH=64, if we are
> > > > trying to swapin a 16kB page, we basically check if atleast 64/4 = 16 folios can be
> > > > charged to cgroup, which is reasonable. If we try to swapin a 1M folio, we just
> > > > check if we can charge atleast 1 folio. Are you saying that checking just 1 folio
> > > > is not enough in this case and can still cause thrashing, i.e we should check more?
> > >
> > > My understanding is that cgroups are hierarchical. Even if we don’t
> > > hit the memory
> > >  limit of the folio’s direct memcg, we could still reach the limit of
> > > one of its parent
> > > memcgs. Imagine a structure like:
> > >
> > > /sys/fs/cgroup/a/b/c/d
> > >
> > > If we’re compiling the kernel in d, there’s a chance that while d
> > > isn’t at its limit, its
> > > parents (c, b, or a) could be. Currently, the check only applies to d.
> >
> > To clarify, I mean something like this:
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 17af08367c68..cc6d21848ee8 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -4530,6 +4530,29 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
> >       return 0;
> >  }
> >
> > +/*
> > + * When the memory cgroup is nearly full, swapping in large folios can
> > + * easily lead to swap thrashing, as the memcg operates on the edge of
> > + * being full. We maintain a margin to allow for quick fallback to
> > + * smaller folios during the swap-in process.
> > + */
> > +static inline bool mem_cgroup_swapin_margin_protected(struct mem_cgroup *memcg,
> > +             struct folio *folio)
> > +{
> > +     unsigned int nr;
> > +
> > +     if (!folio_test_large(folio))
> > +             return false;
> > +
> > +     nr = max_t(unsigned int, folio_nr_pages(folio), MEMCG_CHARGE_BATCH);
> > +     for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
> > +             if (mem_cgroup_margin(memcg) < nr)
> > +                     return true;
> > +     }
> > +
> > +     return false;
> > +}
> > +
> >  /**
> >   * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
> >   * @folio: folio to charge.
> > @@ -4547,7 +4570,8 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> >  {
> >       struct mem_cgroup *memcg;
> >       unsigned short id;
> > -     int ret;
> > +     int ret = -ENOMEM;
> > +     bool margin_prot;
> >
> >       if (mem_cgroup_disabled())
> >               return 0;
> > @@ -4557,9 +4581,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> >       memcg = mem_cgroup_from_id(id);
> >       if (!memcg || !css_tryget_online(&memcg->css))
> >               memcg = get_mem_cgroup_from_mm(mm);
> > +     margin_prot = mem_cgroup_swapin_margin_protected(memcg, folio);
> >       rcu_read_unlock();
> >
> > -     ret = charge_memcg(folio, memcg, gfp);
> > +     if (!margin_prot)
> > +             ret = charge_memcg(folio, memcg, gfp);
> >
> >       css_put(&memcg->css);
> >       return ret;
>
> I'm not quite following.
>
> The charging code DOES the margin check. If you just want to avoid
> reclaim, pass gfp without __GFP_DIRECT_RECLAIM, and it will return
> -ENOMEM if there is no margin.
>
> alloc_swap_folio() passes the THP mask, which should not include the
> reclaim flag per default (GFP_TRANSHUGE_LIGHT). Unless you run with
> defrag=always. Is that what's going on?

No, quite sure "defrag=never" can just achieve the same result. Imagine we only
have small folios—each time reclamation occurs, we have at least a
SWAP_CLUSTER_MAX buffer before the next reclamation is triggered.

 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),

However, with large folios, we can quickly exhaust the SWAP_CLUSTER_MAX
buffer and reach the next reclamation point.
Once we consume SWAP_CLUSTER_MAX - 1, the mem_cgroup_swapin_charge_folio()
call for the final small folio with GFP_KERNEL will trigger reclamation.
        if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
                                           GFP_KERNEL, entry)) {
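
As a rough illustration (assuming 4K base pages and SWAP_CLUSTER_MAX == 32):
each memcg reclaim pass targets at least 32 pages, so with order-0 folios the
memcg has headroom for roughly 32 more charges before it is back at the limit,
while a single 64K (16-page) or 1M (256-page) swap-in consumes most or all of
that headroom in one charge and immediately pushes the memcg back to the edge.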

Thanks
Barry
Re: [RFC 0/4] mm: zswap: add support for zswapin of large folios
Posted by Barry Song 1 month ago
On Thu, Oct 24, 2024 at 8:47 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 23/10/2024 19:52, Barry Song wrote:
> > On Thu, Oct 24, 2024 at 7:31 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >>
> >>
> >> On 23/10/2024 19:02, Yosry Ahmed wrote:
> >>> [..]
> >>>>>> I suspect the regression occurs because you're running an edge case
> >>>>>> where the memory cgroup stays nearly full most of the time (this isn't
> >>>>>> an inherent issue with large folio swap-in). As a result, swapping in
> >>>>>> mTHP quickly triggers a memcg overflow, causing a swap-out. The
> >>>>>> next swap-in then recreates the overflow, leading to a repeating
> >>>>>> cycle.
> >>>>>>
> >>>>>
> >>>>> Yes, agreed! Looking at the swap counters, I think this is what is going
> >>>>> on as well.
> >>>>>
> >>>>>> We need a way to stop the cup from repeatedly filling to the brim and
> >>>>>> overflowing. While not a definitive fix, the following change might help
> >>>>>> improve the situation:
> >>>>>>
> >>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >>>>>>
> >>>>>> index 17af08367c68..f2fa0eeb2d9a 100644
> >>>>>> --- a/mm/memcontrol.c
> >>>>>> +++ b/mm/memcontrol.c
> >>>>>>
> >>>>>> @@ -4559,7 +4559,10 @@ int mem_cgroup_swapin_charge_folio(struct folio
> >>>>>> *folio, struct mm_struct *mm,
> >>>>>>                 memcg = get_mem_cgroup_from_mm(mm);
> >>>>>>         rcu_read_unlock();
> >>>>>>
> >>>>>> -       ret = charge_memcg(folio, memcg, gfp);
> >>>>>> +       if (folio_test_large(folio) && mem_cgroup_margin(memcg) <
> >>>>>> MEMCG_CHARGE_BATCH)
> >>>>>> +               ret = -ENOMEM;
> >>>>>> +       else
> >>>>>> +               ret = charge_memcg(folio, memcg, gfp);
> >>>>>>
> >>>>>>         css_put(&memcg->css);
> >>>>>>         return ret;
> >>>>>> }
> >>>>>>
> >>>>>
> >>>>> The diff makes sense to me. Let me test later today and get back to you.
> >>>>>
> >>>>> Thanks!
> >>>>>
> >>>>>> Please confirm if it makes the kernel build with memcg limitation
> >>>>>> faster. If so, let's
> >>>>>> work together to figure out an official patch :-) The above code hasn't consider
> >>>>>> the parent memcg's overflow, so not an ideal fix.
> >>>>>>
> >>>>
> >>>> Thanks Barry, I think this fixes the regression, and even gives an improvement!
> >>>> I think the below might be better to do:
> >>>>
> >>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >>>> index c098fd7f5c5e..0a1ec55cc079 100644
> >>>> --- a/mm/memcontrol.c
> >>>> +++ b/mm/memcontrol.c
> >>>> @@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> >>>>                 memcg = get_mem_cgroup_from_mm(mm);
> >>>>         rcu_read_unlock();
> >>>>
> >>>> -       ret = charge_memcg(folio, memcg, gfp);
> >>>> +       if (folio_test_large(folio) &&
> >>>> +           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
> >>>> +               ret = -ENOMEM;
> >>>> +       else
> >>>> +               ret = charge_memcg(folio, memcg, gfp);
> >>>>
> >>>>         css_put(&memcg->css);
> >>>>         return ret;
> >>>>
> >>>>
> >>>> AMD 16K+32K THP=always
> >>>> metric         mm-unstable      mm-unstable + large folio zswapin series    mm-unstable + large folio zswapin + no swap thrashing fix
> >>>> real           1m23.038s        1m23.050s                                   1m22.704s
> >>>> user           53m57.210s       53m53.437s                                  53m52.577s
> >>>> sys            7m24.592s        7m48.843s                                   7m22.519s
> >>>> zswpin         612070           999244                                      815934
> >>>> zswpout        2226403          2347979                                     2054980
> >>>> pgfault        20667366         20481728                                    20478690
> >>>> pgmajfault     385887           269117                                      309702
> >>>>
> >>>> AMD 16K+32K+64K THP=always
> >>>> metric         mm-unstable      mm-unstable + large folio zswapin series   mm-unstable + large folio zswapin + no swap thrashing fix
> >>>> real           1m22.975s        1m23.266s                                  1m22.549s
> >>>> user           53m51.302s       53m51.069s                                 53m46.471s
> >>>> sys            7m40.168s        7m57.104s                                  7m25.012s
> >>>> zswpin         676492           1258573                                    1225703
> >>>> zswpout        2449839          2714767                                    2899178
> >>>> pgfault        17540746         17296555                                   17234663
> >>>> pgmajfault     429629           307495                                     287859
> >>>>
> >>>
> >>> Thanks Usama and Barry for looking into this. It seems like this would
> >>> fix a regression with large folio swapin regardless of zswap. Can the
> >>> same result be reproduced on zram without this series?
> >>
> >>
> >> Yes, its a regression in large folio swapin support regardless of zswap/zram.
> >>
> >> Need to do 3 tests, one with probably the below diff to remove large folio support,
> >> one with current upstream and one with upstream + swap thrashing fix.
> >>
> >> We only use zswap and dont have a zram setup (and I am a bit lazy to create one :)).
> >> Any zram volunteers to try this?
> >
> > Hi Usama,
> >
> > I tried a quick experiment:
> >
> > echo 1 > /sys/module/zswap/parameters/enabled
> > echo 0 > /sys/module/zswap/parameters/enabled
> >
> > This was to test the zRAM scenario. Enabling zswap even
> > once disables mTHP swap-in. :)
> >
> > I noticed a similar regression with zRAM alone, but the change resolved
> > the issue and even sped up the kernel build compared to the setup without
> > mTHP swap-in.
>
> Thanks for trying, this is amazing!
> >
> > However, I’m still working on a proper patch to address this. The current
> > approach:
> >
> > mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio))
> >
> > isn’t sufficient, as it doesn’t cover cases where group A contains group B, and
> > we’re operating within group B. The problem occurs not at the boundary of
> > group B but at the boundary of group A.
>
> I am not sure I completely followed this. As MEMCG_CHARGE_BATCH=64, if we are
> trying to swapin a 16kB page, we basically check if atleast 64/4 = 16 folios can be
> charged to cgroup, which is reasonable. If we try to swapin a 1M folio, we just
> check if we can charge atleast 1 folio. Are you saying that checking just 1 folio
> is not enough in this case and can still cause thrashing, i.e we should check more?

My understanding is that cgroups are hierarchical. Even if we don’t
hit the memory
 limit of the folio’s direct memcg, we could still reach the limit of
one of its parent
memcgs. Imagine a structure like:

/sys/fs/cgroup/a/b/c/d

If we’re compiling the kernel in d, there’s a chance that while d
isn’t at its limit, its
parents (c, b, or a) could be. Currently, the check only applies to d.
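
To make that concrete, a hypothetical cgroup v2 setup where this happens
(paths and limits are illustrative only, mirroring the layout above):

	# d has no limit of its own (memory.max defaults to "max"), but its
	# ancestor a does, so charges made while running in d can still
	# overflow at a's boundary
	mkdir -p /sys/fs/cgroup/a/b/c/d
	echo 4G > /sys/fs/cgroup/a/memory.max            # limit at the ancestor
	echo $$ > /sys/fs/cgroup/a/b/c/d/cgroup.procs    # run the kernel build here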

>
> If we want to maintain consitency for all folios another option is
> mem_cgroup_margin(memcg) < MEMCG_CHARGE_BATCH * folio_nr_pages(folio)
> but I think this is too extreme, we would be checking if 64M can be charged to
> cgroup just to swapin 1M.
>
> >
> > I believe there’s still room for improvement. For example, if a 64KB charge
> > attempt fails, there’s no need to waste time trying 32KB or 16KB. We can
> > directly fall back to 4KB, as 32KB and 16KB will also fail based on our
> > margin detection logic.
> >
>
> Yes that makes sense. Would something like below work to fix that:
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c098fd7f5c5e..0a1ec55cc079 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4550,7 +4550,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
>                 memcg = get_mem_cgroup_from_mm(mm);
>         rcu_read_unlock();
>
> -       ret = charge_memcg(folio, memcg, gfp);
> +       if (folio_test_large(folio) &&
> +           mem_cgroup_margin(memcg) < max(MEMCG_CHARGE_BATCH, folio_nr_pages(folio)))
> +               ret = -ENOMEM;
> +       else
> +               ret = charge_memcg(folio, memcg, gfp);
>
>         css_put(&memcg->css);
>         return ret;
> diff --git a/mm/memory.c b/mm/memory.c
> index fecdd044bc0b..b6ce6605dc63 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4123,6 +4123,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>         pte_t *pte;
>         gfp_t gfp;
>         int order;
> +       int ret;
>
>         /*
>          * If uffd is active for the vma we need per-page fault fidelity to
> @@ -4170,9 +4171,13 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>                 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>                 folio = vma_alloc_folio(gfp, order, vma, addr, true);
>                 if (folio) {
> -                       if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> -                                                           gfp, entry))
> +                       ret = mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry);
> +                       if (!ret) {
>                                 return folio;
> +                       } else if (ret == -ENOMEM) {
> +                               folio_put(folio);
> +                               goto fallback;
> +                       }
>                         folio_put(folio);
>                 }
>                 order = next_order(&orders, order);
>

Yes, does it make your kernel build even faster?

Thanks
Barry
[RFC 1/4] mm/zswap: skip swapcache for swapping in zswap pages
Posted by Usama Arif 1 month, 1 week ago
As mentioned in [1], there is a significant improvement in no
readahead swapin performance for super fast devices when skipping
swapcache.

With large folio zswapin support added in later patches, this path will
also act as "readahead" by swapping in multiple pages into large folios,
further improving performance.

[1] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#m5a792a04dfea20eb7af4c355d00503efe1c86a93

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 include/linux/zswap.h |  6 ++++++
 mm/memory.c           |  3 ++-
 mm/page_io.c          |  1 -
 mm/zswap.c            | 46 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index d961ead91bf1..e418d75db738 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -27,6 +27,7 @@ struct zswap_lruvec_state {
 unsigned long zswap_total_pages(void);
 bool zswap_store(struct folio *folio);
 bool zswap_load(struct folio *folio);
+bool zswap_present_test(swp_entry_t swp, int nr_pages);
 void zswap_invalidate(swp_entry_t swp);
 int zswap_swapon(int type, unsigned long nr_pages);
 void zswap_swapoff(int type);
@@ -49,6 +50,11 @@ static inline bool zswap_load(struct folio *folio)
 	return false;
 }
 
+static inline bool zswap_present_test(swp_entry_t swp, int nr_pages)
+{
+	return false;
+}
+
 static inline void zswap_invalidate(swp_entry_t swp) {}
 static inline int zswap_swapon(int type, unsigned long nr_pages)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 03e5452dd0c0..49d243131169 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4289,7 +4289,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	swapcache = folio;
 
 	if (!folio) {
-		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
+		if ((data_race(si->flags & SWP_SYNCHRONOUS_IO) ||
+		    zswap_present_test(entry, 1)) &&
 		    __swap_count(entry) == 1) {
 			/* skip swapcache */
 			folio = alloc_swap_folio(vmf);
diff --git a/mm/page_io.c b/mm/page_io.c
index 4aa34862676f..2a15b197968a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -602,7 +602,6 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 	unsigned long pflags;
 	bool in_thrashing;
 
-	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 	VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
 
diff --git a/mm/zswap.c b/mm/zswap.c
index 7f00cc918e7c..f4b03071b2fb 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1576,6 +1576,52 @@ bool zswap_store(struct folio *folio)
 	return ret;
 }
 
+static bool swp_offset_in_zswap(unsigned int type, pgoff_t offset)
+{
+	return (offset >> SWAP_ADDRESS_SPACE_SHIFT) <  nr_zswap_trees[type];
+}
+
+/* Returns true if the entire folio is in zswap */
+bool zswap_present_test(swp_entry_t swp, int nr_pages)
+{
+	pgoff_t offset = swp_offset(swp), tree_max_idx;
+	int max_idx = 0, i = 0, tree_offset = 0;
+	unsigned int type = swp_type(swp);
+	struct zswap_entry *entry = NULL;
+	struct xarray *tree;
+
+	while (i < nr_pages) {
+		tree_offset = offset + i;
+		/* Check if the tree exists. */
+		if (!swp_offset_in_zswap(type, tree_offset))
+			return false;
+
+		tree = swap_zswap_tree(swp_entry(type, tree_offset));
+		XA_STATE(xas, tree, tree_offset);
+
+		tree_max_idx = tree_offset % SWAP_ADDRESS_SPACE_PAGES ?
+			ALIGN(tree_offset, SWAP_ADDRESS_SPACE_PAGES) :
+			ALIGN(tree_offset + 1, SWAP_ADDRESS_SPACE_PAGES);
+		max_idx = min(offset + nr_pages, tree_max_idx) - 1;
+		rcu_read_lock();
+		xas_for_each(&xas, entry, max_idx) {
+			if (xas_retry(&xas, entry))
+				continue;
+			i++;
+		}
+		rcu_read_unlock();
+		/*
+		 * If xas_for_each exits because entry is NULL and
+		 * the number of entries checked are less then max idx,
+		 * then zswap does not contain the entire folio.
+		 */
+		if (!entry && offset + i <= max_idx)
+			return false;
+	}
+
+	return true;
+}
+
 bool zswap_load(struct folio *folio)
 {
 	swp_entry_t swp = folio->swap;
-- 
2.43.5
Re: [RFC 1/4] mm/zswap: skip swapcache for swapping in zswap pages
Posted by Yosry Ahmed 1 month ago
On Fri, Oct 18, 2024 at 3:50 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
> As mentioned in [1], there is a significant improvement in no
> readahead swapin performance for super fast devices when skipping
> swapcache.
>
> With large folio zswapin support added in later patches, this will also
> mean this path will also act as "readahead" by swapping in multiple
> pages into large folios. further improving performance.
>
> [1] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#m5a792a04dfea20eb7af4c355d00503efe1c86a93
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>  include/linux/zswap.h |  6 ++++++
>  mm/memory.c           |  3 ++-
>  mm/page_io.c          |  1 -
>  mm/zswap.c            | 46 +++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 54 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index d961ead91bf1..e418d75db738 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -27,6 +27,7 @@ struct zswap_lruvec_state {
>  unsigned long zswap_total_pages(void);
>  bool zswap_store(struct folio *folio);
>  bool zswap_load(struct folio *folio);
> +bool zswap_present_test(swp_entry_t swp, int nr_pages);
>  void zswap_invalidate(swp_entry_t swp);
>  int zswap_swapon(int type, unsigned long nr_pages);
>  void zswap_swapoff(int type);
> @@ -49,6 +50,11 @@ static inline bool zswap_load(struct folio *folio)
>         return false;
>  }
>
> +static inline bool zswap_present_test(swp_entry_t swp, int nr_pages)
> +{
> +       return false;
> +}
> +
>  static inline void zswap_invalidate(swp_entry_t swp) {}
>  static inline int zswap_swapon(int type, unsigned long nr_pages)
>  {
> diff --git a/mm/memory.c b/mm/memory.c
> index 03e5452dd0c0..49d243131169 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4289,7 +4289,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>         swapcache = folio;
>
>         if (!folio) {
> -               if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
> +               if ((data_race(si->flags & SWP_SYNCHRONOUS_IO) ||
> +                   zswap_present_test(entry, 1)) &&
>                     __swap_count(entry) == 1) {
>                         /* skip swapcache */
>                         folio = alloc_swap_folio(vmf);
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 4aa34862676f..2a15b197968a 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -602,7 +602,6 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
>         unsigned long pflags;
>         bool in_thrashing;
>
> -       VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
>         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
>         VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 7f00cc918e7c..f4b03071b2fb 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1576,6 +1576,52 @@ bool zswap_store(struct folio *folio)
>         return ret;
>  }
>
> +static bool swp_offset_in_zswap(unsigned int type, pgoff_t offset)
> +{
> +       return (offset >> SWAP_ADDRESS_SPACE_SHIFT) <  nr_zswap_trees[type];
> +}
> +
> +/* Returns true if the entire folio is in zswap */
> +bool zswap_present_test(swp_entry_t swp, int nr_pages)

Also, did you check how the performance changes if we bring back the
bitmap of present entries (i.e. what used to be frontswap's bitmap)
instead of the tree lookups here?

> +{
> +       pgoff_t offset = swp_offset(swp), tree_max_idx;
> +       int max_idx = 0, i = 0, tree_offset = 0;
> +       unsigned int type = swp_type(swp);
> +       struct zswap_entry *entry = NULL;
> +       struct xarray *tree;
> +
> +       while (i < nr_pages) {
> +               tree_offset = offset + i;
> +               /* Check if the tree exists. */
> +               if (!swp_offset_in_zswap(type, tree_offset))
> +                       return false;
> +
> +               tree = swap_zswap_tree(swp_entry(type, tree_offset));
> +               XA_STATE(xas, tree, tree_offset);
> +
> +               tree_max_idx = tree_offset % SWAP_ADDRESS_SPACE_PAGES ?
> +                       ALIGN(tree_offset, SWAP_ADDRESS_SPACE_PAGES) :
> +                       ALIGN(tree_offset + 1, SWAP_ADDRESS_SPACE_PAGES);
> +               max_idx = min(offset + nr_pages, tree_max_idx) - 1;
> +               rcu_read_lock();
> +               xas_for_each(&xas, entry, max_idx) {
> +                       if (xas_retry(&xas, entry))
> +                               continue;
> +                       i++;
> +               }
> +               rcu_read_unlock();
> +               /*
> +                * If xas_for_each exits because entry is NULL and
> +                * the number of entries checked are less then max idx,
> +                * then zswap does not contain the entire folio.
> +                */
> +               if (!entry && offset + i <= max_idx)
> +                       return false;
> +       }
> +
> +       return true;
> +}
> +
>  bool zswap_load(struct folio *folio)
>  {
>         swp_entry_t swp = folio->swap;
> --
> 2.43.5
>
Re: [RFC 1/4] mm/zswap: skip swapcache for swapping in zswap pages
Posted by Usama Arif 1 month ago

On 21/10/2024 22:11, Yosry Ahmed wrote:
> On Fri, Oct 18, 2024 at 3:50 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>> As mentioned in [1], there is a significant improvement in no
>> readahead swapin performance for super fast devices when skipping
>> swapcache.
>>
>> With large folio zswapin support added in later patches, this will also
>> mean this path will also act as "readahead" by swapping in multiple
>> pages into large folios. further improving performance.
>>
>> [1] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#m5a792a04dfea20eb7af4c355d00503efe1c86a93
>>
>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>> ---
>>  include/linux/zswap.h |  6 ++++++
>>  mm/memory.c           |  3 ++-
>>  mm/page_io.c          |  1 -
>>  mm/zswap.c            | 46 +++++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 54 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
>> index d961ead91bf1..e418d75db738 100644
>> --- a/include/linux/zswap.h
>> +++ b/include/linux/zswap.h
>> @@ -27,6 +27,7 @@ struct zswap_lruvec_state {
>>  unsigned long zswap_total_pages(void);
>>  bool zswap_store(struct folio *folio);
>>  bool zswap_load(struct folio *folio);
>> +bool zswap_present_test(swp_entry_t swp, int nr_pages);
>>  void zswap_invalidate(swp_entry_t swp);
>>  int zswap_swapon(int type, unsigned long nr_pages);
>>  void zswap_swapoff(int type);
>> @@ -49,6 +50,11 @@ static inline bool zswap_load(struct folio *folio)
>>         return false;
>>  }
>>
>> +static inline bool zswap_present_test(swp_entry_t swp, int nr_pages)
>> +{
>> +       return false;
>> +}
>> +
>>  static inline void zswap_invalidate(swp_entry_t swp) {}
>>  static inline int zswap_swapon(int type, unsigned long nr_pages)
>>  {
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 03e5452dd0c0..49d243131169 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4289,7 +4289,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>         swapcache = folio;
>>
>>         if (!folio) {
>> -               if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
>> +               if ((data_race(si->flags & SWP_SYNCHRONOUS_IO) ||
>> +                   zswap_present_test(entry, 1)) &&
>>                     __swap_count(entry) == 1) {
>>                         /* skip swapcache */
>>                         folio = alloc_swap_folio(vmf);
>> diff --git a/mm/page_io.c b/mm/page_io.c
>> index 4aa34862676f..2a15b197968a 100644
>> --- a/mm/page_io.c
>> +++ b/mm/page_io.c
>> @@ -602,7 +602,6 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
>>         unsigned long pflags;
>>         bool in_thrashing;
>>
>> -       VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
>>         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
>>         VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 7f00cc918e7c..f4b03071b2fb 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -1576,6 +1576,52 @@ bool zswap_store(struct folio *folio)
>>         return ret;
>>  }
>>
>> +static bool swp_offset_in_zswap(unsigned int type, pgoff_t offset)
>> +{
>> +       return (offset >> SWAP_ADDRESS_SPACE_SHIFT) <  nr_zswap_trees[type];
>> +}
>> +
>> +/* Returns true if the entire folio is in zswap */
>> +bool zswap_present_test(swp_entry_t swp, int nr_pages)
> 
> Also, did you check how the performance changes if we bring back the
> bitmap of present entries (i.e. what used to be frontswap's bitmap)
> instead of the tree lookups here?
> 

I think the cost of the tree lookup is small compared to zswap_decompress
and can probably be ignored. zswap_present_test() is essentially just an
xa_load() for the first entry, and then xas_next_entry() for subsequent
entries, which is even cheaper than xa_load().
Re: [RFC 1/4] mm/zswap: skip swapcache for swapping in zswap pages
Posted by Yosry Ahmed 1 month ago
[..]
> >> diff --git a/mm/zswap.c b/mm/zswap.c
> >> index 7f00cc918e7c..f4b03071b2fb 100644
> >> --- a/mm/zswap.c
> >> +++ b/mm/zswap.c
> >> @@ -1576,6 +1576,52 @@ bool zswap_store(struct folio *folio)
> >>         return ret;
> >>  }
> >>
> >> +static bool swp_offset_in_zswap(unsigned int type, pgoff_t offset)
> >> +{
> >> +       return (offset >> SWAP_ADDRESS_SPACE_SHIFT) <  nr_zswap_trees[type];
> >> +}
> >> +
> >> +/* Returns true if the entire folio is in zswap */
> >> +bool zswap_present_test(swp_entry_t swp, int nr_pages)
> >
> > Also, did you check how the performance changes if we bring back the
> > bitmap of present entries (i.e. what used to be frontswap's bitmap)
> > instead of the tree lookups here?
> >
>
> I think the cost of tree lookup is not much and compared to zswap_decompress
> can probably be ignored. zswap_present_test is essentially just xa_load for
> the first entry, and then xas_next_entry for subsequent entries which is even
> cheaper than xa_load.

Maybe it's worth measuring if it's not too much work. IIUC there is a
regression that we don't fully understand with this series, and the
extra lookup may be contributing to that. I think it could be just
fine, but I can't tell without numbers :)
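
For concreteness, one possible shape of the bitmap being discussed (names are
hypothetical and this is only a sketch, ignoring synchronization against
concurrent zswap_store()/zswap_invalidate()):

	/* hypothetical per-swapfile bitmap, allocated in zswap_swapon() */
	static unsigned long *zswap_present_bitmap[MAX_SWAPFILES];

	/* set_bit() on successful store, clear_bit() on invalidate/writeback */

	bool zswap_present_test(swp_entry_t swp, int nr_pages)
	{
		unsigned long *map = zswap_present_bitmap[swp_type(swp)];
		pgoff_t offset = swp_offset(swp);
		int i;

		for (i = 0; i < nr_pages; i++)
			if (!map || !test_bit(offset + i, map))
				return false;
		return true;
	}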
Re: [RFC 1/4] mm/zswap: skip swapcache for swapping in zswap pages
Posted by Yosry Ahmed 1 month ago
On Fri, Oct 18, 2024 at 3:50 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
> As mentioned in [1], there is a significant improvement in no
> readahead swapin performance for super fast devices when skipping
> swapcache.

FYI, Kairui was working on removing the swapcache bypass completely,
which I think may be a good thing:
https://lore.kernel.org/lkml/20240326185032.72159-1-ryncsn@gmail.com/

However, that series is old (it predates large folio swapin support), so
I am not sure if/when he intends to refresh it.

In his approach there is still a swapin path for synchronous swapin
though, which we can still utilize for zswap.

>
> With large folio zswapin support added in later patches, this will also
> mean this path will also act as "readahead" by swapping in multiple
> pages into large folios. further improving performance.
>
> [1] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#m5a792a04dfea20eb7af4c355d00503efe1c86a93
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>  include/linux/zswap.h |  6 ++++++
>  mm/memory.c           |  3 ++-
>  mm/page_io.c          |  1 -
>  mm/zswap.c            | 46 +++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 54 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index d961ead91bf1..e418d75db738 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -27,6 +27,7 @@ struct zswap_lruvec_state {
>  unsigned long zswap_total_pages(void);
>  bool zswap_store(struct folio *folio);
>  bool zswap_load(struct folio *folio);
> +bool zswap_present_test(swp_entry_t swp, int nr_pages);
>  void zswap_invalidate(swp_entry_t swp);
>  int zswap_swapon(int type, unsigned long nr_pages);
>  void zswap_swapoff(int type);
> @@ -49,6 +50,11 @@ static inline bool zswap_load(struct folio *folio)
>         return false;
>  }
>
> +static inline bool zswap_present_test(swp_entry_t swp, int nr_pages)
> +{
> +       return false;
> +}
> +
>  static inline void zswap_invalidate(swp_entry_t swp) {}
>  static inline int zswap_swapon(int type, unsigned long nr_pages)
>  {
> diff --git a/mm/memory.c b/mm/memory.c
> index 03e5452dd0c0..49d243131169 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4289,7 +4289,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>         swapcache = folio;
>
>         if (!folio) {
> -               if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
> +               if ((data_race(si->flags & SWP_SYNCHRONOUS_IO) ||
> +                   zswap_present_test(entry, 1)) &&
>                     __swap_count(entry) == 1) {
>                         /* skip swapcache */
>                         folio = alloc_swap_folio(vmf);
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 4aa34862676f..2a15b197968a 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -602,7 +602,6 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
>         unsigned long pflags;
>         bool in_thrashing;
>
> -       VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
>         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
>         VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 7f00cc918e7c..f4b03071b2fb 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1576,6 +1576,52 @@ bool zswap_store(struct folio *folio)
>         return ret;
>  }
>
> +static bool swp_offset_in_zswap(unsigned int type, pgoff_t offset)
> +{
> +       return (offset >> SWAP_ADDRESS_SPACE_SHIFT) <  nr_zswap_trees[type];

I am not sure I understand what we are looking for here. When does
this return false? Aren't the zswap trees always allocated during
swapon?

> +}
> +
> +/* Returns true if the entire folio is in zswap */

There isn't really a folio at this point, maybe "Returns true if the
entire range is in zswap"?

Also, this is racy because an exclusive load, invalidation, or
writeback can cause an entry to be removed from zswap. Under what
conditions is this safe? The caller can probably guarantee we don't
race against invalidation, but can we guarantee that concurrent
exclusive loads or writebacks don't happen?

If the answer is yes, this needs to be properly documented.

> +bool zswap_present_test(swp_entry_t swp, int nr_pages)
> +{
> +       pgoff_t offset = swp_offset(swp), tree_max_idx;
> +       int max_idx = 0, i = 0, tree_offset = 0;
> +       unsigned int type = swp_type(swp);
> +       struct zswap_entry *entry = NULL;
> +       struct xarray *tree;
> +
> +       while (i < nr_pages) {
> +               tree_offset = offset + i;
> +               /* Check if the tree exists. */
> +               if (!swp_offset_in_zswap(type, tree_offset))
> +                       return false;
> +
> +               tree = swap_zswap_tree(swp_entry(type, tree_offset));
> +               XA_STATE(xas, tree, tree_offset);

Please do not mix declarations with code.

> +
> +               tree_max_idx = tree_offset % SWAP_ADDRESS_SPACE_PAGES ?
> +                       ALIGN(tree_offset, SWAP_ADDRESS_SPACE_PAGES) :
> +                       ALIGN(tree_offset + 1, SWAP_ADDRESS_SPACE_PAGES);

Does this work if we always use ALIGN(tree_offset + 1,
SWAP_ADDRESS_SPACE_PAGES)?

> +               max_idx = min(offset + nr_pages, tree_max_idx) - 1;
> +               rcu_read_lock();
> +               xas_for_each(&xas, entry, max_idx) {
> +                       if (xas_retry(&xas, entry))
> +                               continue;
> +                       i++;
> +               }
> +               rcu_read_unlock();
> +               /*
> +                * If xas_for_each exits because entry is NULL and

nit: add () to the end of function names (i.e. xas_for_each())

> +                * the number of entries checked are less then max idx,

s/then/than

> +                * then zswap does not contain the entire folio.
> +                */
> +               if (!entry && offset + i <= max_idx)
> +                       return false;
> +       }
> +
> +       return true;
> +}
> +
>  bool zswap_load(struct folio *folio)
>  {
>         swp_entry_t swp = folio->swap;
> --
> 2.43.5
>
Re: [RFC 1/4] mm/zswap: skip swapcache for swapping in zswap pages
Posted by Usama Arif 1 month ago

On 21/10/2024 22:09, Yosry Ahmed wrote:
> On Fri, Oct 18, 2024 at 3:50 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>> As mentioned in [1], there is a significant improvement in no
>> readahead swapin performance for super fast devices when skipping
>> swapcache.
> 
> FYI, Kairui was working on removing the swapcache bypass completely,
> which I think may be a good thing:
> https://lore.kernel.org/lkml/20240326185032.72159-1-ryncsn@gmail.com/
> 
> However, that series is old, since before the large folio swapin
> support, so I am not sure if/when he intends to refresh it.
> 
> In his approach there is still a swapin path for synchronous swapin
> though, which we can still utilize for zswap.
> 
>>
>> With large folio zswapin support added in later patches, this will also
>> mean this path will also act as "readahead" by swapping in multiple
>> pages into large folios. further improving performance.
>>
>> [1] https://lore.kernel.org/all/1505886205-9671-5-git-send-email-minchan@kernel.org/T/#m5a792a04dfea20eb7af4c355d00503efe1c86a93
>>
>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>> ---
>>  include/linux/zswap.h |  6 ++++++
>>  mm/memory.c           |  3 ++-
>>  mm/page_io.c          |  1 -
>>  mm/zswap.c            | 46 +++++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 54 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
>> index d961ead91bf1..e418d75db738 100644
>> --- a/include/linux/zswap.h
>> +++ b/include/linux/zswap.h
>> @@ -27,6 +27,7 @@ struct zswap_lruvec_state {
>>  unsigned long zswap_total_pages(void);
>>  bool zswap_store(struct folio *folio);
>>  bool zswap_load(struct folio *folio);
>> +bool zswap_present_test(swp_entry_t swp, int nr_pages);
>>  void zswap_invalidate(swp_entry_t swp);
>>  int zswap_swapon(int type, unsigned long nr_pages);
>>  void zswap_swapoff(int type);
>> @@ -49,6 +50,11 @@ static inline bool zswap_load(struct folio *folio)
>>         return false;
>>  }
>>
>> +static inline bool zswap_present_test(swp_entry_t swp, int nr_pages)
>> +{
>> +       return false;
>> +}
>> +
>>  static inline void zswap_invalidate(swp_entry_t swp) {}
>>  static inline int zswap_swapon(int type, unsigned long nr_pages)
>>  {
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 03e5452dd0c0..49d243131169 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4289,7 +4289,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>         swapcache = folio;
>>
>>         if (!folio) {
>> -               if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
>> +               if ((data_race(si->flags & SWP_SYNCHRONOUS_IO) ||
>> +                   zswap_present_test(entry, 1)) &&
>>                     __swap_count(entry) == 1) {
>>                         /* skip swapcache */
>>                         folio = alloc_swap_folio(vmf);
>> diff --git a/mm/page_io.c b/mm/page_io.c
>> index 4aa34862676f..2a15b197968a 100644
>> --- a/mm/page_io.c
>> +++ b/mm/page_io.c
>> @@ -602,7 +602,6 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
>>         unsigned long pflags;
>>         bool in_thrashing;
>>
>> -       VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
>>         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
>>         VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 7f00cc918e7c..f4b03071b2fb 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -1576,6 +1576,52 @@ bool zswap_store(struct folio *folio)
>>         return ret;
>>  }
>>
>> +static bool swp_offset_in_zswap(unsigned int type, pgoff_t offset)
>> +{
>> +       return (offset >> SWAP_ADDRESS_SPACE_SHIFT) <  nr_zswap_trees[type];
> 
> I am not sure I understand what we are looking for here. When does
> this return false? Aren't the zswap trees always allocated during
> swapon?
> 

Hi Yosry,

Thanks for the review!

It becomes useful in patch 3 when trying to determine if a large folio can be allocated.

For example, if the swap entry is the last entry of the last tree and 1M folios are enabled
(nr_pages = 256), then without this check the while loop in zswap_present_test() would try
to access a tree that doesn't exist from the 2nd 4K page onwards.
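
To make the boundary case concrete, here is a small standalone sketch of
the arithmetic (the 4-tree swap device is hypothetical, and
SWAP_ADDRESS_SPACE_SHIFT = 14 is assumed to match the kernel definition):

#include <stdio.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14	/* assumed: 16384 slots per tree */

int main(void)
{
	unsigned long nr_trees = 4;	/* hypothetical swap device: 4 trees */
	/* swap entry at the last slot of the last tree */
	unsigned long offset = (nr_trees << SWAP_ADDRESS_SPACE_SHIFT) - 1;
	int nr_pages = 256;		/* 1M folio of 4K pages */

	for (int i = 0; i < nr_pages; i++) {
		unsigned long tree = (offset + i) >> SWAP_ADDRESS_SPACE_SHIFT;

		if (tree >= nr_trees) {
			printf("page %d maps to tree %lu, but only %lu trees exist\n",
			       i, tree, nr_trees);
			return 0;
		}
	}
	printf("all pages map to existing trees\n");
	return 0;
}

Running it shows the overrun already happens at page 1, which is exactly
what swp_offset_in_zswap() guards against.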

>> +}
>> +
>> +/* Returns true if the entire folio is in zswap */
> 
> There isn't really a folio at this point, maybe "Returns true if the
> entire range is in zswap"?

Will change, Thanks!

> 
> Also, this is racy because an exclusive load, invalidation, or
> writeback can cause an entry to be removed from zswap. Under what
> conditions is this safe? The caller can probably guarantee we don't
> race against invalidation, but can we guarantee that concurrent
> exclusive loads or writebacks don't happen?
> 
> If the answer is yes, this needs to be properly documented.

swapcache_prepare() should stop things from becoming racy.

Let's say we are trying to swap in an mTHP of 32 pages:
- T1 is doing do_swap_page(), T2 is doing zswap writeback.
- T1 - checks that the entire 32 pages are in zswap; swapcache_prepare(entry, nr_pages) in do_swap_page() has not been called yet.
- T2 - zswap_writeback_entry() starts and, say, writes page 2 to swap. It calls __read_swap_cache_async() -> swapcache_prepare(), which increments the swap_map count, and writes the page to swap.
- T1 - swapcache_prepare() is then called and fails, and there will be another page fault for it.

I will try and document this better.
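
For example, something along these lines above zswap_present_test()
(the wording here is only a sketch, not part of the posted patch):

/*
 * zswap_present_test() - check whether a contiguous range of swap slots
 * is entirely backed by zswap.
 *
 * The result is only stable if the caller prevents the entries from
 * being invalidated, written back, or exclusively loaded while acting
 * on it, e.g. by holding references via swapcache_prepare(). A racy
 * call (such as the folio-order heuristic at allocation time) must
 * treat the result as a hint and re-check once the backing state is
 * stable.
 */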

> 
>> +bool zswap_present_test(swp_entry_t swp, int nr_pages)
>> +{
>> +       pgoff_t offset = swp_offset(swp), tree_max_idx;
>> +       int max_idx = 0, i = 0, tree_offset = 0;
>> +       unsigned int type = swp_type(swp);
>> +       struct zswap_entry *entry = NULL;
>> +       struct xarray *tree;
>> +
>> +       while (i < nr_pages) {
>> +               tree_offset = offset + i;
>> +               /* Check if the tree exists. */
>> +               if (!swp_offset_in_zswap(type, tree_offset))
>> +                       return false;
>> +
>> +               tree = swap_zswap_tree(swp_entry(type, tree_offset));
>> +               XA_STATE(xas, tree, tree_offset);
> 
> Please do not mix declarations with code.
> 
>> +
>> +               tree_max_idx = tree_offset % SWAP_ADDRESS_SPACE_PAGES ?
>> +                       ALIGN(tree_offset, SWAP_ADDRESS_SPACE_PAGES) :
>> +                       ALIGN(tree_offset + 1, SWAP_ADDRESS_SPACE_PAGES);
> 
> Does this work if we always use ALIGN(tree_offset + 1,
> SWAP_ADDRESS_SPACE_PAGES)?

Yes, I think max_idx = min(offset + nr_pages, ALIGN(tree_offset + 1, SWAP_ADDRESS_SPACE_PAGES)) - 1;
will work. I will test it out, Thanks!
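
For what it's worth, a quick standalone check of the equivalence (ALIGN
is spelled out for power-of-two alignment, and the
SWAP_ADDRESS_SPACE_PAGES value is assumed):

#include <assert.h>
#include <stdio.h>

#define SWAP_ADDRESS_SPACE_PAGES 16384UL	/* assumed: 1 << 14 */
/* power-of-two round-up, as the kernel's ALIGN() does for these values */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	for (unsigned long off = 0; off < 4 * SWAP_ADDRESS_SPACE_PAGES; off++) {
		unsigned long ternary = off % SWAP_ADDRESS_SPACE_PAGES ?
			ALIGN(off, SWAP_ADDRESS_SPACE_PAGES) :
			ALIGN(off + 1, SWAP_ADDRESS_SPACE_PAGES);

		/* the simplified form gives the same tree boundary everywhere */
		assert(ternary == ALIGN(off + 1, SWAP_ADDRESS_SPACE_PAGES));
	}
	printf("simplified form matches the ternary for all tested offsets\n");
	return 0;
}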


> 
>> +               max_idx = min(offset + nr_pages, tree_max_idx) - 1;
>> +               rcu_read_lock();
>> +               xas_for_each(&xas, entry, max_idx) {
>> +                       if (xas_retry(&xas, entry))
>> +                               continue;
>> +                       i++;
>> +               }
>> +               rcu_read_unlock();
>> +               /*
>> +                * If xas_for_each exits because entry is NULL and
> 
> nit: add () to the end of function names (i.e. xas_for_each())
> 
>> +                * the number of entries checked are less then max idx,
> 
> s/then/than
> 
>> +                * then zswap does not contain the entire folio.
>> +                */
>> +               if (!entry && offset + i <= max_idx)
>> +                       return false;
>> +       }
>> +
>> +       return true;
>> +}
>> +
>>  bool zswap_load(struct folio *folio)
>>  {
>>         swp_entry_t swp = folio->swap;
>> --
>> 2.43.5
>>

Re: [RFC 1/4] mm/zswap: skip swapcache for swapping in zswap pages
Posted by Yosry Ahmed 1 month ago
[..]
> >> @@ -1576,6 +1576,52 @@ bool zswap_store(struct folio *folio)
> >>         return ret;
> >>  }
> >>
> >> +static bool swp_offset_in_zswap(unsigned int type, pgoff_t offset)
> >> +{
> >> +       return (offset >> SWAP_ADDRESS_SPACE_SHIFT) <  nr_zswap_trees[type];
> >
> > I am not sure I understand what we are looking for here. When does
> > this return false? Aren't the zswap trees always allocated during
> > swapon?
> >
>
> Hi Yosry,
>
> Thanks for the review!
>
> It becomes useful in patch 3 when trying to determine if a large folio can be allocated.
>
> For e.g. if the swap entry is the last entry of the last tree, and 1M folios are enabled
> (nr_pages = 256), then the while loop in zswap_present_test will try to access a tree
> that doesn't exist from the 2nd 4K page onwards if we dont have this check in
> zswap_present_test.

Doesn't swap_pte_batch() make sure that the range of swap entries
passed here all corresponds to existing swap entries, and those
entries should always have a corresponding zswap tree? How can the
passed in range contain an entry that is not in any zswap tree?

I feel like I am missing something.

>
> >> +}
> >> +
> >> +/* Returns true if the entire folio is in zswap */
> >
> > There isn't really a folio at this point, maybe "Returns true if the
> > entire range is in zswap"?
>
> Will change, Thanks!
>
> >
> > Also, this is racy because an exclusive load, invalidation, or
> > writeback can cause an entry to be removed from zswap. Under what
> > conditions is this safe? The caller can probably guarantee we don't
> > race against invalidation, but can we guarantee that concurrent
> > exclusive loads or writebacks don't happen?
> >
> > If the answer is yes, this needs to be properly documented.
>
> swapcache_prepare should stop things from becoming racy.
>
> lets say trying to swapin a mTHP of size 32 pages:
> - T1 is doing do_swap_page, T2 is doing zswap_writeback.
> - T1 - Check if the entire 32 pages is in zswap, swapcache_prepare(entry, nr_pages) in do_swap_page is not yet called.
> - T2 - zswap_writeback_entry starts and lets say writes page 2 to swap. it calls __read_swap_cache_async -> swapcache_prepare increments swap_map count, writes page to swap.

Can the folio be dropped from the swapcache at this point (e.g. by
reclaim)? If yes, it seems like swapcache_prepare() can succeed and
try to read it from zswap.

> - T1 - swapcache_prepare is then called and fails and then there will be a pagefault again for it.
>
> I will try and document this better.

We need to establish the rules for zswap_present_test() to not be
racy, document them at the definition, establish the safety of racy
callers (i.e. can_swapin_thp()), and document them at the call sites.

>
> >
> >> +bool zswap_present_test(swp_entry_t swp, int nr_pages)
> >> +{
> >> +       pgoff_t offset = swp_offset(swp), tree_max_idx;
> >> +       int max_idx = 0, i = 0, tree_offset = 0;
> >> +       unsigned int type = swp_type(swp);
> >> +       struct zswap_entry *entry = NULL;
> >> +       struct xarray *tree;
> >> +
> >> +       while (i < nr_pages) {
> >> +               tree_offset = offset + i;
> >> +               /* Check if the tree exists. */
> >> +               if (!swp_offset_in_zswap(type, tree_offset))
> >> +                       return false;
> >> +
> >> +               tree = swap_zswap_tree(swp_entry(type, tree_offset));
> >> +               XA_STATE(xas, tree, tree_offset);
> >
> > Please do not mix declarations with code.
> >
> >> +
> >> +               tree_max_idx = tree_offset % SWAP_ADDRESS_SPACE_PAGES ?
> >> +                       ALIGN(tree_offset, SWAP_ADDRESS_SPACE_PAGES) :
> >> +                       ALIGN(tree_offset + 1, SWAP_ADDRESS_SPACE_PAGES);
> >
> > Does this work if we always use ALIGN(tree_offset + 1,
> > SWAP_ADDRESS_SPACE_PAGES)?
>
> Yes, I think max_idx = min(offset + nr_pages, ALIGN(tree_offset + 1, SWAP_ADDRESS_SPACE_PAGES)) - 1;
> will work. I will test it out, Thanks!

Might need to split it over two lines.
Re: [RFC 1/4] mm/zswap: skip swapcache for swapping in zswap pages
Posted by Nhat Pham 1 month ago
On Tue, Oct 22, 2024 at 5:46 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> [..]
> > >> @@ -1576,6 +1576,52 @@ bool zswap_store(struct folio *folio)
> > >>         return ret;
> > >>  }
> > >>
> > >> +static bool swp_offset_in_zswap(unsigned int type, pgoff_t offset)
> > >> +{
> > >> +       return (offset >> SWAP_ADDRESS_SPACE_SHIFT) <  nr_zswap_trees[type];
> > >
> > > I am not sure I understand what we are looking for here. When does
> > > this return false? Aren't the zswap trees always allocated during
> > > swapon?
> > >
> >
> > Hi Yosry,
> >
> > Thanks for the review!
> >
> > It becomes useful in patch 3 when trying to determine if a large folio can be allocated.
> >
> > For e.g. if the swap entry is the last entry of the last tree, and 1M folios are enabled
> > (nr_pages = 256), then the while loop in zswap_present_test will try to access a tree
> > that doesn't exist from the 2nd 4K page onwards if we dont have this check in
> > zswap_present_test.
>
> Doesn't swap_pte_batch() make sure that the range of swap entries
> passed here all corresponds to existing swap entries, and those
> entries should always have a corresponding zswap tree? How can the
> passed in range contain an entry that is not in any zswap tree?
>
> I feel like I am missing something.
>
> >
> > >> +}
> > >> +
> > >> +/* Returns true if the entire folio is in zswap */
> > >
> > > There isn't really a folio at this point, maybe "Returns true if the
> > > entire range is in zswap"?
> >
> > Will change, Thanks!
> >
> > >
> > > Also, this is racy because an exclusive load, invalidation, or
> > > writeback can cause an entry to be removed from zswap. Under what
> > > conditions is this safe? The caller can probably guarantee we don't
> > > race against invalidation, but can we guarantee that concurrent
> > > exclusive loads or writebacks don't happen?
> > >
> > > If the answer is yes, this needs to be properly documented.
> >
> > swapcache_prepare should stop things from becoming racy.
> >
> > lets say trying to swapin a mTHP of size 32 pages:
> > - T1 is doing do_swap_page, T2 is doing zswap_writeback.
> > - T1 - Check if the entire 32 pages is in zswap, swapcache_prepare(entry, nr_pages) in do_swap_page is not yet called.
> > - T2 - zswap_writeback_entry starts and lets say writes page 2 to swap. it calls __read_swap_cache_async -> swapcache_prepare increments swap_map count, writes page to swap.
>
> Can the folio be dropped from the swapcache at this point (e.g. by
> reclaim)? If yes, it seems like swapcache_prepare() can succeed and
> try to read it from zswap.
>

I think you're onto something.

Can this happen: say T2 writes back a couple of tail pages, but not all
of them, then drops everything from the swap cache. Then T1 can definitely
proceed. It would then check again in zswap_load(), which returns
false (thanks to the zswap_present_test() check).

All fine and good so far, but then swap_read_folio() would try
to fall back to reading the entire large folio from the swapfile, which
will contain bogus data in the pages that have not been written back by
T2.

I think the problem is that swap_read_folio() assumes it always succeeds,
and a precondition for a successful read is that the large folio has
no mixed backing state across its subpages, which we fail to guarantee
before entering swap_read_folio(). This can lead to memory corruption.

Buuut, I think all we need to do is check the backing state again
after T1's swapcache_prepare() call. At that point, we are guaranteed
to have a stable backing state. If the check fails there, we can just
exit and fall back to swapping in individual pages.
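
Something along these lines, perhaps (only a sketch of the idea, not a
tested patch: the fallback label is hypothetical, and
swapcache_prepare()/swapcache_clear() are assumed to take the whole
range, as in the series this builds on):

	if (swapcache_prepare(entry, nr_pages))
		goto out;	/* someone else owns (part of) this range */

	/*
	 * Swap map references are held now, so the backing state cannot
	 * change under us. Re-check that the whole range is still in
	 * zswap before swap_read_folio() is called on a large folio.
	 */
	if (nr_pages > 1 && !zswap_present_test(entry, nr_pages)) {
		swapcache_clear(si, entry, nr_pages);
		goto fallback;	/* hypothetical: retry with an order-0 folio */
	}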
Re: [RFC 1/4] mm/zswap: skip swapcache for swapping in zswap pages
Posted by Yosry Ahmed 1 month ago
On Fri, Oct 25, 2024 at 11:19 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Tue, Oct 22, 2024 at 5:46 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > [..]
> > > >> @@ -1576,6 +1576,52 @@ bool zswap_store(struct folio *folio)
> > > >>         return ret;
> > > >>  }
> > > >>
> > > >> +static bool swp_offset_in_zswap(unsigned int type, pgoff_t offset)
> > > >> +{
> > > >> +       return (offset >> SWAP_ADDRESS_SPACE_SHIFT) <  nr_zswap_trees[type];
> > > >
> > > > I am not sure I understand what we are looking for here. When does
> > > > this return false? Aren't the zswap trees always allocated during
> > > > swapon?
> > > >
> > >
> > > Hi Yosry,
> > >
> > > Thanks for the review!
> > >
> > > It becomes useful in patch 3 when trying to determine if a large folio can be allocated.
> > >
> > > For e.g. if the swap entry is the last entry of the last tree, and 1M folios are enabled
> > > (nr_pages = 256), then the while loop in zswap_present_test will try to access a tree
> > > that doesn't exist from the 2nd 4K page onwards if we dont have this check in
> > > zswap_present_test.
> >
> > Doesn't swap_pte_batch() make sure that the range of swap entries
> > passed here all corresponds to existing swap entries, and those
> > entries should always have a corresponding zswap tree? How can the
> > passed in range contain an entry that is not in any zswap tree?
> >
> > I feel like I am missing something.
> >
> > >
> > > >> +}
> > > >> +
> > > >> +/* Returns true if the entire folio is in zswap */
> > > >
> > > > There isn't really a folio at this point, maybe "Returns true if the
> > > > entire range is in zswap"?
> > >
> > > Will change, Thanks!
> > >
> > > >
> > > > Also, this is racy because an exclusive load, invalidation, or
> > > > writeback can cause an entry to be removed from zswap. Under what
> > > > conditions is this safe? The caller can probably guarantee we don't
> > > > race against invalidation, but can we guarantee that concurrent
> > > > exclusive loads or writebacks don't happen?
> > > >
> > > > If the answer is yes, this needs to be properly documented.
> > >
> > > swapcache_prepare should stop things from becoming racy.
> > >
> > > lets say trying to swapin a mTHP of size 32 pages:
> > > - T1 is doing do_swap_page, T2 is doing zswap_writeback.
> > > - T1 - Check if the entire 32 pages is in zswap, swapcache_prepare(entry, nr_pages) in do_swap_page is not yet called.
> > > - T2 - zswap_writeback_entry starts and lets say writes page 2 to swap. it calls __read_swap_cache_async -> swapcache_prepare increments swap_map count, writes page to swap.
> >
> > Can the folio be dropped from the swapcache at this point (e.g. by
> > reclaim)? If yes, it seems like swapcache_prepare() can succeed and
> > try to read it from zswap.
> >
>
> I think you're onto something.
>
> Can this happen: say T2 writebacks a couple of tail pages, but not all
> of them, then drops everything from swap cache. Then T1 can definitely
> proceed. It would then check again in zswap_load(), which returns
> false (thanks to zswap_present()) test.
>
> All fine and good so far, but then in swap_read_folio(), it would try
> to fall back to reading the entire large folio from swapfile, which
> will contain bogus data in pages that have not been written back by
> T2.
>
> I think the problem is swap_read_folio() assumes it always succeeds,
> and a precondition for successful reading is the large folio must have
> no mixed backing state for its subpages, which we fail to guarantee
> before entering swap_read_folio(). This can lead to memory corruption.
>
> Buuut, I think all we need to do is just check the backing state again
> after T1's swapcache_prepare() call. At this point, we are guaranteed
> to have a stable backing state. If it fails here, then we can just
> exit and fall back to individual page swapping in.

I think this should work, but we need to take a closer look at other
things that could go wrong along this path.
[RFC 2/4] mm/zswap: modify zswap_decompress to accept page instead of folio
Posted by Usama Arif 1 month, 1 week ago
This is a prerequisite for zswap_load to be able to decompress
large folios. zswap_load will iterate through each page in a folio
and decompress into it.

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 mm/zswap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index f4b03071b2fb..9cc91ae31116 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -953,7 +953,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
-static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
+static void zswap_decompress(struct zswap_entry *entry, struct page *page)
 {
 	struct zpool *zpool = entry->pool->zpool;
 	struct scatterlist input, output;
@@ -982,7 +982,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 
 	sg_init_one(&input, src, entry->length);
 	sg_init_table(&output, 1);
-	sg_set_folio(&output, folio, PAGE_SIZE, 0);
+	sg_set_page(&output, page, PAGE_SIZE, 0);
 	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
 	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
 	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
@@ -1055,7 +1055,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 		return -ENOMEM;
 	}
 
-	zswap_decompress(entry, folio);
+	zswap_decompress(entry, &folio->page);
 
 	count_vm_event(ZSWPWB);
 	if (entry->objcg)
@@ -1666,7 +1666,7 @@ bool zswap_load(struct folio *folio)
 	if (!entry)
 		return false;
 
-	zswap_decompress(entry, folio);
+	zswap_decompress(entry, &folio->page);
 
 	count_vm_event(ZSWPIN);
 	if (entry->objcg)
-- 
2.43.5
[RFC 3/4] mm/zswap: add support for large folio zswapin
Posted by Usama Arif 1 month, 1 week ago
At the time of folio allocation, alloc_swap_folio() checks if the entire
folio is in zswap to determine the folio order.
During swap_read_folio(), zswap_load() will check if the entire folio
is in zswap, and if it is, it will iterate through the pages in the
folio and decompress them.
This means the benefits of large folios (fewer page faults, batched
PTE and rmap manipulation, reduced lru list, TLB coalescing (for arm64
and amd)) are not lost at swap out when using zswap.
This patch does not add support for hybrid backends (i.e. folios
partly present in swap and zswap).

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 mm/memory.c | 13 +++-------
 mm/zswap.c  | 68 ++++++++++++++++++++++++-----------------------------
 2 files changed, 34 insertions(+), 47 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 49d243131169..75f7b9f5fb32 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4077,13 +4077,14 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 
 	/*
 	 * swap_read_folio() can't handle the case a large folio is hybridly
-	 * from different backends. And they are likely corner cases. Similar
-	 * things might be added once zswap support large folios.
+	 * from different backends. And they are likely corner cases.
 	 */
 	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
 		return false;
 	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
 		return false;
+	if (unlikely(!zswap_present_test(entry, nr_pages)))
+		return false;
 
 	return true;
 }
@@ -4130,14 +4131,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	if (unlikely(userfaultfd_armed(vma)))
 		goto fallback;
 
-	/*
-	 * A large swapped out folio could be partially or fully in zswap. We
-	 * lack handling for such cases, so fallback to swapping in order-0
-	 * folio.
-	 */
-	if (!zswap_never_enabled())
-		goto fallback;
-
 	entry = pte_to_swp_entry(vmf->orig_pte);
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
diff --git a/mm/zswap.c b/mm/zswap.c
index 9cc91ae31116..a5aa86c24060 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1624,59 +1624,53 @@ bool zswap_present_test(swp_entry_t swp, int nr_pages)
 
 bool zswap_load(struct folio *folio)
 {
+	int nr_pages = folio_nr_pages(folio);
 	swp_entry_t swp = folio->swap;
+	unsigned int type = swp_type(swp);
 	pgoff_t offset = swp_offset(swp);
 	bool swapcache = folio_test_swapcache(folio);
-	struct xarray *tree = swap_zswap_tree(swp);
+	struct xarray *tree;
 	struct zswap_entry *entry;
+	int i;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 
 	if (zswap_never_enabled())
 		return false;
 
-	/*
-	 * Large folios should not be swapped in while zswap is being used, as
-	 * they are not properly handled. Zswap does not properly load large
-	 * folios, and a large folio may only be partially in zswap.
-	 *
-	 * Return true without marking the folio uptodate so that an IO error is
-	 * emitted (e.g. do_swap_page() will sigbus).
-	 */
-	if (WARN_ON_ONCE(folio_test_large(folio)))
-		return true;
-
-	/*
-	 * When reading into the swapcache, invalidate our entry. The
-	 * swapcache can be the authoritative owner of the page and
-	 * its mappings, and the pressure that results from having two
-	 * in-memory copies outweighs any benefits of caching the
-	 * compression work.
-	 *
-	 * (Most swapins go through the swapcache. The notable
-	 * exception is the singleton fault on SWP_SYNCHRONOUS_IO
-	 * files, which reads into a private page and may free it if
-	 * the fault fails. We remain the primary owner of the entry.)
-	 */
-	if (swapcache)
-		entry = xa_erase(tree, offset);
-	else
-		entry = xa_load(tree, offset);
-
-	if (!entry)
+	if (!zswap_present_test(folio->swap, nr_pages))
 		return false;
 
-	zswap_decompress(entry, &folio->page);
+	for (i = 0; i < nr_pages; ++i) {
+		tree = swap_zswap_tree(swp_entry(type, offset + i));
+		/*
+		 * When reading into the swapcache, invalidate our entry. The
+		 * swapcache can be the authoritative owner of the page and
+		 * its mappings, and the pressure that results from having two
+		 * in-memory copies outweighs any benefits of caching the
+		 * compression work.
+		 *
+		 * (Swapins with swap count > 1 go through the swapcache.
+		 * For swap count == 1, the swapcache is skipped and we
+		 * remain the primary owner of the entry.)
+		 */
+		if (swapcache)
+			entry = xa_erase(tree, offset + i);
+		else
+			entry = xa_load(tree, offset + i);
 
-	count_vm_event(ZSWPIN);
-	if (entry->objcg)
-		count_objcg_events(entry->objcg, ZSWPIN, 1);
+		zswap_decompress(entry, folio_page(folio, i));
 
-	if (swapcache) {
-		zswap_entry_free(entry);
-		folio_mark_dirty(folio);
+		if (entry->objcg)
+			count_objcg_events(entry->objcg, ZSWPIN, 1);
+		if (swapcache)
+			zswap_entry_free(entry);
 	}
 
+	count_vm_events(ZSWPIN, nr_pages);
+	if (swapcache)
+		folio_mark_dirty(folio);
+
 	folio_mark_uptodate(folio);
 	return true;
 }
-- 
2.43.5
Re: [RFC 3/4] mm/zswap: add support for large folio zswapin
Posted by Barry Song 1 month ago
On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
> At time of folio allocation, alloc_swap_folio checks if the entire
> folio is in zswap to determine folio order.
> During swap_read_folio, zswap_load will check if the entire folio
> is in zswap, and if it is, it will iterate through the pages in
> folio and decompress them.
> This will mean the benefits of large folios (fewer page faults, batched
> PTE and rmap manipulation, reduced lru list, TLB coalescing (for arm64
> and amd) are not lost at swap out when using zswap.
> This patch does not add support for hybrid backends (i.e. folios
> partly present swap and zswap).
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>  mm/memory.c | 13 +++-------
>  mm/zswap.c  | 68 ++++++++++++++++++++++++-----------------------------
>  2 files changed, 34 insertions(+), 47 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 49d243131169..75f7b9f5fb32 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4077,13 +4077,14 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
>
>         /*
>          * swap_read_folio() can't handle the case a large folio is hybridly
> -        * from different backends. And they are likely corner cases. Similar
> -        * things might be added once zswap support large folios.
> +        * from different backends. And they are likely corner cases.
>          */
>         if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
>                 return false;
>         if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
>                 return false;
> +       if (unlikely(!zswap_present_test(entry, nr_pages)))
> +               return false;
>
>         return true;
>  }
> @@ -4130,14 +4131,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>         if (unlikely(userfaultfd_armed(vma)))
>                 goto fallback;
>
> -       /*
> -        * A large swapped out folio could be partially or fully in zswap. We
> -        * lack handling for such cases, so fallback to swapping in order-0
> -        * folio.
> -        */
> -       if (!zswap_never_enabled())
> -               goto fallback;
> -
>         entry = pte_to_swp_entry(vmf->orig_pte);
>         /*
>          * Get a list of all the (large) orders below PMD_ORDER that are enabled
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 9cc91ae31116..a5aa86c24060 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1624,59 +1624,53 @@ bool zswap_present_test(swp_entry_t swp, int nr_pages)
>
>  bool zswap_load(struct folio *folio)
>  {
> +       int nr_pages = folio_nr_pages(folio);
>         swp_entry_t swp = folio->swap;
> +       unsigned int type = swp_type(swp);
>         pgoff_t offset = swp_offset(swp);
>         bool swapcache = folio_test_swapcache(folio);
> -       struct xarray *tree = swap_zswap_tree(swp);
> +       struct xarray *tree;
>         struct zswap_entry *entry;
> +       int i;
>
>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
>
>         if (zswap_never_enabled())
>                 return false;
>
> -       /*
> -        * Large folios should not be swapped in while zswap is being used, as
> -        * they are not properly handled. Zswap does not properly load large
> -        * folios, and a large folio may only be partially in zswap.
> -        *
> -        * Return true without marking the folio uptodate so that an IO error is
> -        * emitted (e.g. do_swap_page() will sigbus).
> -        */
> -       if (WARN_ON_ONCE(folio_test_large(folio)))
> -               return true;
> -
> -       /*
> -        * When reading into the swapcache, invalidate our entry. The
> -        * swapcache can be the authoritative owner of the page and
> -        * its mappings, and the pressure that results from having two
> -        * in-memory copies outweighs any benefits of caching the
> -        * compression work.
> -        *
> -        * (Most swapins go through the swapcache. The notable
> -        * exception is the singleton fault on SWP_SYNCHRONOUS_IO
> -        * files, which reads into a private page and may free it if
> -        * the fault fails. We remain the primary owner of the entry.)
> -        */
> -       if (swapcache)
> -               entry = xa_erase(tree, offset);
> -       else
> -               entry = xa_load(tree, offset);
> -
> -       if (!entry)
> +       if (!zswap_present_test(folio->swap, nr_pages))
>                 return false;

Hi Usama,

Is there any chance that zswap_present_test() returns true
in do_swap_page() but false in zswap_load()? If that’s
possible, could we be missing something? For example,
could it be that zswap has been partially released (with
part of it still present) during an mTHP swap-in?

If this happens with an mTHP, my understanding is that
we shouldn't proceed with reading corrupted data from the
disk backend.

>
> -       zswap_decompress(entry, &folio->page);
> +       for (i = 0; i < nr_pages; ++i) {
> +               tree = swap_zswap_tree(swp_entry(type, offset + i));
> +               /*
> +                * When reading into the swapcache, invalidate our entry. The
> +                * swapcache can be the authoritative owner of the page and
> +                * its mappings, and the pressure that results from having two
> +                * in-memory copies outweighs any benefits of caching the
> +                * compression work.
> +                *
> +                * (Swapins with swap count > 1 go through the swapcache.
> +                * For swap count == 1, the swapcache is skipped and we
> +                * remain the primary owner of the entry.)
> +                */
> +               if (swapcache)
> +                       entry = xa_erase(tree, offset + i);
> +               else
> +                       entry = xa_load(tree, offset + i);
>
> -       count_vm_event(ZSWPIN);
> -       if (entry->objcg)
> -               count_objcg_events(entry->objcg, ZSWPIN, 1);
> +               zswap_decompress(entry, folio_page(folio, i));
>
> -       if (swapcache) {
> -               zswap_entry_free(entry);
> -               folio_mark_dirty(folio);
> +               if (entry->objcg)
> +                       count_objcg_events(entry->objcg, ZSWPIN, 1);
> +               if (swapcache)
> +                       zswap_entry_free(entry);
>         }
>
> +       count_vm_events(ZSWPIN, nr_pages);
> +       if (swapcache)
> +               folio_mark_dirty(folio);
> +
>         folio_mark_uptodate(folio);
>         return true;
>  }
> --
> 2.43.5
>

Thanks
barry
Re: [RFC 3/4] mm/zswap: add support for large folio zswapin
Posted by Usama Arif 1 month ago

On 21/10/2024 06:49, Barry Song wrote:
> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>> At time of folio allocation, alloc_swap_folio checks if the entire
>> folio is in zswap to determine folio order.
>> During swap_read_folio, zswap_load will check if the entire folio
>> is in zswap, and if it is, it will iterate through the pages in
>> folio and decompress them.
>> This will mean the benefits of large folios (fewer page faults, batched
>> PTE and rmap manipulation, reduced lru list, TLB coalescing (for arm64
>> and amd) are not lost at swap out when using zswap.
>> This patch does not add support for hybrid backends (i.e. folios
>> partly present swap and zswap).
>>
>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>> ---
>>  mm/memory.c | 13 +++-------
>>  mm/zswap.c  | 68 ++++++++++++++++++++++++-----------------------------
>>  2 files changed, 34 insertions(+), 47 deletions(-)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 49d243131169..75f7b9f5fb32 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4077,13 +4077,14 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
>>
>>         /*
>>          * swap_read_folio() can't handle the case a large folio is hybridly
>> -        * from different backends. And they are likely corner cases. Similar
>> -        * things might be added once zswap support large folios.
>> +        * from different backends. And they are likely corner cases.
>>          */
>>         if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
>>                 return false;
>>         if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
>>                 return false;
>> +       if (unlikely(!zswap_present_test(entry, nr_pages)))
>> +               return false;
>>
>>         return true;
>>  }
>> @@ -4130,14 +4131,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>>         if (unlikely(userfaultfd_armed(vma)))
>>                 goto fallback;
>>
>> -       /*
>> -        * A large swapped out folio could be partially or fully in zswap. We
>> -        * lack handling for such cases, so fallback to swapping in order-0
>> -        * folio.
>> -        */
>> -       if (!zswap_never_enabled())
>> -               goto fallback;
>> -
>>         entry = pte_to_swp_entry(vmf->orig_pte);
>>         /*
>>          * Get a list of all the (large) orders below PMD_ORDER that are enabled
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 9cc91ae31116..a5aa86c24060 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -1624,59 +1624,53 @@ bool zswap_present_test(swp_entry_t swp, int nr_pages)
>>
>>  bool zswap_load(struct folio *folio)
>>  {
>> +       int nr_pages = folio_nr_pages(folio);
>>         swp_entry_t swp = folio->swap;
>> +       unsigned int type = swp_type(swp);
>>         pgoff_t offset = swp_offset(swp);
>>         bool swapcache = folio_test_swapcache(folio);
>> -       struct xarray *tree = swap_zswap_tree(swp);
>> +       struct xarray *tree;
>>         struct zswap_entry *entry;
>> +       int i;
>>
>>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
>>
>>         if (zswap_never_enabled())
>>                 return false;
>>
>> -       /*
>> -        * Large folios should not be swapped in while zswap is being used, as
>> -        * they are not properly handled. Zswap does not properly load large
>> -        * folios, and a large folio may only be partially in zswap.
>> -        *
>> -        * Return true without marking the folio uptodate so that an IO error is
>> -        * emitted (e.g. do_swap_page() will sigbus).
>> -        */
>> -       if (WARN_ON_ONCE(folio_test_large(folio)))
>> -               return true;
>> -
>> -       /*
>> -        * When reading into the swapcache, invalidate our entry. The
>> -        * swapcache can be the authoritative owner of the page and
>> -        * its mappings, and the pressure that results from having two
>> -        * in-memory copies outweighs any benefits of caching the
>> -        * compression work.
>> -        *
>> -        * (Most swapins go through the swapcache. The notable
>> -        * exception is the singleton fault on SWP_SYNCHRONOUS_IO
>> -        * files, which reads into a private page and may free it if
>> -        * the fault fails. We remain the primary owner of the entry.)
>> -        */
>> -       if (swapcache)
>> -               entry = xa_erase(tree, offset);
>> -       else
>> -               entry = xa_load(tree, offset);
>> -
>> -       if (!entry)
>> +       if (!zswap_present_test(folio->swap, nr_pages))
>>                 return false;
> 
> Hi Usama,
> 
> Is there any chance that zswap_present_test() returns true
> in do_swap_page() but false in zswap_load()? If that’s
> possible, could we be missing something? For example,
> could it be that zswap has been partially released (with
> part of it still present) during an mTHP swap-in?
> 
> If this happens with an mTHP, my understanding is that
> we shouldn't proceed with reading corrupted data from the
> disk backend.
> 

If it's not in the swapcache, the zswap entry is not deleted, so I think
it should be ok?

We can check over here if the entire folio is in zswap,
and if not, return true without marking the folio uptodate
so that an error is raised.


>>
>> -       zswap_decompress(entry, &folio->page);
>> +       for (i = 0; i < nr_pages; ++i) {
>> +               tree = swap_zswap_tree(swp_entry(type, offset + i));
>> +               /*
>> +                * When reading into the swapcache, invalidate our entry. The
>> +                * swapcache can be the authoritative owner of the page and
>> +                * its mappings, and the pressure that results from having two
>> +                * in-memory copies outweighs any benefits of caching the
>> +                * compression work.
>> +                *
>> +                * (Swapins with swap count > 1 go through the swapcache.
>> +                * For swap count == 1, the swapcache is skipped and we
>> +                * remain the primary owner of the entry.)
>> +                */
>> +               if (swapcache)
>> +                       entry = xa_erase(tree, offset + i);
>> +               else
>> +                       entry = xa_load(tree, offset + i);
>>
>> -       count_vm_event(ZSWPIN);
>> -       if (entry->objcg)
>> -               count_objcg_events(entry->objcg, ZSWPIN, 1);
>> +               zswap_decompress(entry, folio_page(folio, i));
>>
>> -       if (swapcache) {
>> -               zswap_entry_free(entry);
>> -               folio_mark_dirty(folio);
>> +               if (entry->objcg)
>> +                       count_objcg_events(entry->objcg, ZSWPIN, 1);
>> +               if (swapcache)
>> +                       zswap_entry_free(entry);
>>         }
>>
>> +       count_vm_events(ZSWPIN, nr_pages);
>> +       if (swapcache)
>> +               folio_mark_dirty(folio);
>> +
>>         folio_mark_uptodate(folio);
>>         return true;
>>  }
>> --
>> 2.43.5
>>
> 
> Thanks
> barry

Re: [RFC 3/4] mm/zswap: add support for large folio zswapin
Posted by Barry Song 1 month ago
On Mon, Oct 21, 2024 at 11:44 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 21/10/2024 06:49, Barry Song wrote:
> > On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >> At time of folio allocation, alloc_swap_folio checks if the entire
> >> folio is in zswap to determine folio order.
> >> During swap_read_folio, zswap_load will check if the entire folio
> >> is in zswap, and if it is, it will iterate through the pages in
> >> folio and decompress them.
> >> This will mean the benefits of large folios (fewer page faults, batched
> >> PTE and rmap manipulation, reduced lru list, TLB coalescing (for arm64
> >> and amd) are not lost at swap out when using zswap.
> >> This patch does not add support for hybrid backends (i.e. folios
> >> partly present swap and zswap).
> >>
> >> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >> ---
> >>  mm/memory.c | 13 +++-------
> >>  mm/zswap.c  | 68 ++++++++++++++++++++++++-----------------------------
> >>  2 files changed, 34 insertions(+), 47 deletions(-)
> >>
> >> diff --git a/mm/memory.c b/mm/memory.c
> >> index 49d243131169..75f7b9f5fb32 100644
> >> --- a/mm/memory.c
> >> +++ b/mm/memory.c
> >> @@ -4077,13 +4077,14 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
> >>
> >>         /*
> >>          * swap_read_folio() can't handle the case a large folio is hybridly
> >> -        * from different backends. And they are likely corner cases. Similar
> >> -        * things might be added once zswap support large folios.
> >> +        * from different backends. And they are likely corner cases.
> >>          */
> >>         if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
> >>                 return false;
> >>         if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
> >>                 return false;
> >> +       if (unlikely(!zswap_present_test(entry, nr_pages)))
> >> +               return false;
> >>
> >>         return true;
> >>  }
> >> @@ -4130,14 +4131,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >>         if (unlikely(userfaultfd_armed(vma)))
> >>                 goto fallback;
> >>
> >> -       /*
> >> -        * A large swapped out folio could be partially or fully in zswap. We
> >> -        * lack handling for such cases, so fallback to swapping in order-0
> >> -        * folio.
> >> -        */
> >> -       if (!zswap_never_enabled())
> >> -               goto fallback;
> >> -
> >>         entry = pte_to_swp_entry(vmf->orig_pte);
> >>         /*
> >>          * Get a list of all the (large) orders below PMD_ORDER that are enabled
> >> diff --git a/mm/zswap.c b/mm/zswap.c
> >> index 9cc91ae31116..a5aa86c24060 100644
> >> --- a/mm/zswap.c
> >> +++ b/mm/zswap.c
> >> @@ -1624,59 +1624,53 @@ bool zswap_present_test(swp_entry_t swp, int nr_pages)
> >>
> >>  bool zswap_load(struct folio *folio)
> >>  {
> >> +       int nr_pages = folio_nr_pages(folio);
> >>         swp_entry_t swp = folio->swap;
> >> +       unsigned int type = swp_type(swp);
> >>         pgoff_t offset = swp_offset(swp);
> >>         bool swapcache = folio_test_swapcache(folio);
> >> -       struct xarray *tree = swap_zswap_tree(swp);
> >> +       struct xarray *tree;
> >>         struct zswap_entry *entry;
> >> +       int i;
> >>
> >>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
> >>
> >>         if (zswap_never_enabled())
> >>                 return false;
> >>
> >> -       /*
> >> -        * Large folios should not be swapped in while zswap is being used, as
> >> -        * they are not properly handled. Zswap does not properly load large
> >> -        * folios, and a large folio may only be partially in zswap.
> >> -        *
> >> -        * Return true without marking the folio uptodate so that an IO error is
> >> -        * emitted (e.g. do_swap_page() will sigbus).
> >> -        */
> >> -       if (WARN_ON_ONCE(folio_test_large(folio)))
> >> -               return true;
> >> -
> >> -       /*
> >> -        * When reading into the swapcache, invalidate our entry. The
> >> -        * swapcache can be the authoritative owner of the page and
> >> -        * its mappings, and the pressure that results from having two
> >> -        * in-memory copies outweighs any benefits of caching the
> >> -        * compression work.
> >> -        *
> >> -        * (Most swapins go through the swapcache. The notable
> >> -        * exception is the singleton fault on SWP_SYNCHRONOUS_IO
> >> -        * files, which reads into a private page and may free it if
> >> -        * the fault fails. We remain the primary owner of the entry.)
> >> -        */
> >> -       if (swapcache)
> >> -               entry = xa_erase(tree, offset);
> >> -       else
> >> -               entry = xa_load(tree, offset);
> >> -
> >> -       if (!entry)
> >> +       if (!zswap_present_test(folio->swap, nr_pages))
> >>                 return false;
> >
> > Hi Usama,
> >
> > Is there any chance that zswap_present_test() returns true
> > in do_swap_page() but false in zswap_load()? If that’s
> > possible, could we be missing something? For example,
> > could it be that zswap has been partially released (with
> > part of it still present) during an mTHP swap-in?
> >
> > If this happens with an mTHP, my understanding is that
> > we shouldn't proceed with reading corrupted data from the
> > disk backend.
> >
>
> If its not swapcache, the zswap entry is not deleted so I think
> it should be ok?
>
> We can check over here if the entire folio is in zswap,
> and if not, return true without marking the folio uptodate
> to give an error.

We have swapcache_prepare() called in do_swap_page(), which should
have protected these entries from being partially freed by other processes
(for example, if someone falls back to small folios for the same address).
Therefore, I believe that zswap_present_test() cannot be false for mTHP in
the current case where only synchronous I/O is supported.

The below might help detect the bug:

if (!zswap_present_test(folio->swap, nr_pages)) {
	if (WARN_ON_ONCE(nr_pages > 1))
		return true;
	return false;
}

The code seems quite ugly :-) Do we have some way to unify the code
for large and small folios?

Not quite sure about shmem, though...

>
>
> >>
> >> -       zswap_decompress(entry, &folio->page);
> >> +       for (i = 0; i < nr_pages; ++i) {
> >> +               tree = swap_zswap_tree(swp_entry(type, offset + i));
> >> +               /*
> >> +                * When reading into the swapcache, invalidate our entry. The
> >> +                * swapcache can be the authoritative owner of the page and
> >> +                * its mappings, and the pressure that results from having two
> >> +                * in-memory copies outweighs any benefits of caching the
> >> +                * compression work.
> >> +                *
> >> +                * (Swapins with swap count > 1 go through the swapcache.
> >> +                * For swap count == 1, the swapcache is skipped and we
> >> +                * remain the primary owner of the entry.)
> >> +                */
> >> +               if (swapcache)
> >> +                       entry = xa_erase(tree, offset + i);
> >> +               else
> >> +                       entry = xa_load(tree, offset + i);
> >>
> >> -       count_vm_event(ZSWPIN);
> >> -       if (entry->objcg)
> >> -               count_objcg_events(entry->objcg, ZSWPIN, 1);
> >> +               zswap_decompress(entry, folio_page(folio, i));
> >>
> >> -       if (swapcache) {
> >> -               zswap_entry_free(entry);
> >> -               folio_mark_dirty(folio);
> >> +               if (entry->objcg)
> >> +                       count_objcg_events(entry->objcg, ZSWPIN, 1);
> >> +               if (swapcache)
> >> +                       zswap_entry_free(entry);
> >>         }
> >>
> >> +       count_vm_events(ZSWPIN, nr_pages);
> >> +       if (swapcache)
> >> +               folio_mark_dirty(folio);
> >> +
> >>         folio_mark_uptodate(folio);
> >>         return true;
> >>  }
> >> --
> >> 2.43.5
> >>
> >

Thanks
barry
Re: [RFC 3/4] mm/zswap: add support for large folio zswapin
Posted by Usama Arif 1 month ago

On 21/10/2024 11:55, Barry Song wrote:
> On Mon, Oct 21, 2024 at 11:44 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>>
>>
>> On 21/10/2024 06:49, Barry Song wrote:
>>> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>
>>>> At time of folio allocation, alloc_swap_folio checks if the entire
>>>> folio is in zswap to determine folio order.
>>>> During swap_read_folio, zswap_load will check if the entire folio
>>>> is in zswap, and if it is, it will iterate through the pages in
>>>> folio and decompress them.
>>>> This will mean the benefits of large folios (fewer page faults, batched
>>>> PTE and rmap manipulation, reduced lru list, TLB coalescing (for arm64
>>>> and amd) are not lost at swap out when using zswap.
>>>> This patch does not add support for hybrid backends (i.e. folios
>>>> partly present swap and zswap).
>>>>
>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>> ---
>>>>  mm/memory.c | 13 +++-------
>>>>  mm/zswap.c  | 68 ++++++++++++++++++++++++-----------------------------
>>>>  2 files changed, 34 insertions(+), 47 deletions(-)
>>>>
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index 49d243131169..75f7b9f5fb32 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -4077,13 +4077,14 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
>>>>
>>>>         /*
>>>>          * swap_read_folio() can't handle the case a large folio is hybridly
>>>> -        * from different backends. And they are likely corner cases. Similar
>>>> -        * things might be added once zswap support large folios.
>>>> +        * from different backends. And they are likely corner cases.
>>>>          */
>>>>         if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
>>>>                 return false;
>>>>         if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
>>>>                 return false;
>>>> +       if (unlikely(!zswap_present_test(entry, nr_pages)))
>>>> +               return false;
>>>>
>>>>         return true;
>>>>  }
>>>> @@ -4130,14 +4131,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>>>>         if (unlikely(userfaultfd_armed(vma)))
>>>>                 goto fallback;
>>>>
>>>> -       /*
>>>> -        * A large swapped out folio could be partially or fully in zswap. We
>>>> -        * lack handling for such cases, so fallback to swapping in order-0
>>>> -        * folio.
>>>> -        */
>>>> -       if (!zswap_never_enabled())
>>>> -               goto fallback;
>>>> -
>>>>         entry = pte_to_swp_entry(vmf->orig_pte);
>>>>         /*
>>>>          * Get a list of all the (large) orders below PMD_ORDER that are enabled
>>>> diff --git a/mm/zswap.c b/mm/zswap.c
>>>> index 9cc91ae31116..a5aa86c24060 100644
>>>> --- a/mm/zswap.c
>>>> +++ b/mm/zswap.c
>>>> @@ -1624,59 +1624,53 @@ bool zswap_present_test(swp_entry_t swp, int nr_pages)
>>>>
>>>>  bool zswap_load(struct folio *folio)
>>>>  {
>>>> +       int nr_pages = folio_nr_pages(folio);
>>>>         swp_entry_t swp = folio->swap;
>>>> +       unsigned int type = swp_type(swp);
>>>>         pgoff_t offset = swp_offset(swp);
>>>>         bool swapcache = folio_test_swapcache(folio);
>>>> -       struct xarray *tree = swap_zswap_tree(swp);
>>>> +       struct xarray *tree;
>>>>         struct zswap_entry *entry;
>>>> +       int i;
>>>>
>>>>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
>>>>
>>>>         if (zswap_never_enabled())
>>>>                 return false;
>>>>
>>>> -       /*
>>>> -        * Large folios should not be swapped in while zswap is being used, as
>>>> -        * they are not properly handled. Zswap does not properly load large
>>>> -        * folios, and a large folio may only be partially in zswap.
>>>> -        *
>>>> -        * Return true without marking the folio uptodate so that an IO error is
>>>> -        * emitted (e.g. do_swap_page() will sigbus).
>>>> -        */
>>>> -       if (WARN_ON_ONCE(folio_test_large(folio)))
>>>> -               return true;
>>>> -
>>>> -       /*
>>>> -        * When reading into the swapcache, invalidate our entry. The
>>>> -        * swapcache can be the authoritative owner of the page and
>>>> -        * its mappings, and the pressure that results from having two
>>>> -        * in-memory copies outweighs any benefits of caching the
>>>> -        * compression work.
>>>> -        *
>>>> -        * (Most swapins go through the swapcache. The notable
>>>> -        * exception is the singleton fault on SWP_SYNCHRONOUS_IO
>>>> -        * files, which reads into a private page and may free it if
>>>> -        * the fault fails. We remain the primary owner of the entry.)
>>>> -        */
>>>> -       if (swapcache)
>>>> -               entry = xa_erase(tree, offset);
>>>> -       else
>>>> -               entry = xa_load(tree, offset);
>>>> -
>>>> -       if (!entry)
>>>> +       if (!zswap_present_test(folio->swap, nr_pages))
>>>>                 return false;
>>>
>>> Hi Usama,
>>>
>>> Is there any chance that zswap_present_test() returns true
>>> in do_swap_page() but false in zswap_load()? If that’s
>>> possible, could we be missing something? For example,
>>> could it be that zswap has been partially released (with
>>> part of it still present) during an mTHP swap-in?
>>>
>>> If this happens with an mTHP, my understanding is that
>>> we shouldn't proceed with reading corrupted data from the
>>> disk backend.
>>>
>>
>> If its not swapcache, the zswap entry is not deleted so I think
>> it should be ok?
>>
>> We can check over here if the entire folio is in zswap,
>> and if not, return true without marking the folio uptodate
>> to give an error.
> 
> We have swapcache_prepare() called in do_swap_page(), which should
> have protected these entries from being partially freed by other processes
> (for example, if someone falls back to small folios for the same address).
> Therefore, I believe that zswap_present_test() cannot be false for mTHP in
> the current case where only synchronous I/O is supported.
> 
> the below might help detect the bug?
> 
> if (!zswap_present_test(folio->swap, nr_pages)) {
>      if (WARN_ON_ONCE(nr_pages > 1))
>                 return true;
>      return false;
> }
> 

I think this isn't correct. If nr_pages > 1 and none of the folio is in zswap,
it should still return false. So we would need to check the whole folio if we
want to warn. But I think if we are sure the code is ok, it is an unnecessary check.
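
Just to illustrate what checking the whole folio could look like (a rough
sketch only, reusing the locals zswap_load() already has in this patch, not
something I am proposing as-is):

	if (!zswap_present_test(folio->swap, nr_pages)) {
		bool any_present = false;

		/* walk every entry to tell "wholly absent" from "partial" */
		for (i = 0; i < nr_pages; i++) {
			tree = swap_zswap_tree(swp_entry(type, offset + i));
			if (xa_load(tree, offset + i))
				any_present = true;
		}
		/*
		 * A partially present large folio would be a bug: return
		 * true without marking the folio uptodate so the fault
		 * errors out instead of reading stale data from disk.
		 */
		if (WARN_ON_ONCE(any_present && nr_pages > 1))
			return true;
		return false;
	}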

> the code seems quite ugly :-) do we have some way to unify the code
> for large and small folios?
> 
> not quite sure about shmem though....
> 

If it's shmem and the swap count goes to 1, I think it's still ok, because
the folio will then be taken from swap_cache_get_folio() if it is already
in the swapcache.

>>
>>
>>>>
>>>> -       zswap_decompress(entry, &folio->page);
>>>> +       for (i = 0; i < nr_pages; ++i) {
>>>> +               tree = swap_zswap_tree(swp_entry(type, offset + i));
>>>> +               /*
>>>> +                * When reading into the swapcache, invalidate our entry. The
>>>> +                * swapcache can be the authoritative owner of the page and
>>>> +                * its mappings, and the pressure that results from having two
>>>> +                * in-memory copies outweighs any benefits of caching the
>>>> +                * compression work.
>>>> +                *
>>>> +                * (Swapins with swap count > 1 go through the swapcache.
>>>> +                * For swap count == 1, the swapcache is skipped and we
>>>> +                * remain the primary owner of the entry.)
>>>> +                */
>>>> +               if (swapcache)
>>>> +                       entry = xa_erase(tree, offset + i);
>>>> +               else
>>>> +                       entry = xa_load(tree, offset + i);
>>>>
>>>> -       count_vm_event(ZSWPIN);
>>>> -       if (entry->objcg)
>>>> -               count_objcg_events(entry->objcg, ZSWPIN, 1);
>>>> +               zswap_decompress(entry, folio_page(folio, i));
>>>>
>>>> -       if (swapcache) {
>>>> -               zswap_entry_free(entry);
>>>> -               folio_mark_dirty(folio);
>>>> +               if (entry->objcg)
>>>> +                       count_objcg_events(entry->objcg, ZSWPIN, 1);
>>>> +               if (swapcache)
>>>> +                       zswap_entry_free(entry);
>>>>         }
>>>>
>>>> +       count_vm_events(ZSWPIN, nr_pages);
>>>> +       if (swapcache)
>>>> +               folio_mark_dirty(folio);
>>>> +
>>>>         folio_mark_uptodate(folio);
>>>>         return true;
>>>>  }
>>>> --
>>>> 2.43.5
>>>>
>>>
> 
> Thanks
> barry

Re: [RFC 3/4] mm/zswap: add support for large folio zswapin
Posted by Barry Song 1 month ago
On Tue, Oct 22, 2024 at 1:21 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 21/10/2024 11:55, Barry Song wrote:
> > On Mon, Oct 21, 2024 at 11:44 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >>
> >>
> >> On 21/10/2024 06:49, Barry Song wrote:
> >>> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>>
> >>>> At time of folio allocation, alloc_swap_folio checks if the entire
> >>>> folio is in zswap to determine folio order.
> >>>> During swap_read_folio, zswap_load will check if the entire folio
> >>>> is in zswap, and if it is, it will iterate through the pages in
> >>>> folio and decompress them.
> >>>> This will mean the benefits of large folios (fewer page faults, batched
> >>>> PTE and rmap manipulation, reduced lru list, TLB coalescing (for arm64
> >>>> and amd) are not lost at swap out when using zswap.
> >>>> This patch does not add support for hybrid backends (i.e. folios
> >>>> partly present swap and zswap).
> >>>>
> >>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >>>> ---
> >>>>  mm/memory.c | 13 +++-------
> >>>>  mm/zswap.c  | 68 ++++++++++++++++++++++++-----------------------------
> >>>>  2 files changed, 34 insertions(+), 47 deletions(-)
> >>>>
> >>>> diff --git a/mm/memory.c b/mm/memory.c
> >>>> index 49d243131169..75f7b9f5fb32 100644
> >>>> --- a/mm/memory.c
> >>>> +++ b/mm/memory.c
> >>>> @@ -4077,13 +4077,14 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
> >>>>
> >>>>         /*
> >>>>          * swap_read_folio() can't handle the case a large folio is hybridly
> >>>> -        * from different backends. And they are likely corner cases. Similar
> >>>> -        * things might be added once zswap support large folios.
> >>>> +        * from different backends. And they are likely corner cases.
> >>>>          */
> >>>>         if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
> >>>>                 return false;
> >>>>         if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
> >>>>                 return false;
> >>>> +       if (unlikely(!zswap_present_test(entry, nr_pages)))
> >>>> +               return false;
> >>>>
> >>>>         return true;
> >>>>  }
> >>>> @@ -4130,14 +4131,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >>>>         if (unlikely(userfaultfd_armed(vma)))
> >>>>                 goto fallback;
> >>>>
> >>>> -       /*
> >>>> -        * A large swapped out folio could be partially or fully in zswap. We
> >>>> -        * lack handling for such cases, so fallback to swapping in order-0
> >>>> -        * folio.
> >>>> -        */
> >>>> -       if (!zswap_never_enabled())
> >>>> -               goto fallback;
> >>>> -
> >>>>         entry = pte_to_swp_entry(vmf->orig_pte);
> >>>>         /*
> >>>>          * Get a list of all the (large) orders below PMD_ORDER that are enabled
> >>>> diff --git a/mm/zswap.c b/mm/zswap.c
> >>>> index 9cc91ae31116..a5aa86c24060 100644
> >>>> --- a/mm/zswap.c
> >>>> +++ b/mm/zswap.c
> >>>> @@ -1624,59 +1624,53 @@ bool zswap_present_test(swp_entry_t swp, int nr_pages)
> >>>>
> >>>>  bool zswap_load(struct folio *folio)
> >>>>  {
> >>>> +       int nr_pages = folio_nr_pages(folio);
> >>>>         swp_entry_t swp = folio->swap;
> >>>> +       unsigned int type = swp_type(swp);
> >>>>         pgoff_t offset = swp_offset(swp);
> >>>>         bool swapcache = folio_test_swapcache(folio);
> >>>> -       struct xarray *tree = swap_zswap_tree(swp);
> >>>> +       struct xarray *tree;
> >>>>         struct zswap_entry *entry;
> >>>> +       int i;
> >>>>
> >>>>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
> >>>>
> >>>>         if (zswap_never_enabled())
> >>>>                 return false;
> >>>>
> >>>> -       /*
> >>>> -        * Large folios should not be swapped in while zswap is being used, as
> >>>> -        * they are not properly handled. Zswap does not properly load large
> >>>> -        * folios, and a large folio may only be partially in zswap.
> >>>> -        *
> >>>> -        * Return true without marking the folio uptodate so that an IO error is
> >>>> -        * emitted (e.g. do_swap_page() will sigbus).
> >>>> -        */
> >>>> -       if (WARN_ON_ONCE(folio_test_large(folio)))
> >>>> -               return true;
> >>>> -
> >>>> -       /*
> >>>> -        * When reading into the swapcache, invalidate our entry. The
> >>>> -        * swapcache can be the authoritative owner of the page and
> >>>> -        * its mappings, and the pressure that results from having two
> >>>> -        * in-memory copies outweighs any benefits of caching the
> >>>> -        * compression work.
> >>>> -        *
> >>>> -        * (Most swapins go through the swapcache. The notable
> >>>> -        * exception is the singleton fault on SWP_SYNCHRONOUS_IO
> >>>> -        * files, which reads into a private page and may free it if
> >>>> -        * the fault fails. We remain the primary owner of the entry.)
> >>>> -        */
> >>>> -       if (swapcache)
> >>>> -               entry = xa_erase(tree, offset);
> >>>> -       else
> >>>> -               entry = xa_load(tree, offset);
> >>>> -
> >>>> -       if (!entry)
> >>>> +       if (!zswap_present_test(folio->swap, nr_pages))
> >>>>                 return false;
> >>>
> >>> Hi Usama,
> >>>
> >>> Is there any chance that zswap_present_test() returns true
> >>> in do_swap_page() but false in zswap_load()? If that’s
> >>> possible, could we be missing something? For example,
> >>> could it be that zswap has been partially released (with
> >>> part of it still present) during an mTHP swap-in?
> >>>
> >>> If this happens with an mTHP, my understanding is that
> >>> we shouldn't proceed with reading corrupted data from the
> >>> disk backend.
> >>>
> >>
> >> If its not swapcache, the zswap entry is not deleted so I think
> >> it should be ok?
> >>
> >> We can check over here if the entire folio is in zswap,
> >> and if not, return true without marking the folio uptodate
> >> to give an error.
> >
> > We have swapcache_prepare() called in do_swap_page(), which should
> > have protected these entries from being partially freed by other processes
> > (for example, if someone falls back to small folios for the same address).
> > Therefore, I believe that zswap_present_test() cannot be false for mTHP in
> > the current case where only synchronous I/O is supported.
> >
> > the below might help detect the bug?
> >
> > if (!zswap_present_test(folio->swap, nr_pages)) {
> >      if (WARN_ON_ONCE(nr_pages > 1))
> >                 return true;
> >      return false;
> > }
> >
>
> I think this isn't correct. If nr_pages > 1 and none of the folio is in zswap,
> it should still return false. So we would need to check the whole folio if we
> want to warn. But I think if we are sure the code is ok, it is an unnecessary check.

my point is that zswap_present_test() can't differentiate
1. the *whole* folio is not in zswap
2. the folio is *partially* not in zswap

in case 2, returning false is wrong.

And when nr_pages > 1, we have already confirmed earlier in
do_swap_page() that zswap_present_test() is true. At this point,
it must always be true; if it's false, it indicates a bug.

>
> > the code seems quite ugly :-) do we have some way to unify the code
> > for large and small folios?
> >
> > not quite sure about shmem though....
> >
>
> If it's shmem and the swap count goes to 1, I think it's still ok, because
> the folio will then be taken from swap_cache_get_folio() if it is already
> in the swapcache.
>
> >>
> >>
> >>>>
> >>>> -       zswap_decompress(entry, &folio->page);
> >>>> +       for (i = 0; i < nr_pages; ++i) {
> >>>> +               tree = swap_zswap_tree(swp_entry(type, offset + i));
> >>>> +               /*
> >>>> +                * When reading into the swapcache, invalidate our entry. The
> >>>> +                * swapcache can be the authoritative owner of the page and
> >>>> +                * its mappings, and the pressure that results from having two
> >>>> +                * in-memory copies outweighs any benefits of caching the
> >>>> +                * compression work.
> >>>> +                *
> >>>> +                * (Swapins with swap count > 1 go through the swapcache.
> >>>> +                * For swap count == 1, the swapcache is skipped and we
> >>>> +                * remain the primary owner of the entry.)
> >>>> +                */
> >>>> +               if (swapcache)
> >>>> +                       entry = xa_erase(tree, offset + i);
> >>>> +               else
> >>>> +                       entry = xa_load(tree, offset + i);
> >>>>
> >>>> -       count_vm_event(ZSWPIN);
> >>>> -       if (entry->objcg)
> >>>> -               count_objcg_events(entry->objcg, ZSWPIN, 1);
> >>>> +               zswap_decompress(entry, folio_page(folio, i));
> >>>>
> >>>> -       if (swapcache) {
> >>>> -               zswap_entry_free(entry);
> >>>> -               folio_mark_dirty(folio);
> >>>> +               if (entry->objcg)
> >>>> +                       count_objcg_events(entry->objcg, ZSWPIN, 1);
> >>>> +               if (swapcache)
> >>>> +                       zswap_entry_free(entry);
> >>>>         }
> >>>>
> >>>> +       count_vm_events(ZSWPIN, nr_pages);
> >>>> +       if (swapcache)
> >>>> +               folio_mark_dirty(folio);
> >>>> +
> >>>>         folio_mark_uptodate(folio);
> >>>>         return true;
> >>>>  }
> >>>> --
> >>>> 2.43.5
> >>>>
> >>>
> >
> > Thanks
> > barry
>
Re: [RFC 3/4] mm/zswap: add support for large folio zswapin
Posted by Usama Arif 1 month ago

On 21/10/2024 21:28, Barry Song wrote:
> On Tue, Oct 22, 2024 at 1:21 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>>
>>
>> On 21/10/2024 11:55, Barry Song wrote:
>>> On Mon, Oct 21, 2024 at 11:44 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>
>>>>
>>>>
>>>> On 21/10/2024 06:49, Barry Song wrote:
>>>>> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>>>
>>>>>> At time of folio allocation, alloc_swap_folio checks if the entire
>>>>>> folio is in zswap to determine folio order.
>>>>>> During swap_read_folio, zswap_load will check if the entire folio
>>>>>> is in zswap, and if it is, it will iterate through the pages in
>>>>>> folio and decompress them.
>>>>>> This will mean the benefits of large folios (fewer page faults, batched
>>>>>> PTE and rmap manipulation, reduced lru list, TLB coalescing (for arm64
>>>>>> and amd) are not lost at swap out when using zswap.
>>>>>> This patch does not add support for hybrid backends (i.e. folios
>>>>>> partly present swap and zswap).
>>>>>>
>>>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>>>> ---
>>>>>>  mm/memory.c | 13 +++-------
>>>>>>  mm/zswap.c  | 68 ++++++++++++++++++++++++-----------------------------
>>>>>>  2 files changed, 34 insertions(+), 47 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>>>> index 49d243131169..75f7b9f5fb32 100644
>>>>>> --- a/mm/memory.c
>>>>>> +++ b/mm/memory.c
>>>>>> @@ -4077,13 +4077,14 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
>>>>>>
>>>>>>         /*
>>>>>>          * swap_read_folio() can't handle the case a large folio is hybridly
>>>>>> -        * from different backends. And they are likely corner cases. Similar
>>>>>> -        * things might be added once zswap support large folios.
>>>>>> +        * from different backends. And they are likely corner cases.
>>>>>>          */
>>>>>>         if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
>>>>>>                 return false;
>>>>>>         if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
>>>>>>                 return false;
>>>>>> +       if (unlikely(!zswap_present_test(entry, nr_pages)))
>>>>>> +               return false;
>>>>>>
>>>>>>         return true;
>>>>>>  }
>>>>>> @@ -4130,14 +4131,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>>>>>>         if (unlikely(userfaultfd_armed(vma)))
>>>>>>                 goto fallback;
>>>>>>
>>>>>> -       /*
>>>>>> -        * A large swapped out folio could be partially or fully in zswap. We
>>>>>> -        * lack handling for such cases, so fallback to swapping in order-0
>>>>>> -        * folio.
>>>>>> -        */
>>>>>> -       if (!zswap_never_enabled())
>>>>>> -               goto fallback;
>>>>>> -
>>>>>>         entry = pte_to_swp_entry(vmf->orig_pte);
>>>>>>         /*
>>>>>>          * Get a list of all the (large) orders below PMD_ORDER that are enabled
>>>>>> diff --git a/mm/zswap.c b/mm/zswap.c
>>>>>> index 9cc91ae31116..a5aa86c24060 100644
>>>>>> --- a/mm/zswap.c
>>>>>> +++ b/mm/zswap.c
>>>>>> @@ -1624,59 +1624,53 @@ bool zswap_present_test(swp_entry_t swp, int nr_pages)
>>>>>>
>>>>>>  bool zswap_load(struct folio *folio)
>>>>>>  {
>>>>>> +       int nr_pages = folio_nr_pages(folio);
>>>>>>         swp_entry_t swp = folio->swap;
>>>>>> +       unsigned int type = swp_type(swp);
>>>>>>         pgoff_t offset = swp_offset(swp);
>>>>>>         bool swapcache = folio_test_swapcache(folio);
>>>>>> -       struct xarray *tree = swap_zswap_tree(swp);
>>>>>> +       struct xarray *tree;
>>>>>>         struct zswap_entry *entry;
>>>>>> +       int i;
>>>>>>
>>>>>>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
>>>>>>
>>>>>>         if (zswap_never_enabled())
>>>>>>                 return false;
>>>>>>
>>>>>> -       /*
>>>>>> -        * Large folios should not be swapped in while zswap is being used, as
>>>>>> -        * they are not properly handled. Zswap does not properly load large
>>>>>> -        * folios, and a large folio may only be partially in zswap.
>>>>>> -        *
>>>>>> -        * Return true without marking the folio uptodate so that an IO error is
>>>>>> -        * emitted (e.g. do_swap_page() will sigbus).
>>>>>> -        */
>>>>>> -       if (WARN_ON_ONCE(folio_test_large(folio)))
>>>>>> -               return true;
>>>>>> -
>>>>>> -       /*
>>>>>> -        * When reading into the swapcache, invalidate our entry. The
>>>>>> -        * swapcache can be the authoritative owner of the page and
>>>>>> -        * its mappings, and the pressure that results from having two
>>>>>> -        * in-memory copies outweighs any benefits of caching the
>>>>>> -        * compression work.
>>>>>> -        *
>>>>>> -        * (Most swapins go through the swapcache. The notable
>>>>>> -        * exception is the singleton fault on SWP_SYNCHRONOUS_IO
>>>>>> -        * files, which reads into a private page and may free it if
>>>>>> -        * the fault fails. We remain the primary owner of the entry.)
>>>>>> -        */
>>>>>> -       if (swapcache)
>>>>>> -               entry = xa_erase(tree, offset);
>>>>>> -       else
>>>>>> -               entry = xa_load(tree, offset);
>>>>>> -
>>>>>> -       if (!entry)
>>>>>> +       if (!zswap_present_test(folio->swap, nr_pages))
>>>>>>                 return false;
>>>>>
>>>>> Hi Usama,
>>>>>
>>>>> Is there any chance that zswap_present_test() returns true
>>>>> in do_swap_page() but false in zswap_load()? If that’s
>>>>> possible, could we be missing something? For example,
>>>>> could it be that zswap has been partially released (with
>>>>> part of it still present) during an mTHP swap-in?
>>>>>
>>>>> If this happens with an mTHP, my understanding is that
>>>>> we shouldn't proceed with reading corrupted data from the
>>>>> disk backend.
>>>>>
>>>>
>>>> If its not swapcache, the zswap entry is not deleted so I think
>>>> it should be ok?
>>>>
>>>> We can check over here if the entire folio is in zswap,
>>>> and if not, return true without marking the folio uptodate
>>>> to give an error.
>>>
>>> We have swapcache_prepare() called in do_swap_page(), which should
>>> have protected these entries from being partially freed by other processes
>>> (for example, if someone falls back to small folios for the same address).
>>> Therefore, I believe that zswap_present_test() cannot be false for mTHP in
>>> the current case where only synchronous I/O is supported.
>>>
>>> the below might help detect the bug?
>>>
>>> if (!zswap_present_test(folio->swap, nr_pages)) {
>>>      if (WARN_ON_ONCE(nr_pages > 1))
>>>                 return true;
>>>      return false;
>>> }
>>>
>>
>> I think this isn't correct. If nr_pages > 1 and none of the folio is in zswap,
>> it should still return false. So we would need to check the whole folio if we
>> want to warn. But I think if we are sure the code is ok, it is an unnecessary check.
> 
> my point is that zswap_present_test() can't differentiate
> 1. the *whole* folio is not in zswap
> 2. the folio is *partially* not in zswap
> 
> in case 2, returning false is wrong.
> 

Agreed!

> And when nr_pages > 1, we have already confirmed earlier in
> do_swap_page() that zswap_present_test() is true. At this point,
> it must always be true; if it's false, it indicates a bug.
> 

Yes, agreed! I was thinking just from the zswap_load() perspective,
irrespective of who calls it.
If someone adds large folio support to swapin_readahead, then I think the
above WARN might be an issue.

But just with this patch series, doing what you suggested is correct. I
will add it in the next revision. We can deal with it once swapin with
swap count > 1 starts supporting large folios.

>>
>>> the code seems quite ugly :-) do we have some way to unify the code
>>> for large and small folios?
>>>
>>> not quite sure about shmem though....
>>>
>>
>> If it's shmem and the swap count goes to 1, I think it's still ok, because
>> the folio will then be taken from swap_cache_get_folio() if it is already
>> in the swapcache.
>>
>>>>
>>>>
>>>>>>
>>>>>> -       zswap_decompress(entry, &folio->page);
>>>>>> +       for (i = 0; i < nr_pages; ++i) {
>>>>>> +               tree = swap_zswap_tree(swp_entry(type, offset + i));
>>>>>> +               /*
>>>>>> +                * When reading into the swapcache, invalidate our entry. The
>>>>>> +                * swapcache can be the authoritative owner of the page and
>>>>>> +                * its mappings, and the pressure that results from having two
>>>>>> +                * in-memory copies outweighs any benefits of caching the
>>>>>> +                * compression work.
>>>>>> +                *
>>>>>> +                * (Swapins with swap count > 1 go through the swapcache.
>>>>>> +                * For swap count == 1, the swapcache is skipped and we
>>>>>> +                * remain the primary owner of the entry.)
>>>>>> +                */
>>>>>> +               if (swapcache)
>>>>>> +                       entry = xa_erase(tree, offset + i);
>>>>>> +               else
>>>>>> +                       entry = xa_load(tree, offset + i);
>>>>>>
>>>>>> -       count_vm_event(ZSWPIN);
>>>>>> -       if (entry->objcg)
>>>>>> -               count_objcg_events(entry->objcg, ZSWPIN, 1);
>>>>>> +               zswap_decompress(entry, folio_page(folio, i));
>>>>>>
>>>>>> -       if (swapcache) {
>>>>>> -               zswap_entry_free(entry);
>>>>>> -               folio_mark_dirty(folio);
>>>>>> +               if (entry->objcg)
>>>>>> +                       count_objcg_events(entry->objcg, ZSWPIN, 1);
>>>>>> +               if (swapcache)
>>>>>> +                       zswap_entry_free(entry);
>>>>>>         }
>>>>>>
>>>>>> +       count_vm_events(ZSWPIN, nr_pages);
>>>>>> +       if (swapcache)
>>>>>> +               folio_mark_dirty(folio);
>>>>>> +
>>>>>>         folio_mark_uptodate(folio);
>>>>>>         return true;
>>>>>>  }
>>>>>> --
>>>>>> 2.43.5
>>>>>>
>>>>>
>>>
>>> Thanks
>>> barry
>>

Re: [RFC 3/4] mm/zswap: add support for large folio zswapin
Posted by Yosry Ahmed 1 month ago
On Mon, Oct 21, 2024 at 1:57 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 21/10/2024 21:28, Barry Song wrote:
> > On Tue, Oct 22, 2024 at 1:21 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >>
> >>
> >> On 21/10/2024 11:55, Barry Song wrote:
> >>> On Mon, Oct 21, 2024 at 11:44 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 21/10/2024 06:49, Barry Song wrote:
> >>>>> On Fri, Oct 18, 2024 at 11:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>>>>
> >>>>>> At time of folio allocation, alloc_swap_folio checks if the entire
> >>>>>> folio is in zswap to determine folio order.
> >>>>>> During swap_read_folio, zswap_load will check if the entire folio
> >>>>>> is in zswap, and if it is, it will iterate through the pages in
> >>>>>> folio and decompress them.
> >>>>>> This will mean the benefits of large folios (fewer page faults, batched
> >>>>>> PTE and rmap manipulation, reduced lru list, TLB coalescing (for arm64
> >>>>>> and amd) are not lost at swap out when using zswap.
> >>>>>> This patch does not add support for hybrid backends (i.e. folios
> >>>>>> partly present swap and zswap).
> >>>>>>
> >>>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >>>>>> ---
> >>>>>>  mm/memory.c | 13 +++-------
> >>>>>>  mm/zswap.c  | 68 ++++++++++++++++++++++++-----------------------------
> >>>>>>  2 files changed, 34 insertions(+), 47 deletions(-)
> >>>>>>
> >>>>>> diff --git a/mm/memory.c b/mm/memory.c
> >>>>>> index 49d243131169..75f7b9f5fb32 100644
> >>>>>> --- a/mm/memory.c
> >>>>>> +++ b/mm/memory.c
> >>>>>> @@ -4077,13 +4077,14 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
> >>>>>>
> >>>>>>         /*
> >>>>>>          * swap_read_folio() can't handle the case a large folio is hybridly
> >>>>>> -        * from different backends. And they are likely corner cases. Similar
> >>>>>> -        * things might be added once zswap support large folios.
> >>>>>> +        * from different backends. And they are likely corner cases.
> >>>>>>          */
> >>>>>>         if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
> >>>>>>                 return false;
> >>>>>>         if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
> >>>>>>                 return false;
> >>>>>> +       if (unlikely(!zswap_present_test(entry, nr_pages)))
> >>>>>> +               return false;

Hmm, if the entire folio is not in zswap, this will prevent the large
folio swapin, right?

Also, I think this is racy; see the comments below and in patch 1.

> >>>>>>
> >>>>>>         return true;
> >>>>>>  }
> >>>>>> @@ -4130,14 +4131,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> >>>>>>         if (unlikely(userfaultfd_armed(vma)))
> >>>>>>                 goto fallback;
> >>>>>>
> >>>>>> -       /*
> >>>>>> -        * A large swapped out folio could be partially or fully in zswap. We
> >>>>>> -        * lack handling for such cases, so fallback to swapping in order-0
> >>>>>> -        * folio.
> >>>>>> -        */
> >>>>>> -       if (!zswap_never_enabled())
> >>>>>> -               goto fallback;
> >>>>>> -
> >>>>>>         entry = pte_to_swp_entry(vmf->orig_pte);
> >>>>>>         /*
> >>>>>>          * Get a list of all the (large) orders below PMD_ORDER that are enabled
> >>>>>> diff --git a/mm/zswap.c b/mm/zswap.c
> >>>>>> index 9cc91ae31116..a5aa86c24060 100644
> >>>>>> --- a/mm/zswap.c
> >>>>>> +++ b/mm/zswap.c
> >>>>>> @@ -1624,59 +1624,53 @@ bool zswap_present_test(swp_entry_t swp, int nr_pages)
> >>>>>>
> >>>>>>  bool zswap_load(struct folio *folio)
> >>>>>>  {
> >>>>>> +       int nr_pages = folio_nr_pages(folio);
> >>>>>>         swp_entry_t swp = folio->swap;
> >>>>>> +       unsigned int type = swp_type(swp);
> >>>>>>         pgoff_t offset = swp_offset(swp);
> >>>>>>         bool swapcache = folio_test_swapcache(folio);
> >>>>>> -       struct xarray *tree = swap_zswap_tree(swp);
> >>>>>> +       struct xarray *tree;
> >>>>>>         struct zswap_entry *entry;
> >>>>>> +       int i;
> >>>>>>
> >>>>>>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
> >>>>>>
> >>>>>>         if (zswap_never_enabled())
> >>>>>>                 return false;
> >>>>>>
> >>>>>> -       /*
> >>>>>> -        * Large folios should not be swapped in while zswap is being used, as
> >>>>>> -        * they are not properly handled. Zswap does not properly load large
> >>>>>> -        * folios, and a large folio may only be partially in zswap.
> >>>>>> -        *
> >>>>>> -        * Return true without marking the folio uptodate so that an IO error is
> >>>>>> -        * emitted (e.g. do_swap_page() will sigbus).
> >>>>>> -        */
> >>>>>> -       if (WARN_ON_ONCE(folio_test_large(folio)))
> >>>>>> -               return true;
> >>>>>> -
> >>>>>> -       /*
> >>>>>> -        * When reading into the swapcache, invalidate our entry. The
> >>>>>> -        * swapcache can be the authoritative owner of the page and
> >>>>>> -        * its mappings, and the pressure that results from having two
> >>>>>> -        * in-memory copies outweighs any benefits of caching the
> >>>>>> -        * compression work.
> >>>>>> -        *
> >>>>>> -        * (Most swapins go through the swapcache. The notable
> >>>>>> -        * exception is the singleton fault on SWP_SYNCHRONOUS_IO
> >>>>>> -        * files, which reads into a private page and may free it if
> >>>>>> -        * the fault fails. We remain the primary owner of the entry.)
> >>>>>> -        */
> >>>>>> -       if (swapcache)
> >>>>>> -               entry = xa_erase(tree, offset);
> >>>>>> -       else
> >>>>>> -               entry = xa_load(tree, offset);
> >>>>>> -
> >>>>>> -       if (!entry)
> >>>>>> +       if (!zswap_present_test(folio->swap, nr_pages))
> >>>>>>                 return false;
> >>>>>
> >>>>> Hi Usama,
> >>>>>
> >>>>> Is there any chance that zswap_present_test() returns true
> >>>>> in do_swap_page() but false in zswap_load()? If that’s
> >>>>> possible, could we be missing something? For example,
> >>>>> could it be that zswap has been partially released (with
> >>>>> part of it still present) during an mTHP swap-in?

As I mentioned in patch 1, we need to document when the result of
zswap_present_test() is stable, i.e. when we cannot race with other
stores, exclusive loads, writeback, or invalidation.

> >>>>>
> >>>>> If this happens with an mTHP, my understanding is that
> >>>>> we shouldn't proceed with reading corrupted data from the
> >>>>> disk backend.
> >>>>>
> >>>>
> >>>> If its not swapcache, the zswap entry is not deleted so I think
> >>>> it should be ok?

Can we race with things like writeback and other exclusive loads,
given that swapcache_prepare() has not been called yet?

> >>>>
> >>>> We can check over here if the entire folio is in zswap,
> >>>> and if not, return true without marking the folio uptodate
> >>>> to give an error.
> >>>
> >>> We have swapcache_prepare() called in do_swap_page(), which should
> >>> have protected these entries from being partially freed by other processes
> >>> (for example, if someone falls back to small folios for the same address).
> >>> Therefore, I believe that zswap_present_test() cannot be false for mTHP in
> >>> the current case where only synchronous I/O is supported.
> >>>
> >>> the below might help detect the bug?
> >>>
> >>> if (!zswap_present_test(folio->swap, nr_pages)) {
> >>>      if (WARN_ON_ONCE(nr_pages > 1))
> >>>                 return true;
> >>>      return false;
> >>> }
> >>>
> >>
> >> I think this isn't correct. If nr_pages > 1 and none of the folio is in zswap,
> >> it should still return false. So we would need to check the whole folio if we
> >> want to warn. But I think if we are sure the code is ok, it is an unnecessary check.
> >
> > my point is that zswap_present_test() can't differentiate
> > 1. the *whole* folio is not in zswap
> > 2. the folio is *partially* not in zswap
> >
> > in case 2, returning false is wrong.
> >
>
> Agreed!
>
> > And when nr_pages > 1, we have already confirmed earlier in
> > do_swap_page() that zswap_present_test() is true. At this point,
> > it must always be true; if it's false, it indicates a bug.
> >
>
> Yes, agreed! I was thinking just from the zswap_load() perspective,
> irrespective of who calls it.
> If someone adds large folio support to swapin_readahead, then I think the
> above WARN might be an issue.
>
> But just with this patch series, doing what you suggested is correct. I
> will add it in the next revision. We can deal with it once swapin with
> swap count > 1 starts supporting large folios.

I think I don't follow this part of the conversation properly, but it
seems like we want to catch the case where we end up in zswap_load()
and only part of the folio is in zswap. Can we use something like the
approach we used for swap_zeromap_batch()?
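
For example (purely a sketch, the helper name and exact semantics here are
made up, mirroring how swap_zeromap_batch() reports how far a batch extends):

	/*
	 * Sketch only: returns how many consecutive swap entries, starting
	 * at @entry, currently have a zswap entry, so callers can tell
	 * apart "none" (0), "all" (nr_pages) and "partial" (in between).
	 */
	static int zswap_present_batch(swp_entry_t entry, int nr_pages)
	{
		unsigned int type = swp_type(entry);
		pgoff_t offset = swp_offset(entry);
		int i;

		for (i = 0; i < nr_pages; i++) {
			struct xarray *tree = swap_zswap_tree(swp_entry(type, offset + i));

			if (!xa_load(tree, offset + i))
				break;
		}
		return i;
	}

zswap_load() could then return false only when this returns 0, and WARN (and
fail the fault) for anything between 0 and nr_pages.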
[RFC 4/4] mm/zswap: count successful large folio zswap loads
Posted by Usama Arif 1 month, 1 week ago
Add a new MTHP_STAT_ZSWPIN entry to the sysfs transparent_hugepage
stats so that successful large folio zswap loads can be accounted under
the per-order sysfs zswpin stats:

/sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpin

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 3 +++
 include/linux/huge_mm.h                    | 1 +
 mm/huge_memory.c                           | 3 +++
 mm/page_io.c                               | 1 +
 4 files changed, 8 insertions(+)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 2a171ed5206e..68a9790908b2 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -534,6 +534,9 @@ zswpout
 	is incremented every time a huge page is swapped out to zswap in one
 	piece without splitting.
 
+zswpin
+	is incremented every time a huge page is swapped in from zswap.
+
 swpout
 	is incremented every time a huge page is swapped out to a non-zswap
 	swap device in one piece without splitting.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 3eca60f3d512..28a275d3107a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -120,6 +120,7 @@ enum mthp_stat_item {
 	MTHP_STAT_ANON_FAULT_FALLBACK,
 	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
 	MTHP_STAT_ZSWPOUT,
+	MTHP_STAT_ZSWPIN,
 	MTHP_STAT_SWPOUT,
 	MTHP_STAT_SWPOUT_FALLBACK,
 	MTHP_STAT_SHMEM_ALLOC,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a7b05f4c2a5e..587f7dd81500 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -612,6 +612,7 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
+DEFINE_MTHP_STAT_ATTR(zswpin, MTHP_STAT_ZSWPIN);
 DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
 DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
 #ifdef CONFIG_SHMEM
@@ -631,6 +632,7 @@ static struct attribute *anon_stats_attrs[] = {
 	&anon_fault_fallback_charge_attr.attr,
 #ifndef CONFIG_SHMEM
 	&zswpout_attr.attr,
+	&zswpin_attr.attr,
 	&swpout_attr.attr,
 	&swpout_fallback_attr.attr,
 #endif
@@ -662,6 +664,7 @@ static struct attribute_group file_stats_attr_grp = {
 static struct attribute *any_stats_attrs[] = {
 #ifdef CONFIG_SHMEM
 	&zswpout_attr.attr,
+	&zswpin_attr.attr,
 	&swpout_attr.attr,
 	&swpout_fallback_attr.attr,
 #endif
diff --git a/mm/page_io.c b/mm/page_io.c
index 2a15b197968a..477f9d4fc009 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -620,6 +620,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
 		folio_unlock(folio);
 		goto finish;
 	} else if (zswap_load(folio)) {
+		count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPIN);
 		folio_unlock(folio);
 		goto finish;
 	}
-- 
2.43.5
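
For anyone who wants to eyeball the new counter, it reads like any other
per-order mTHP stat. A minimal userspace sketch (the hugepages-2048kB
directory name is an assumption; it depends on the page size and on which
mTHP orders are enabled):

	#include <stdio.h>

	int main(void)
	{
		/* assumed path: depends on PAGE_SIZE and enabled mTHP orders */
		const char *path = "/sys/kernel/mm/transparent_hugepage/"
				   "hugepages-2048kB/stats/zswpin";
		unsigned long long zswpin;
		FILE *f = fopen(path, "r");

		if (!f) {
			perror(path);
			return 1;
		}
		if (fscanf(f, "%llu", &zswpin) != 1) {
			fclose(f);
			return 1;
		}
		printf("zswpin: %llu\n", zswpin);
		fclose(f);
		return 0;
	}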