After the rbtree to xarray conversion, and dropping zswap_entry.refcount
and zswap_entry.value, the only members of zswap_entry utilized by
zero-filled pages are zswap_entry.length (always 0) and
zswap_entry.objcg. Store the objcg pointer directly in the xarray as a
tagged pointer and avoid allocating a zswap_entry completely for
zero-filled pages.
This simplifies the code as we no longer need to special-case the
zero-length entries. We are also able to further separate the
zero-filled page handling logic and completely isolate it within
store/load helpers. Tagged xarray pointers are handled in these two
helpers, as well as in the newly introduced helper for freeing tree
elements, zswap_tree_free_element().
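For reference, the core of the scheme is a tagged xarray pointer. A
minimal sketch follows (ZSWAP_ZERO_FILLED_TAG and the xa_*() helpers are
the ones used by the patch; the two wrapper functions are illustrative
names only, not code added by the diff):

    #include <linux/xarray.h>
    #include <linux/memcontrol.h>

    #define ZSWAP_ZERO_FILLED_TAG 1UL

    /* Store side: keep only the (possibly NULL) objcg, with the tag set. */
    static void *zswap_zero_filled_elem(struct obj_cgroup *objcg)
    {
            return xa_tag_pointer(objcg, ZSWAP_ZERO_FILLED_TAG);
    }

    /* Load/free side: the tag tells a tagged objcg apart from an entry. */
    static bool zswap_elem_is_zero_filled(void *elem)
    {
            return xa_pointer_tag(elem) == ZSWAP_ZERO_FILLED_TAG;
    }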
There is also a small performance improvement, observed over 50 runs of
a kernel build test (kernbench) comparing the mean build time on a
Skylake machine when building the kernel in a cgroup v1 container with
a 3G limit. This is on top of the improvement from dropping support for
non-zero same-filled pages:
            base        patched     % diff
real        69.915      69.757      -0.229%
user        2956.147    2955.244    -0.031%
sys         2594.718    2575.747    -0.731%
This probably comes from avoiding the zswap_entry allocation and
cleanup/freeing for zero-filled pages. Note that the percentage of
zero-filled pages during this test was only around 1.5% on average.
Practical workloads could have a larger proportion of such pages (e.g.
Johannes observed around 10% [1]), so the performance improvement should
be larger.
This change also saves a small amount of memory due to fewer allocated
zswap_entry's. In the kernel build test above, we save around 2M of
slab usage when we swap out 3G to zswap.
[1]https://lore.kernel.org/linux-mm/20240320210716.GH294822@cmpxchg.org/
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---
mm/zswap.c | 137 ++++++++++++++++++++++++++++++-----------------------
1 file changed, 78 insertions(+), 59 deletions(-)
diff --git a/mm/zswap.c b/mm/zswap.c
index 413d9242cf500..efc323bab2f22 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -183,12 +183,11 @@ static struct shrinker *zswap_shrinker;
* struct zswap_entry
*
* This structure contains the metadata for tracking a single compressed
- * page within zswap.
+ * page within zswap, it does not track zero-filled pages.
*
* swpentry - associated swap entry, the offset indexes into the red-black tree
* length - the length in bytes of the compressed page data. Needed during
- * decompression. For a zero-filled page length is 0, and both
- * pool and lru are invalid and must be ignored.
+ * decompression.
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
* objcg - the obj_cgroup that the compressed memory is charged to
@@ -794,30 +793,35 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))];
}
-/*
- * Carries out the common pattern of freeing and entry's zpool allocation,
- * freeing the entry itself, and decrementing the number of stored pages.
- */
static void zswap_entry_free(struct zswap_entry *entry)
{
- if (!entry->length)
- atomic_dec(&zswap_zero_filled_pages);
- else {
- zswap_lru_del(&zswap_list_lru, entry);
- zpool_free(zswap_find_zpool(entry), entry->handle);
- zswap_pool_put(entry->pool);
- }
+ zswap_lru_del(&zswap_list_lru, entry);
+ zpool_free(zswap_find_zpool(entry), entry->handle);
+ zswap_pool_put(entry->pool);
if (entry->objcg) {
obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
obj_cgroup_put(entry->objcg);
}
zswap_entry_cache_free(entry);
- atomic_dec(&zswap_stored_pages);
}
/*********************************
* zswap tree functions
**********************************/
+static void zswap_tree_free_element(void *elem)
+{
+ if (!elem)
+ return;
+
+ if (xa_pointer_tag(elem)) {
+ obj_cgroup_put(xa_untag_pointer(elem));
+ atomic_dec(&zswap_zero_filled_pages);
+ } else {
+ zswap_entry_free((struct zswap_entry *)elem);
+ }
+ atomic_dec(&zswap_stored_pages);
+}
+
static int zswap_tree_store(struct xarray *tree, pgoff_t offset, void *new)
{
void *old;
@@ -834,7 +838,7 @@ static int zswap_tree_store(struct xarray *tree, pgoff_t offset, void *new)
* the folio was redirtied and now the new version is being
* swapped out. Get rid of the old.
*/
- zswap_entry_free(old);
+ zswap_tree_free_element(old);
}
return err;
}
@@ -1089,7 +1093,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
if (entry->objcg)
count_objcg_event(entry->objcg, ZSWPWB);
- zswap_entry_free(entry);
+ zswap_tree_free_element(entry);
/* folio is up to date */
folio_mark_uptodate(folio);
@@ -1373,6 +1377,33 @@ static void shrink_worker(struct work_struct *w)
} while (zswap_total_pages() > thr);
}
+/*********************************
+* zero-filled functions
+**********************************/
+#define ZSWAP_ZERO_FILLED_TAG 1UL
+
+static int zswap_store_zero_filled(struct xarray *tree, pgoff_t offset,
+ struct obj_cgroup *objcg)
+{
+ int err = zswap_tree_store(tree, offset,
+ xa_tag_pointer(objcg, ZSWAP_ZERO_FILLED_TAG));
+
+ if (!err)
+ atomic_inc(&zswap_zero_filled_pages);
+ return err;
+}
+
+static bool zswap_load_zero_filled(void *elem, struct page *page,
+ struct obj_cgroup **objcg)
+{
+ if (!xa_pointer_tag(elem))
+ return false;
+
+ clear_highpage(page);
+ *objcg = xa_untag_pointer(elem);
+ return true;
+}
+
static bool zswap_is_folio_zero_filled(struct folio *folio)
{
unsigned long *kaddr;
@@ -1432,22 +1463,21 @@ bool zswap_store(struct folio *folio)
if (!zswap_check_limit())
goto reject;
- /* allocate entry */
+ if (zswap_is_folio_zero_filled(folio)) {
+ if (zswap_store_zero_filled(tree, offset, objcg))
+ goto reject;
+ goto stored;
+ }
+
+ if (!zswap_non_zero_filled_pages_enabled)
+ goto reject;
+
entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
if (!entry) {
zswap_reject_kmemcache_fail++;
goto reject;
}
- if (zswap_is_folio_zero_filled(folio)) {
- entry->length = 0;
- atomic_inc(&zswap_zero_filled_pages);
- goto insert_entry;
- }
-
- if (!zswap_non_zero_filled_pages_enabled)
- goto freepage;
-
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
if (!entry->pool)
@@ -1465,17 +1495,14 @@ bool zswap_store(struct folio *folio)
if (!zswap_compress(folio, entry))
goto put_pool;
-insert_entry:
entry->swpentry = swp;
entry->objcg = objcg;
if (zswap_tree_store(tree, offset, entry))
goto store_failed;
- if (objcg) {
+ if (objcg)
obj_cgroup_charge_zswap(objcg, entry->length);
- count_objcg_event(objcg, ZSWPOUT);
- }
/*
* We finish initializing the entry while it's already in xarray.
@@ -1487,25 +1514,21 @@ bool zswap_store(struct folio *folio)
* The publishing order matters to prevent writeback from seeing
* an incoherent entry.
*/
- if (entry->length) {
- INIT_LIST_HEAD(&entry->lru);
- zswap_lru_add(&zswap_list_lru, entry);
- }
+ INIT_LIST_HEAD(&entry->lru);
+ zswap_lru_add(&zswap_list_lru, entry);
- /* update stats */
+stored:
+ if (objcg)
+ count_objcg_event(objcg, ZSWPOUT);
atomic_inc(&zswap_stored_pages);
count_vm_event(ZSWPOUT);
return true;
store_failed:
- if (!entry->length)
- atomic_dec(&zswap_zero_filled_pages);
- else {
- zpool_free(zswap_find_zpool(entry), entry->handle);
+ zpool_free(zswap_find_zpool(entry), entry->handle);
put_pool:
- zswap_pool_put(entry->pool);
- }
+ zswap_pool_put(entry->pool);
freepage:
zswap_entry_cache_free(entry);
reject:
@@ -1518,9 +1541,7 @@ bool zswap_store(struct folio *folio)
* possibly stale entry which was previously stored at this offset.
* Otherwise, writeback could overwrite the new data in the swapfile.
*/
- entry = xa_erase(tree, offset);
- if (entry)
- zswap_entry_free(entry);
+ zswap_tree_free_element(xa_erase(tree, offset));
return false;
}
@@ -1531,26 +1552,27 @@ bool zswap_load(struct folio *folio)
struct page *page = &folio->page;
struct xarray *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
+ struct obj_cgroup *objcg;
+ void *elem;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
- entry = xa_erase(tree, offset);
- if (!entry)
+ elem = xa_erase(tree, offset);
+ if (!elem)
return false;
- if (entry->length)
+ if (!zswap_load_zero_filled(elem, page, &objcg)) {
+ entry = elem;
+ objcg = entry->objcg;
zswap_decompress(entry, page);
- else
- clear_highpage(page);
+ }
count_vm_event(ZSWPIN);
- if (entry->objcg)
- count_objcg_event(entry->objcg, ZSWPIN);
-
- zswap_entry_free(entry);
+ if (objcg)
+ count_objcg_event(objcg, ZSWPIN);
+ zswap_tree_free_element(elem);
folio_mark_dirty(folio);
-
return true;
}
@@ -1558,11 +1580,8 @@ void zswap_invalidate(swp_entry_t swp)
{
pgoff_t offset = swp_offset(swp);
struct xarray *tree = swap_zswap_tree(swp);
- struct zswap_entry *entry;
- entry = xa_erase(tree, offset);
- if (entry)
- zswap_entry_free(entry);
+ zswap_tree_free_element(xa_erase(tree, offset));
}
int zswap_swapon(int type, unsigned long nr_pages)
--
2.44.0.396.g6e790dbe36-goog
On Mon, Mar 25, 2024 at 11:50:15PM +0000, Yosry Ahmed wrote:
> After the rbtree to xarray conversion, and dropping zswap_entry.refcount
> and zswap_entry.value, the only members of zswap_entry utilized by
> zero-filled pages are zswap_entry.length (always 0) and
> zswap_entry.objcg. Store the objcg pointer directly in the xarray as a
> tagged pointer and avoid allocating a zswap_entry completely for
> zero-filled pages.
[..]
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> ---
>  mm/zswap.c | 137 ++++++++++++++++++++++++++++++-----------------------
>  1 file changed, 78 insertions(+), 59 deletions(-)

Tbh, I think this makes the code worse overall. Instead of increasing
type safety, it actually reduces it, and places that previously dealt
with a struct zswap_entry now deal with a void *.

If we go ahead with this series, I think it makes more sense to either

a) do the explicit subtyping of struct zswap_entry I had proposed, or

b) at least keep the specialness handling of the xarray entry as local
   as possible, so that instead of a proliferating API that operates
   on void *, you have explicit filtering only where the tree is
   accessed, and then work on struct zswap_entry as much as possible.
On Thu, Mar 28, 2024 at 12:38 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Mon, Mar 25, 2024 at 11:50:15PM +0000, Yosry Ahmed wrote:
> > After the rbtree to xarray conversion, and dropping zswap_entry.refcount
> > and zswap_entry.value, the only members of zswap_entry utilized by
> > zero-filled pages are zswap_entry.length (always 0) and
> > zswap_entry.objcg. Store the objcg pointer directly in the xarray as a
> > tagged pointer and avoid allocating a zswap_entry completely for
> > zero-filled pages.
[..]
> Tbh, I think this makes the code worse overall. Instead of increasing
> type safety, it actually reduces it, and places that previously dealt
> with a struct zswap_entry now deal with a void *.
>
> If we go ahead with this series, I think it makes more sense to either
>
> a) do the explicit subtyping of struct zswap_entry I had proposed, or

I suspect we won't get the small performance improvements (and memory
saving) with this approach. Neither are too significant, but it'd be
nice if we could keep them.

>
> b) at least keep the specialness handling of the xarray entry as local
>    as possible, so that instead of a proliferating API that operates
>    on void *, you have explicit filtering only where the tree is
>    accessed, and then work on struct zswap_entry as much as possible.

I was trying to go for option (b) by isolating filtering and casting to
the correct type in a few functions (zswap_tree_free_element(),
zswap_store_zero_filled(), and zswap_load_zero_filled()). If we
open-code filtering it will be repeated in a few places. What did you
have in mind?
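As a rough, hypothetical sketch of option (b) -- confining the void *
filtering to the point where the tree is accessed -- none of the
following is code from either mail or from the patch:

    /* Zero-filled slots hold a tagged objcg, not a zswap_entry. */
    static struct zswap_entry *zswap_elem_to_entry(void *elem)
    {
            if (!elem || xa_pointer_tag(elem))
                    return NULL;
            return elem;
    }

Callers that only care about compressed entries could then keep working
with struct zswap_entry, with the zero-filled case handled separately at
the same access point.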
On 2024/3/26 07:50, Yosry Ahmed wrote:
> After the rbtree to xarray conversion, and dropping zswap_entry.refcount
> and zswap_entry.value, the only members of zswap_entry utilized by
> zero-filled pages are zswap_entry.length (always 0) and
> zswap_entry.objcg. Store the objcg pointer directly in the xarray as a
> tagged pointer and avoid allocating a zswap_entry completely for
> zero-filled pages.
>
> This simplifies the code as we no longer need to special case
> zero-length cases. We are also able to further separate the zero-filled
> pages handling logic and completely isolate them within store/load
> helpers. Handling tagged xarray pointers is handled in these two
> helpers, as well as the newly introduced helper for freeing tree
> elements, zswap_tree_free_element().
>
> There is also a small performance improvement observed over 50 runs of
> kernel build test (kernbench) comparing the mean build time on a skylake
> machine when building the kernel in a cgroup v1 container with a 3G
> limit. This is on top of the improvement from dropping support for
> non-zero same-filled pages:
>
> base patched % diff
> real 69.915 69.757 -0.229%
> user 2956.147 2955.244 -0.031%
> sys 2594.718 2575.747 -0.731%
>
> This probably comes from avoiding the zswap_entry allocation and
> cleanup/freeing for zero-filled pages. Note that the percentage of
> zero-filled pages during this test was only around 1.5% on average.
> Practical workloads could have a larger proportion of such pages (e.g.
> Johannes observed around 10% [1]), so the performance improvement should
> be larger.
>
> This change also saves a small amount of memory due to less allocated
> zswap_entry's. In the kernel build test above, we save around 2M of
> slab usage when we swap out 3G to zswap.
>
> [1]https://lore.kernel.org/linux-mm/20240320210716.GH294822@cmpxchg.org/
>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
The code looks good, just one comment below.
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
> ---
> mm/zswap.c | 137 ++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 78 insertions(+), 59 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 413d9242cf500..efc323bab2f22 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -183,12 +183,11 @@ static struct shrinker *zswap_shrinker;
> * struct zswap_entry
> *
[..]
>
> @@ -1531,26 +1552,27 @@ bool zswap_load(struct folio *folio)
> struct page *page = &folio->page;
> struct xarray *tree = swap_zswap_tree(swp);
> struct zswap_entry *entry;
> + struct obj_cgroup *objcg;
> + void *elem;
>
> VM_WARN_ON_ONCE(!folio_test_locked(folio));
>
> - entry = xa_erase(tree, offset);
> - if (!entry)
> + elem = xa_erase(tree, offset);
> + if (!elem)
> return false;
>
> - if (entry->length)
> + if (!zswap_load_zero_filled(elem, page, &objcg)) {
> + entry = elem;
nit: entry seems to be unused now.
> + objcg = entry->objcg;
> zswap_decompress(entry, page);
> - else
> - clear_highpage(page);
> + }
>
> count_vm_event(ZSWPIN);
> - if (entry->objcg)
> - count_objcg_event(entry->objcg, ZSWPIN);
> -
> - zswap_entry_free(entry);
> + if (objcg)
> + count_objcg_event(objcg, ZSWPIN);
>
> + zswap_tree_free_element(elem);
> folio_mark_dirty(folio);
> -
> return true;
> }
[..]
On Thu, Mar 28, 2024 at 1:12 AM Chengming Zhou <chengming.zhou@linux.dev> wrote:
>
> On 2024/3/26 07:50, Yosry Ahmed wrote:
> > After the rbtree to xarray conversion, and dropping zswap_entry.refcount
> > and zswap_entry.value, the only members of zswap_entry utilized by
> > zero-filled pages are zswap_entry.length (always 0) and
> > zswap_entry.objcg. Store the objcg pointer directly in the xarray as a
> > tagged pointer and avoid allocating a zswap_entry completely for
> > zero-filled pages.
> >
> > This simplifies the code as we no longer need to special case
> > zero-length cases. We are also able to further separate the zero-filled
> > pages handling logic and completely isolate them within store/load
> > helpers. Handling tagged xarray pointers is handled in these two
> > helpers, as well as the newly introduced helper for freeing tree
> > elements, zswap_tree_free_element().
> >
> > There is also a small performance improvement observed over 50 runs of
> > kernel build test (kernbench) comparing the mean build time on a skylake
> > machine when building the kernel in a cgroup v1 container with a 3G
> > limit. This is on top of the improvement from dropping support for
> > non-zero same-filled pages:
> >
> > base patched % diff
> > real 69.915 69.757 -0.229%
> > user 2956.147 2955.244 -0.031%
> > sys 2594.718 2575.747 -0.731%
> >
> > This probably comes from avoiding the zswap_entry allocation and
> > cleanup/freeing for zero-filled pages. Note that the percentage of
> > zero-filled pages during this test was only around 1.5% on average.
> > Practical workloads could have a larger proportion of such pages (e.g.
> > Johannes observed around 10% [1]), so the performance improvement should
> > be larger.
> >
> > This change also saves a small amount of memory due to less allocated
> > zswap_entry's. In the kernel build test above, we save around 2M of
> > slab usage when we swap out 3G to zswap.
> >
> > [1]https://lore.kernel.org/linux-mm/20240320210716.GH294822@cmpxchg.org/
> >
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
>
> The code looks good, just one comment below.
>
> Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Thanks!
>
> > ---
> > mm/zswap.c | 137 ++++++++++++++++++++++++++++++-----------------------
> > 1 file changed, 78 insertions(+), 59 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 413d9242cf500..efc323bab2f22 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -183,12 +183,11 @@ static struct shrinker *zswap_shrinker;
> > * struct zswap_entry
> > *
> [..]
> >
> > @@ -1531,26 +1552,27 @@ bool zswap_load(struct folio *folio)
> > struct page *page = &folio->page;
> > struct xarray *tree = swap_zswap_tree(swp);
> > struct zswap_entry *entry;
> > + struct obj_cgroup *objcg;
> > + void *elem;
> >
> > VM_WARN_ON_ONCE(!folio_test_locked(folio));
> >
> > - entry = xa_erase(tree, offset);
> > - if (!entry)
> > + elem = xa_erase(tree, offset);
> > + if (!elem)
> > return false;
> >
> > - if (entry->length)
> > + if (!zswap_load_zero_filled(elem, page, &objcg)) {
> > + entry = elem;
>
> nit: entry seems to be unused now.
I left it here on purpose to avoid casting elem in the next two lines;
it is just more aesthetic.
>
> > + objcg = entry->objcg;
> > zswap_decompress(entry, page);
> > - else
> > - clear_highpage(page);
> > + }
> >
> > count_vm_event(ZSWPIN);
> > - if (entry->objcg)
> > - count_objcg_event(entry->objcg, ZSWPIN);
> > -
> > - zswap_entry_free(entry);
> > + if (objcg)
> > + count_objcg_event(objcg, ZSWPIN);
> >
> > + zswap_tree_free_element(elem);
> > folio_mark_dirty(folio);
> > -
> > return true;
> > }
> [..]