Currently, zsmalloc creates kmem_cache of handles and zspages
for each pool, which may be suboptimal from the memory usage
point of view (extra internal fragmentation per pool). Systems
that create multiple zsmalloc pools may benefit from shared
common zsmalloc caches.
Make handles and zspages kmem caches global.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
mm/zsmalloc.c | 95 ++++++++++++++++++++++-----------------------------
1 file changed, 40 insertions(+), 55 deletions(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 5abb8bc0956a..05ed3539aa1e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -198,12 +198,13 @@ struct link_free {
};
};
+static struct kmem_cache *handle_cachep;
+static struct kmem_cache *zspage_cachep;
+
struct zs_pool {
const char *name;
struct size_class *size_class[ZS_SIZE_CLASSES];
- struct kmem_cache *handle_cachep;
- struct kmem_cache *zspage_cachep;
atomic_long_t pages_allocated;
@@ -376,60 +377,28 @@ static void init_deferred_free(struct zs_pool *pool) {}
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
#endif
-static int create_cache(struct zs_pool *pool)
+static unsigned long cache_alloc_handle(gfp_t gfp)
{
- char *name;
-
- name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name);
- if (!name)
- return -ENOMEM;
- pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE,
- 0, 0, NULL);
- kfree(name);
- if (!pool->handle_cachep)
- return -EINVAL;
-
- name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name);
- if (!name)
- return -ENOMEM;
- pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage),
- 0, 0, NULL);
- kfree(name);
- if (!pool->zspage_cachep) {
- kmem_cache_destroy(pool->handle_cachep);
- pool->handle_cachep = NULL;
- return -EINVAL;
- }
-
- return 0;
-}
+ gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
-static void destroy_cache(struct zs_pool *pool)
-{
- kmem_cache_destroy(pool->handle_cachep);
- kmem_cache_destroy(pool->zspage_cachep);
+ return (unsigned long)kmem_cache_alloc(handle_cachep, gfp);
}
-static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
+static void cache_free_handle(unsigned long handle)
{
- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ kmem_cache_free(handle_cachep, (void *)handle);
}
-static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
+static struct zspage *cache_alloc_zspage(gfp_t gfp)
{
- kmem_cache_free(pool->handle_cachep, (void *)handle);
-}
+ gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
-static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
-{
- return kmem_cache_zalloc(pool->zspage_cachep,
- flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ return kmem_cache_zalloc(zspage_cachep, gfp);
}
-static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
+static void cache_free_zspage(struct zspage *zspage)
{
- kmem_cache_free(pool->zspage_cachep, zspage);
+ kmem_cache_free(zspage_cachep, zspage);
}
/* class->lock(which owns the handle) synchronizes races */
@@ -858,7 +827,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
zpdesc = next;
} while (zpdesc != NULL);
- cache_free_zspage(pool, zspage);
+ cache_free_zspage(zspage);
class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
@@ -971,7 +940,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
{
int i;
struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
- struct zspage *zspage = cache_alloc_zspage(pool, gfp);
+ struct zspage *zspage = cache_alloc_zspage(gfp);
if (!zspage)
return NULL;
@@ -993,7 +962,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
zpdesc_dec_zone_page_state(zpdescs[i]);
free_zpdesc(zpdescs[i]);
}
- cache_free_zspage(pool, zspage);
+ cache_free_zspage(zspage);
return NULL;
}
__zpdesc_set_zsmalloc(zpdesc);
@@ -1346,7 +1315,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
if (unlikely(size > ZS_MAX_ALLOC_SIZE))
return (unsigned long)ERR_PTR(-ENOSPC);
- handle = cache_alloc_handle(pool, gfp);
+ handle = cache_alloc_handle(gfp);
if (!handle)
return (unsigned long)ERR_PTR(-ENOMEM);
@@ -1370,7 +1339,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
zspage = alloc_zspage(pool, class, gfp, nid);
if (!zspage) {
- cache_free_handle(pool, handle);
+ cache_free_handle(handle);
return (unsigned long)ERR_PTR(-ENOMEM);
}
@@ -1450,7 +1419,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
free_zspage(pool, class, zspage);
spin_unlock(&class->lock);
- cache_free_handle(pool, handle);
+ cache_free_handle(handle);
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -2112,9 +2081,6 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool->name)
goto err;
- if (create_cache(pool))
- goto err;
-
/*
* Iterate reversely, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
@@ -2236,7 +2202,6 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(class);
}
- destroy_cache(pool);
kfree(pool->name);
kfree(pool);
}
@@ -2246,10 +2211,28 @@ static int __init zs_init(void)
{
int rc __maybe_unused;
+ handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 0, 0,
+ NULL);
+ if (!handle_cachep)
+ return -ENOMEM;
+
+ zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), 0,
+ 0, NULL);
+ if (!zspage_cachep) {
+ kmem_cache_destroy(handle_cachep);
+ handle_cachep = NULL;
+ return -ENOMEM;
+ }
+
#ifdef CONFIG_COMPACTION
rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
- if (rc)
+ if (rc) {
+ kmem_cache_destroy(zspage_cachep);
+ kmem_cache_destroy(handle_cachep);
+ zspage_cachep = NULL;
+ handle_cachep = NULL;
return rc;
+ }
#endif
zs_stat_init();
return 0;
@@ -2261,6 +2244,8 @@ static void __exit zs_exit(void)
set_movable_ops(NULL, PGTY_zsmalloc);
#endif
zs_stat_exit();
+ kmem_cache_destroy(zspage_cachep);
+ kmem_cache_destroy(handle_cachep);
}
module_init(zs_init);
--
2.52.0.457.g6b5491de43-goog
On Thu, Jan 15, 2026 at 8:49 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> Currently, zsmalloc creates kmem_cache of handles and zspages
> for each pool, which may be suboptimal from the memory usage
> point of view (extra internal fragmentation per pool). Systems
> that create multiple zsmalloc pools may benefit from shared
> common zsmalloc caches.
>
> Make handles and zspages kmem caches global.

Hmm yeah this sounds reasonable to me. No reason to have dedicated
kmem_cache per zs_pool (in the case of zswap, I suppose it's one for
each compression algorithm, which is usually just one - but still...).

Is there any lock contention implications?

>
On (26/01/19 13:43), Nhat Pham wrote:
> On Thu, Jan 15, 2026 at 8:49 PM Sergey Senozhatsky
> <senozhatsky@chromium.org> wrote:
> >
> > Currently, zsmalloc creates kmem_cache of handles and zspages
> > for each pool, which may be suboptimal from the memory usage
> > point of view (extra internal fragmentation per pool). Systems
> > that create multiple zsmalloc pools may benefit from shared
> > common zsmalloc caches.
> >
> > Make handles and zspages kmem caches global.
>
> Hmm yeah this sounds reasonable to me. No reason to have dedicated
> kmem_cache per zs_pool (in the case of zswap, I suppose it's one for
> each compression algorithm, which is usually just one - but still...).
>
> Is there any lock contention implications?

cache_alloc_handle()/cache_alloc_zspage() (and their free counterparts)
are called outside of scope of any zsmalloc locks, so the upper boundary
on the number of concurrent callers is the same - num_online_cpus().
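
A simplified sketch of the ordering in zs_malloc() (error handling and the
zspage-allocation path trimmed, helper names approximate) - the handle comes
from the global cache before any zsmalloc lock is taken:

static unsigned long zs_malloc_sketch(struct zs_pool *pool, size_t size,
				      gfp_t gfp)
{
	unsigned long handle;
	struct size_class *class;

	/* allocated from the global cache, no zsmalloc lock held here */
	handle = cache_alloc_handle(gfp);
	if (!handle)
		return (unsigned long)ERR_PTR(-ENOMEM);

	class = pool->size_class[get_size_class_index(size)];

	/*
	 * The per-class lock is only taken after the cache allocation;
	 * a new zspage, when needed, is likewise allocated from the
	 * global cache with the lock dropped.
	 */
	spin_lock(&class->lock);
	/* ... link the object into a zspage, record the handle ... */
	spin_unlock(&class->lock);

	return handle;
}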
On Fri, Jan 16, 2026 at 01:48:41PM +0900, Sergey Senozhatsky wrote:
> Currently, zsmalloc creates kmem_cache of handles and zspages
> for each pool, which may be suboptimal from the memory usage
> point of view (extra internal fragmentation per pool). Systems
> that create multiple zsmalloc pools may benefit from shared
> common zsmalloc caches.
I had a similar patch internally when we had 32 zsmalloc pools with
zswap.
You can calculate the savings by using /proc/slabinfo. The unused memory
is (num_objs-active_objs)*objsize. You can sum this across all caches
when you have multiple pools, and compare it to the unused memory with a
single cache.
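
For instance, an untested userspace illustration (not part of the patch;
assumes the usual /proc/slabinfo column order of name, active_objs,
num_objs, objsize) that sums the waste across all zs_handle*/zspage*
caches, whether per-pool or global:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512], name[64];
	unsigned long active, num, objsize, unused = 0;
	FILE *f = fopen("/proc/slabinfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* header and version lines fail to parse and are skipped */
		if (sscanf(line, "%63s %lu %lu %lu",
			   name, &active, &num, &objsize) != 4)
			continue;
		/* matches both per-pool names (zs_handle-foo, zspage-foo)
		 * and the global ones (zs_handle, zspage) */
		if (!strncmp(name, "zs_handle", 9) || !strncmp(name, "zspage", 6))
			unused += (num - active) * objsize;
	}
	fclose(f);
	printf("unused zsmalloc cache memory: %lu bytes\n", unused);
	return 0;
}
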
>
> Make handles and zspages kmem caches global.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
> mm/zsmalloc.c | 95 ++++++++++++++++++++++-----------------------------
> 1 file changed, 40 insertions(+), 55 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 5abb8bc0956a..05ed3539aa1e 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -198,12 +198,13 @@ struct link_free {
> };
> };
>
> +static struct kmem_cache *handle_cachep;
> +static struct kmem_cache *zspage_cachep;
> +
> struct zs_pool {
> const char *name;
>
> struct size_class *size_class[ZS_SIZE_CLASSES];
> - struct kmem_cache *handle_cachep;
> - struct kmem_cache *zspage_cachep;
>
> atomic_long_t pages_allocated;
>
> @@ -376,60 +377,28 @@ static void init_deferred_free(struct zs_pool *pool) {}
> static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
> #endif
>
> -static int create_cache(struct zs_pool *pool)
> +static unsigned long cache_alloc_handle(gfp_t gfp)
> {
> - char *name;
> -
> - name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name);
> - if (!name)
> - return -ENOMEM;
> - pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE,
> - 0, 0, NULL);
> - kfree(name);
> - if (!pool->handle_cachep)
> - return -EINVAL;
> -
> - name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name);
> - if (!name)
> - return -ENOMEM;
> - pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage),
> - 0, 0, NULL);
> - kfree(name);
> - if (!pool->zspage_cachep) {
> - kmem_cache_destroy(pool->handle_cachep);
> - pool->handle_cachep = NULL;
> - return -EINVAL;
> - }
> -
> - return 0;
> -}
> + gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
>
> -static void destroy_cache(struct zs_pool *pool)
> -{
> - kmem_cache_destroy(pool->handle_cachep);
> - kmem_cache_destroy(pool->zspage_cachep);
> + return (unsigned long)kmem_cache_alloc(handle_cachep, gfp);
> }
>
> -static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
> +static void cache_free_handle(unsigned long handle)
> {
> - return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
> - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
> + kmem_cache_free(handle_cachep, (void *)handle);
> }
>
> -static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
> +static struct zspage *cache_alloc_zspage(gfp_t gfp)
> {
> - kmem_cache_free(pool->handle_cachep, (void *)handle);
> -}
> + gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
>
> -static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
> -{
> - return kmem_cache_zalloc(pool->zspage_cachep,
> - flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
> + return kmem_cache_zalloc(zspage_cachep, gfp);
> }
>
> -static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
> +static void cache_free_zspage(struct zspage *zspage)
> {
> - kmem_cache_free(pool->zspage_cachep, zspage);
> + kmem_cache_free(zspage_cachep, zspage);
> }
>
> /* class->lock(which owns the handle) synchronizes races */
> @@ -858,7 +827,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
> zpdesc = next;
> } while (zpdesc != NULL);
>
> - cache_free_zspage(pool, zspage);
> + cache_free_zspage(zspage);
>
> class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
> atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
> @@ -971,7 +940,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
> {
> int i;
> struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
> - struct zspage *zspage = cache_alloc_zspage(pool, gfp);
> + struct zspage *zspage = cache_alloc_zspage(gfp);
>
> if (!zspage)
> return NULL;
> @@ -993,7 +962,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
> zpdesc_dec_zone_page_state(zpdescs[i]);
> free_zpdesc(zpdescs[i]);
> }
> - cache_free_zspage(pool, zspage);
> + cache_free_zspage(zspage);
> return NULL;
> }
> __zpdesc_set_zsmalloc(zpdesc);
> @@ -1346,7 +1315,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
> if (unlikely(size > ZS_MAX_ALLOC_SIZE))
> return (unsigned long)ERR_PTR(-ENOSPC);
>
> - handle = cache_alloc_handle(pool, gfp);
> + handle = cache_alloc_handle(gfp);
> if (!handle)
> return (unsigned long)ERR_PTR(-ENOMEM);
>
> @@ -1370,7 +1339,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
>
> zspage = alloc_zspage(pool, class, gfp, nid);
> if (!zspage) {
> - cache_free_handle(pool, handle);
> + cache_free_handle(handle);
> return (unsigned long)ERR_PTR(-ENOMEM);
> }
>
> @@ -1450,7 +1419,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> free_zspage(pool, class, zspage);
>
> spin_unlock(&class->lock);
> - cache_free_handle(pool, handle);
> + cache_free_handle(handle);
> }
> EXPORT_SYMBOL_GPL(zs_free);
>
> @@ -2112,9 +2081,6 @@ struct zs_pool *zs_create_pool(const char *name)
> if (!pool->name)
> goto err;
>
> - if (create_cache(pool))
> - goto err;
> -
> /*
> * Iterate reversely, because, size of size_class that we want to use
> * for merging should be larger or equal to current size.
> @@ -2236,7 +2202,6 @@ void zs_destroy_pool(struct zs_pool *pool)
> kfree(class);
> }
>
> - destroy_cache(pool);
> kfree(pool->name);
> kfree(pool);
> }
> @@ -2246,10 +2211,28 @@ static int __init zs_init(void)
> {
> int rc __maybe_unused;
>
> + handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 0, 0,
> + NULL);
> + if (!handle_cachep)
> + return -ENOMEM;
> +
> + zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), 0,
> + 0, NULL);
> + if (!zspage_cachep) {
> + kmem_cache_destroy(handle_cachep);
> + handle_cachep = NULL;
> + return -ENOMEM;
> + }
> +
> #ifdef CONFIG_COMPACTION
> rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
> - if (rc)
> + if (rc) {
> + kmem_cache_destroy(zspage_cachep);
> + kmem_cache_destroy(handle_cachep);
> + zspage_cachep = NULL;
> + handle_cachep = NULL;
> return rc;
> + }
> #endif
> zs_stat_init();
> return 0;
> @@ -2261,6 +2244,8 @@ static void __exit zs_exit(void)
> set_movable_ops(NULL, PGTY_zsmalloc);
> #endif
> zs_stat_exit();
> + kmem_cache_destroy(zspage_cachep);
> + kmem_cache_destroy(handle_cachep);
> }
Hmm instead of the repeated kmem_cache_destroy() calls, can we do sth
like this:
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index dccb88d52c07..86e2ca95ac4c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2235,14 +2235,43 @@ void zs_destroy_pool(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
+static void __init zs_destroy_caches(void)
+{
+ kmem_cache_destroy(zs_handle_cache);
+ zs_handle_cache = NULL;
+ kmem_cache_destroy(zspage_cache);
+ zspage_cache = NULL;
+}
+
+static int __init zs_init_caches(void)
+{
+ zs_handle_cache = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ zspage_cache = kmem_cache_create("zspage", sizeof(struct zspage),
+ 0, 0, NULL);
+
+ if (!zs_handle_cache || !zspage_cache) {
+ zs_destroy_caches();
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+
static int __init zs_init(void)
{
- int rc __maybe_unused;
+ int rc;
+
+ rc = zs_init_caches();
+ if (rc)
+ return rc;
#ifdef CONFIG_COMPACTION
rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
- if (rc)
+ if (rc) {
+ zs_destroy_caches();
return rc;
+ }
#endif
zs_stat_init();
return 0;
>
> module_init(zs_init);
> --
> 2.52.0.457.g6b5491de43-goog
>
On (26/01/16 20:49), Yosry Ahmed wrote:
> On Fri, Jan 16, 2026 at 01:48:41PM +0900, Sergey Senozhatsky wrote:
> > Currently, zsmalloc creates kmem_cache of handles and zspages
> > for each pool, which may be suboptimal from the memory usage
> > point of view (extra internal fragmentation per pool). Systems
> > that create multiple zsmalloc pools may benefit from shared
> > common zsmalloc caches.
>
> I had a similar patch internally when we had 32 zsmalloc pools with
> zswap.

Oh, nice.

> You can calculate the savings by using /proc/slabinfo. The unused memory
> is (num_objs-active_objs)*objsize. You can sum this across all caches
> when you have multiple pools, and compare it to the unused memory with a
> single cache.

Right. Just curious, do you recall any numbers?

[..]

> Hmm instead of the repeated kmem_cache_destroy() calls, can we do sth
> like this:

Sure.
On Sat, Jan 17, 2026 at 11:24:01AM +0900, Sergey Senozhatsky wrote:
> On (26/01/16 20:49), Yosry Ahmed wrote:
> > On Fri, Jan 16, 2026 at 01:48:41PM +0900, Sergey Senozhatsky wrote:
> > > Currently, zsmalloc creates kmem_cache of handles and zspages
> > > for each pool, which may be suboptimal from the memory usage
> > > point of view (extra internal fragmentation per pool). Systems
> > > that create multiple zsmalloc pools may benefit from shared
> > > common zsmalloc caches.
> >
> > I had a similar patch internally when we had 32 zsmalloc pools with
> > zswap.
>
> Oh, nice.
>
> > You can calculate the savings by using /proc/slabinfo. The unused memory
> > is (num_objs-active_objs)*objsize. You can sum this across all caches
> > when you have multiple pools, and compare it to the unused memory with a
> > single cache.
>
> Right. Just curious, do you recall any numbers?
I have the exact numbers actually, from /proc/slabinfo while running a
zswap (internal) test:
*** Before:
# name <active_objs> <num_objs> <objsize> ..
zs_handle 35637 35760 16 ...
zs_handle 35577 35760 16 ...
zs_handle 35638 35760 16 ...
zs_handle 35700 35760 16 ...
zs_handle 35937 36240 16 ...
zs_handle 35518 35760 16 ...
zs_handle 35700 36000 16 ...
zs_handle 35517 35760 16 ...
zs_handle 35818 36000 16 ...
zs_handle 35698 35760 16 ...
zs_handle 35536 35760 16 ...
zs_handle 35877 36240 16 ...
zs_handle 35757 36000 16 ...
zs_handle 35760 36000 16 ...
zs_handle 35820 36000 16 ...
zs_handle 35999 36000 16 ...
zs_handle 35700 36000 16 ...
zs_handle 35817 36000 16 ...
zs_handle 35698 36000 16 ...
zs_handle 35699 36000 16 ...
zs_handle 35580 35760 16 ...
zs_handle 35578 35760 16 ...
zs_handle 35820 36000 16 ...
zs_handle 35517 35760 16 ...
zs_handle 35700 36000 16 ...
zs_handle 35640 35760 16 ...
zs_handle 35820 36000 16 ...
zs_handle 35578 35760 16 ...
zs_handle 35578 35760 16 ...
zs_handle 35817 36000 16 ...
zs_handle 35518 35760 16 ...
zs_handle 35940 36240 16 ...
zspage 991 1079 48 ...
zspage 936 996 48 ...
zspage 940 996 48 ...
zspage 1050 1079 48 ...
zspage 973 1079 48 ...
zspage 942 996 48 ...
zspage 1065 1162 48 ...
zspage 885 996 48 ...
zspage 887 913 48 ...
zspage 1053 1079 48 ...
zspage 983 996 48 ...
zspage 966 996 48 ...
zspage 970 1079 48 ...
zspage 880 913 48 ...
zspage 1006 1079 48 ...
zspage 998 1079 48 ...
zspage 1129 1162 48 ...
zspage 903 913 48 ...
zspage 833 996 48 ...
zspage 861 913 48 ...
zspage 764 913 48 ...
zspage 898 913 48 ...
zspage 973 1079 48 ...
zspage 945 996 48 ...
zspage 943 1079 48 ...
zspage 1024 1079 48 ...
zspage 820 913 48 ...
zspage 702 830 48 ...
zspage 1049 1079 48 ...
zspage 990 1162 48 ...
zspage 988 1079 48 ...
zspage 932 996 48 ...
Unused memory = $(awk '{s += $4*($3-$2)} END {print s}') = 218416 bytes
*** After:
# name <active_objs> <num_objs> <objsize> ..
zs_handle 1054440 1054800 16 ...
zspage 5720 5810 48 ...
Unused memory = (1054800-1054440)*16 + (5810-5720)*48 = 10080 bytes
That was about ~20 times reduction in waste when using 32 pools with
zswap. I suspect we wouldn't be using that many pools with zram.
>
> [..]
> > Hmm instead of the repeated kmem_cache_destroy() calls, can we do sth
> > like this:
>
> Sure.
On (26/01/21 01:30), Yosry Ahmed wrote:
>
> That was about ~20 times reduction in waste when using 32 pools with
> zswap.

Nice, thanks!

> I suspect we wouldn't be using that many pools with zram.

Hard to tell. We have users that setup many zram devices, even multiple
swap zram devices. In terms of numbers - I think swap zram users still
setup less devices than users that use zram as a normal block device
(that number in theory can be quite high, depending on use case).
On (26/01/16 13:48), Sergey Senozhatsky wrote:
> Currently, zsmalloc creates kmem_cache of handles and zspages
> for each pool, which may be suboptimal from the memory usage
> point of view (extra internal fragmentation per pool). Systems
> that create multiple zsmalloc pools may benefit from shared
> common zsmalloc caches.

This is step 1.

Step 2 is to look into possibility of sharing zsmalloc pools.
E.g. if there are N zram devices in the system, do we really need
N zsmalloc pools? Can we just share a single pool between them?
On Thu, Jan 15, 2026 at 9:53 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> On (26/01/16 13:48), Sergey Senozhatsky wrote:
> > Currently, zsmalloc creates kmem_cache of handles and zspages
> > for each pool, which may be suboptimal from the memory usage
> > point of view (extra internal fragmentation per pool). Systems
> > that create multiple zsmalloc pools may benefit from shared
> > common zsmalloc caches.
>
> This is step 1.
>
> Step 2 is to look into possibility of sharing zsmalloc pools.
> E.g. if there are N zram devices in the system, do we really need
> N zsmalloc pools? Can we just share a single pool between them?

Ditto for zswap (although here, we almost always only have a single zswap pool).
On (26/01/19 13:44), Nhat Pham wrote:
> On Thu, Jan 15, 2026 at 9:53 PM Sergey Senozhatsky
> <senozhatsky@chromium.org> wrote:
> >
> > On (26/01/16 13:48), Sergey Senozhatsky wrote:
> > > Currently, zsmalloc creates kmem_cache of handles and zspages
> > > for each pool, which may be suboptimal from the memory usage
> > > point of view (extra internal fragmentation per pool). Systems
> > > that create multiple zsmalloc pools may benefit from shared
> > > common zsmalloc caches.
> >
> > This is step 1.
> >
> > Step 2 is to look into possibility of sharing zsmalloc pools.
> > E.g. if there are N zram devices in the system, do we really need
> > N zsmalloc pools? Can we just share a single pool between them?
>
> Ditto for zswap (although here, we almost always only have a single zswap pool).
COMPLETELY UNTESTED (current linux-next doesn't boot for me, hitting
an "Oops: stack guard page: 0000" early during boot).
So I'm thinking of something like below. Basically have a Kconfig
option to turn zsmalloc into a singleton pool mode, transparently
for zsmalloc users.
---
mm/Kconfig | 11 ++++++++
mm/zsmalloc.c | 73 ++++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 72 insertions(+), 12 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4fc1a171dffa..ff6855e74c3d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -132,6 +132,17 @@ menu "Zsmalloc allocator options"
comment "Zsmalloc is a common backend allocator for zswap & zram"
+config ZSMALLOC_SINGLETON_POOL
+ bool "Use a singleton zsmalloc pool"
+ default n
+ help
+ This option enables the use of a single global zsmalloc pool
+ instance for all users of zsmalloc (e.g., zswap, zram). This
+ reduces memory overhead and fragmentation by sharing size class
+ configurations and memory between different users.
+
+ If unsure, say N.
+
config ZSMALLOC_STAT
bool "Export zsmalloc statistics"
select DEBUG_FS
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 8df45aa1b5c8..acd14b001342 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -224,6 +224,10 @@ struct zs_pool {
atomic_t compaction_in_progress;
};
+#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
+static struct zs_pool *zs_singleton_pool;
+#endif
+
static inline void zpdesc_set_first(struct zpdesc *zpdesc)
{
SetPagePrivate(zpdesc_page(zpdesc));
@@ -2051,17 +2055,7 @@ static int calculate_zspage_chain_size(int class_size)
return chain_size;
}
-/**
- * zs_create_pool - Creates an allocation pool to work from.
- * @name: pool name to be created
- *
- * This function must be called before anything when using
- * the zsmalloc allocator.
- *
- * On success, a pointer to the newly created pool is returned,
- * otherwise NULL.
- */
-struct zs_pool *zs_create_pool(const char *name)
+static struct zs_pool *__zs_create_pool(const char *name)
{
int i;
struct zs_pool *pool;
@@ -2170,9 +2164,29 @@ struct zs_pool *zs_create_pool(const char *name)
zs_destroy_pool(pool);
return NULL;
}
+
+/**
+ * zs_create_pool - Creates an allocation pool to work from.
+ * @name: pool name to be created
+ *
+ * This function must be called before anything when using
+ * the zsmalloc allocator.
+ *
+ * On success, a pointer to the newly created pool is returned,
+ * otherwise NULL.
+ */
+struct zs_pool *zs_create_pool(const char *name)
+{
+#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
+ return zs_singleton_pool;
+#else
+ return __zs_create_pool(name);
+#endif
+
+}
EXPORT_SYMBOL_GPL(zs_create_pool);
-void zs_destroy_pool(struct zs_pool *pool)
+static void __zs_destroy_pool(struct zs_pool *pool)
{
int i;
@@ -2203,8 +2217,35 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(pool->name);
kfree(pool);
}
+
+void zs_destroy_pool(struct zs_pool *pool __maybe_unused)
+{
+#ifndef CONFIG_ZSMALLOC_SINGLETON_POOL
+ __zs_destroy_pool(pool);
+#endif
+}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
+static void zs_destroy_singleton_pool(void)
+{
+#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
+ if (zs_singleton_pool) {
+ __zs_destroy_pool(zs_singleton_pool);
+ zs_singleton_pool = NULL;
+ }
+#endif
+}
+
+static int zs_create_singleton_pool(void)
+{
+#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
+ zs_singleton_pool = __zs_create_pool("zsmalloc");
+ if (!zs_singleton_pool)
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
static void zs_destroy_caches(void)
{
kmem_cache_destroy(handle_cachep);
@@ -2235,9 +2276,16 @@ static int __init zs_init(void)
if (rc)
return rc;
+ rc = zs_create_singleton_pool();
+ if (rc) {
+ zs_destroy_caches();
+ return rc;
+ }
+
#ifdef CONFIG_COMPACTION
rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
if (rc) {
+ zs_destroy_singleton_pool();
zs_destroy_caches();
return rc;
}
@@ -2252,6 +2300,7 @@ static void __exit zs_exit(void)
set_movable_ops(NULL, PGTY_zsmalloc);
#endif
zs_stat_exit();
+ zs_destroy_singleton_pool();
zs_destroy_caches();
}
--
2.52.0.457.g6b5491de43-goog
On Wed, Jan 21, 2026 at 12:41:39PM +0900, Sergey Senozhatsky wrote:
> On (26/01/19 13:44), Nhat Pham wrote:
> > On Thu, Jan 15, 2026 at 9:53 PM Sergey Senozhatsky
> > <senozhatsky@chromium.org> wrote:
> > >
> > > On (26/01/16 13:48), Sergey Senozhatsky wrote:
> > > > Currently, zsmalloc creates kmem_cache of handles and zspages
> > > > for each pool, which may be suboptimal from the memory usage
> > > > point of view (extra internal fragmentation per pool). Systems
> > > > that create multiple zsmalloc pools may benefit from shared
> > > > common zsmalloc caches.
> > >
> > > This is step 1.
> > >
> > > Step 2 is to look into possibility of sharing zsmalloc pools.
> > > E.g. if there are N zram devices in the system, do we really need
> > > N zsmalloc pools? Can we just share a single pool between them?
> >
> > Ditto for zswap (although here, we almost always only have a single zswap pool).
>
> COMPLETELY UNTESTED (current linux-next doesn't boot for me, hitting
> an "Oops: stack guard page: 0000" early during boot).
>
> So I'm thinking of something like below. Basically have a Kconfig
> option to turn zsmalloc into a singleton pool mode, transparently
> for zsmalloc users.
Why do we need a config option? Is the main concern with a single pool
lock contention? If yes, we can probably measure it by spawning many
zram devices and stressing them at the same time.
>
> ---
> mm/Kconfig | 11 ++++++++
> mm/zsmalloc.c | 73 ++++++++++++++++++++++++++++++++++++++++++---------
> 2 files changed, 72 insertions(+), 12 deletions(-)
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 4fc1a171dffa..ff6855e74c3d 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -132,6 +132,17 @@ menu "Zsmalloc allocator options"
>
> comment "Zsmalloc is a common backend allocator for zswap & zram"
>
> +config ZSMALLOC_SINGLETON_POOL
> + bool "Use a singleton zsmalloc pool"
> + default n
> + help
> + This option enables the use of a single global zsmalloc pool
> + instance for all users of zsmalloc (e.g., zswap, zram). This
> + reduces memory overhead and fragmentation by sharing size class
> + configurations and memory between different users.
> +
> + If unsure, say N.
> +
> config ZSMALLOC_STAT
> bool "Export zsmalloc statistics"
> select DEBUG_FS
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 8df45aa1b5c8..acd14b001342 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -224,6 +224,10 @@ struct zs_pool {
> atomic_t compaction_in_progress;
> };
>
> +#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
> +static struct zs_pool *zs_singleton_pool;
> +#endif
> +
> static inline void zpdesc_set_first(struct zpdesc *zpdesc)
> {
> SetPagePrivate(zpdesc_page(zpdesc));
> @@ -2051,17 +2055,7 @@ static int calculate_zspage_chain_size(int class_size)
> return chain_size;
> }
>
> -/**
> - * zs_create_pool - Creates an allocation pool to work from.
> - * @name: pool name to be created
> - *
> - * This function must be called before anything when using
> - * the zsmalloc allocator.
> - *
> - * On success, a pointer to the newly created pool is returned,
> - * otherwise NULL.
> - */
> -struct zs_pool *zs_create_pool(const char *name)
> +static struct zs_pool *__zs_create_pool(const char *name)
> {
> int i;
> struct zs_pool *pool;
> @@ -2170,9 +2164,29 @@ struct zs_pool *zs_create_pool(const char *name)
> zs_destroy_pool(pool);
> return NULL;
> }
> +
> +/**
> + * zs_create_pool - Creates an allocation pool to work from.
> + * @name: pool name to be created
> + *
> + * This function must be called before anything when using
> + * the zsmalloc allocator.
> + *
> + * On success, a pointer to the newly created pool is returned,
> + * otherwise NULL.
> + */
> +struct zs_pool *zs_create_pool(const char *name)
> +{
> +#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
> + return zs_singleton_pool;
> +#else
> + return __zs_create_pool(name);
> +#endif
> +
> +}
> EXPORT_SYMBOL_GPL(zs_create_pool);
>
> -void zs_destroy_pool(struct zs_pool *pool)
> +static void __zs_destroy_pool(struct zs_pool *pool)
> {
> int i;
>
> @@ -2203,8 +2217,35 @@ void zs_destroy_pool(struct zs_pool *pool)
> kfree(pool->name);
> kfree(pool);
> }
> +
> +void zs_destroy_pool(struct zs_pool *pool __maybe_unused)
> +{
> +#ifndef CONFIG_ZSMALLOC_SINGLETON_POOL
> + __zs_destroy_pool(pool);
> +#endif
> +}
> EXPORT_SYMBOL_GPL(zs_destroy_pool);
>
> +static void zs_destroy_singleton_pool(void)
> +{
> +#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
> + if (zs_singleton_pool) {
> + __zs_destroy_pool(zs_singleton_pool);
> + zs_singleton_pool = NULL;
> + }
> +#endif
> +}
> +
> +static int zs_create_singleton_pool(void)
> +{
> +#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
> + zs_singleton_pool = __zs_create_pool("zsmalloc");
> + if (!zs_singleton_pool)
> + return -ENOMEM;
> +#endif
> + return 0;
> +}
> +
> static void zs_destroy_caches(void)
> {
> kmem_cache_destroy(handle_cachep);
> @@ -2235,9 +2276,16 @@ static int __init zs_init(void)
> if (rc)
> return rc;
>
> + rc = zs_create_singleton_pool();
> + if (rc) {
> + zs_destroy_caches();
> + return rc;
> + }
> +
> #ifdef CONFIG_COMPACTION
> rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
> if (rc) {
> + zs_destroy_singleton_pool();
> zs_destroy_caches();
> return rc;
> }
> @@ -2252,6 +2300,7 @@ static void __exit zs_exit(void)
> set_movable_ops(NULL, PGTY_zsmalloc);
> #endif
> zs_stat_exit();
> + zs_destroy_singleton_pool();
> zs_destroy_caches();
> }
>
> --
> 2.52.0.457.g6b5491de43-goog
>
On (26/01/21 23:58), Yosry Ahmed wrote:
> On Wed, Jan 21, 2026 at 12:41:39PM +0900, Sergey Senozhatsky wrote:
> > On (26/01/19 13:44), Nhat Pham wrote:
> > > On Thu, Jan 15, 2026 at 9:53 PM Sergey Senozhatsky
> > > <senozhatsky@chromium.org> wrote:
> > > >
> > > > On (26/01/16 13:48), Sergey Senozhatsky wrote:
> > > > > Currently, zsmalloc creates kmem_cache of handles and zspages
> > > > > for each pool, which may be suboptimal from the memory usage
> > > > > point of view (extra internal fragmentation per pool). Systems
> > > > > that create multiple zsmalloc pools may benefit from shared
> > > > > common zsmalloc caches.
> > > >
> > > > This is step 1.
> > > >
> > > > Step 2 is to look into possibility of sharing zsmalloc pools.
> > > > E.g. if there are N zram devices in the system, do we really need
> > > > N zsmalloc pools? Can we just share a single pool between them?
> > >
> > > Ditto for zswap (although here, we almost always only have a single zswap pool).
> >
> > COMPLETELY UNTESTED (current linux-next doesn't boot for me, hitting
> > an "Oops: stack guard page: 0000" early during boot).
> >
> > So I'm thinking of something like below. Basically have a Kconfig
> > option to turn zsmalloc into a singleton pool mode, transparently
> > for zsmalloc users.
>
> Why do we need a config option? Is the main concern with a single pool
> lock contention? If yes, we can probably measure it by spawning many
> zram devices and stressing them at the same time.

That's a good question. I haven't thought about just converting
zsmalloc to a singleton pool by default. I don't think I'm
concerned with lock contention, the thing is we should have the
same upper boundary contention wise (there are only num_online_cpus()
tasks that can concurrently access any zsmalloc pool, be it a singleton
or not). I certainly will try to measure once I have linux-next booting
again.

What was the reason why you allocated many zsmalloc pool in zswap?
On Thu, Jan 22, 2026 at 12:28:56PM +0900, Sergey Senozhatsky wrote:
> On (26/01/21 23:58), Yosry Ahmed wrote:
> > On Wed, Jan 21, 2026 at 12:41:39PM +0900, Sergey Senozhatsky wrote:
> > > On (26/01/19 13:44), Nhat Pham wrote:
> > > > On Thu, Jan 15, 2026 at 9:53 PM Sergey Senozhatsky
> > > > <senozhatsky@chromium.org> wrote:
> > > > >
> > > > > On (26/01/16 13:48), Sergey Senozhatsky wrote:
> > > > > > Currently, zsmalloc creates kmem_cache of handles and zspages
> > > > > > for each pool, which may be suboptimal from the memory usage
> > > > > > point of view (extra internal fragmentation per pool). Systems
> > > > > > that create multiple zsmalloc pools may benefit from shared
> > > > > > common zsmalloc caches.
> > > > >
> > > > > This is step 1.
> > > > >
> > > > > Step 2 is to look into possibility of sharing zsmalloc pools.
> > > > > E.g. if there are N zram devices in the system, do we really need
> > > > > N zsmalloc pools? Can we just share a single pool between them?
> > > >
> > > > Ditto for zswap (although here, we almost always only have a single zswap pool).
> > >
> > > COMPLETELY UNTESTED (current linux-next doesn't boot for me, hitting
> > > an "Oops: stack guard page: 0000" early during boot).
> > >
> > > So I'm thinking of something like below. Basically have a Kconfig
> > > option to turn zsmalloc into a singleton pool mode, transparently
> > > for zsmalloc users.
> >
> > Why do we need a config option? Is the main concern with a single pool
> > lock contention? If yes, we can probably measure it by spawning many
> > zram devices and stressing them at the same time.
>
> That's a good question. I haven't thought about just converting
> zsmalloc to a singleton pool by default. I don't think I'm
> concerned with lock contention, the thing is we should have the
> same upper boundary contention wise (there are only num_online_cpus()
> tasks that can concurrently access any zsmalloc pool, be it a singleton
> or not). I certainly will try to measure once I have linux-next booting
> again.
>
> What was the reason why you allocated many zsmalloc pool in zswap?

IIRC it was actually lock contention, specifically the pool spinlock.
When the change was made to per-class spinlocks, we dropped the multiple
pools:
http://lore.kernel.org/linux-mm/20240617-zsmalloc-lock-mm-everything-v1-0-5e5081ea11b3@linux.dev/.

So having multiple pools does mitigate lock contention in some cases.
Even though the upper boundary might be the same, the actual number of
CPUs contending on the same lock would go down in practice.

While looking for this, I actually found something more interesting. I
did propose more-or-less the same exact patch back when zswap used
multiple pools:
https://lore.kernel.org/all/20240604175340.218175-1-yosryahmed@google.com/.

Seems like Minchan had some concerns back then. I wonder if those still
apply.
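
For context, the per-class locking that replaced the pool-wide spinlock looks
roughly like the sketch below (field names and layout are approximate, not the
exact upstream structs). Concurrent zs_malloc()/zs_free() callers only contend
when they hit the same size class, which is why splitting traffic across pools
can still reduce contention in practice even though the global upper bound on
concurrent callers stays the same.

/* approximate layout, for illustration only */
struct size_class {
	spinlock_t lock;	/* protects this class's zspage lists */
	struct list_head fullness_list[NR_FULLNESS_GROUPS];
	int size;		/* object size this class serves */
	int objs_per_zspage;
	int pages_per_zspage;
};

struct zs_pool {
	const char *name;
	struct size_class *size_class[ZS_SIZE_CLASSES];
	atomic_long_t pages_allocated;
	/* no pool-wide allocation lock since the per-class lock conversion */
};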
On (26/01/22 03:39), Yosry Ahmed wrote:
[..]
> > That's a good question. I haven't thought about just converting
> > zsmalloc to a singleton pool by default. I don't think I'm
> > concerned with lock contention, the thing is we should have the
> > same upper boundary contention wise (there are only num_online_cpus()
> > tasks that can concurrently access any zsmalloc pool, be it a singleton
> > or not). I certainly will try to measure once I have linux-next booting
> > again.
> >
> > What was the reason why you allocated many zsmalloc pool in zswap?
>
> IIRC it was actually lock contention, specifically the pool spinlock.
> When the change was made to per-class spinlocks, we dropped the multiple
> pools:
> http://lore.kernel.org/linux-mm/20240617-zsmalloc-lock-mm-everything-v1-0-5e5081ea11b3@linux.dev/.
>
> So having multiple pools does mitigate lock contention in some cases.
> Even though the upper boundary might be the same, the actual number of
> CPUs contending on the same lock would go down in practice.
>
> While looking for this, I actually found something more interesting. I
> did propose more-or-less the same exact patch back when zswap used
> multiple pools:
> https://lore.kernel.org/all/20240604175340.218175-1-yosryahmed@google.com/.
>
> Seems like Minchan had some concerns back then. I wonder if those still
> apply.

Interesting. Lifecycles are completely random, I don't see how we can
make any assumptions about them and how we can rely on them to
avoid/control fragmentation. I think we should have global caches.