Currently, zsmalloc creates kmem_cache of handles and zspages
for each pool, which may be suboptimal from the memory usage
point of view (extra internal fragmentation per pool). Systems
that create multiple zsmalloc pools may benefit from shared
common zsmalloc caches.
Make handles and zspages kmem caches global.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
mm/zsmalloc.c | 95 ++++++++++++++++++++++-----------------------------
1 file changed, 40 insertions(+), 55 deletions(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 5abb8bc0956a..05ed3539aa1e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -198,12 +198,13 @@ struct link_free {
};
};
+static struct kmem_cache *handle_cachep;
+static struct kmem_cache *zspage_cachep;
+
struct zs_pool {
const char *name;
struct size_class *size_class[ZS_SIZE_CLASSES];
- struct kmem_cache *handle_cachep;
- struct kmem_cache *zspage_cachep;
atomic_long_t pages_allocated;
@@ -376,60 +377,28 @@ static void init_deferred_free(struct zs_pool *pool) {}
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
#endif
-static int create_cache(struct zs_pool *pool)
+static unsigned long cache_alloc_handle(gfp_t gfp)
{
- char *name;
-
- name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name);
- if (!name)
- return -ENOMEM;
- pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE,
- 0, 0, NULL);
- kfree(name);
- if (!pool->handle_cachep)
- return -EINVAL;
-
- name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name);
- if (!name)
- return -ENOMEM;
- pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage),
- 0, 0, NULL);
- kfree(name);
- if (!pool->zspage_cachep) {
- kmem_cache_destroy(pool->handle_cachep);
- pool->handle_cachep = NULL;
- return -EINVAL;
- }
-
- return 0;
-}
+ gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
-static void destroy_cache(struct zs_pool *pool)
-{
- kmem_cache_destroy(pool->handle_cachep);
- kmem_cache_destroy(pool->zspage_cachep);
+ return (unsigned long)kmem_cache_alloc(handle_cachep, gfp);
}
-static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
+static void cache_free_handle(unsigned long handle)
{
- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ kmem_cache_free(handle_cachep, (void *)handle);
}
-static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
+static struct zspage *cache_alloc_zspage(gfp_t gfp)
{
- kmem_cache_free(pool->handle_cachep, (void *)handle);
-}
+ gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
-static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
-{
- return kmem_cache_zalloc(pool->zspage_cachep,
- flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ return kmem_cache_zalloc(zspage_cachep, gfp);
}
-static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
+static void cache_free_zspage(struct zspage *zspage)
{
- kmem_cache_free(pool->zspage_cachep, zspage);
+ kmem_cache_free(zspage_cachep, zspage);
}
/* class->lock(which owns the handle) synchronizes races */
@@ -858,7 +827,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
zpdesc = next;
} while (zpdesc != NULL);
- cache_free_zspage(pool, zspage);
+ cache_free_zspage(zspage);
class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
@@ -971,7 +940,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
{
int i;
struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
- struct zspage *zspage = cache_alloc_zspage(pool, gfp);
+ struct zspage *zspage = cache_alloc_zspage(gfp);
if (!zspage)
return NULL;
@@ -993,7 +962,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
zpdesc_dec_zone_page_state(zpdescs[i]);
free_zpdesc(zpdescs[i]);
}
- cache_free_zspage(pool, zspage);
+ cache_free_zspage(zspage);
return NULL;
}
__zpdesc_set_zsmalloc(zpdesc);
@@ -1346,7 +1315,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
if (unlikely(size > ZS_MAX_ALLOC_SIZE))
return (unsigned long)ERR_PTR(-ENOSPC);
- handle = cache_alloc_handle(pool, gfp);
+ handle = cache_alloc_handle(gfp);
if (!handle)
return (unsigned long)ERR_PTR(-ENOMEM);
@@ -1370,7 +1339,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
zspage = alloc_zspage(pool, class, gfp, nid);
if (!zspage) {
- cache_free_handle(pool, handle);
+ cache_free_handle(handle);
return (unsigned long)ERR_PTR(-ENOMEM);
}
@@ -1450,7 +1419,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
free_zspage(pool, class, zspage);
spin_unlock(&class->lock);
- cache_free_handle(pool, handle);
+ cache_free_handle(handle);
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -2112,9 +2081,6 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool->name)
goto err;
- if (create_cache(pool))
- goto err;
-
/*
* Iterate reversely, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
@@ -2236,7 +2202,6 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(class);
}
- destroy_cache(pool);
kfree(pool->name);
kfree(pool);
}
@@ -2246,10 +2211,28 @@ static int __init zs_init(void)
{
int rc __maybe_unused;
+ handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 0, 0,
+ NULL);
+ if (!handle_cachep)
+ return -ENOMEM;
+
+ zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), 0,
+ 0, NULL);
+ if (!zspage_cachep) {
+ kmem_cache_destroy(handle_cachep);
+ handle_cachep = NULL;
+ return -ENOMEM;
+ }
+
#ifdef CONFIG_COMPACTION
rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
- if (rc)
+ if (rc) {
+ kmem_cache_destroy(zspage_cachep);
+ kmem_cache_destroy(handle_cachep);
+ zspage_cachep = NULL;
+ handle_cachep = NULL;
return rc;
+ }
#endif
zs_stat_init();
return 0;
@@ -2261,6 +2244,8 @@ static void __exit zs_exit(void)
set_movable_ops(NULL, PGTY_zsmalloc);
#endif
zs_stat_exit();
+ kmem_cache_destroy(zspage_cachep);
+ kmem_cache_destroy(handle_cachep);
}
module_init(zs_init);
--
2.52.0.457.g6b5491de43-goog
On Thu, Jan 15, 2026 at 8:49 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> Currently, zsmalloc creates kmem_cache of handles and zspages
> for each pool, which may be suboptimal from the memory usage
> point of view (extra internal fragmentation per pool). Systems
> that create multiple zsmalloc pools may benefit from shared
> common zsmalloc caches.
>
> Make handles and zspages kmem caches global.

Hmm yeah this sounds reasonable to me. No reason to have dedicated
kmem_cache per zs_pool (in the case of zswap, I suppose it's one for
each compression algorithm, which is usually just one - but still...).

Is there any lock contention implications?

>
On (26/01/19 13:43), Nhat Pham wrote:
> On Thu, Jan 15, 2026 at 8:49 PM Sergey Senozhatsky
> <senozhatsky@chromium.org> wrote:
> >
> > Currently, zsmalloc creates kmem_cache of handles and zspages
> > for each pool, which may be suboptimal from the memory usage
> > point of view (extra internal fragmentation per pool). Systems
> > that create multiple zsmalloc pools may benefit from shared
> > common zsmalloc caches.
> >
> > Make handles and zspages kmem caches global.
>
> Hmm yeah this sounds reasonable to me. No reason to have dedicated
> kmem_cache per zs_pool (in the case of zswap, I suppose it's one for
> each compression algorithm, which is usually just one - but still...).
>
> Is there any lock contention implications?

cache_alloc_handle()/cache_alloc_zspage() (and their free counterparts)
are called outside of scope of any zsmalloc locks, so the upper boundary
on the number of concurrent callers is the same - num_online_cpus().
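
A simplified sketch of the ordering in zs_malloc() (error handling and the
zspage-allocation path trimmed, helper names approximate) - the handle comes
from the global cache before any zsmalloc lock is taken:

static unsigned long zs_malloc_sketch(struct zs_pool *pool, size_t size,
				      gfp_t gfp)
{
	unsigned long handle;
	struct size_class *class;

	/* allocated from the global cache, no zsmalloc lock held here */
	handle = cache_alloc_handle(gfp);
	if (!handle)
		return (unsigned long)ERR_PTR(-ENOMEM);

	class = pool->size_class[get_size_class_index(size)];

	/*
	 * The per-class lock is only taken after the cache allocation;
	 * a new zspage, when needed, is likewise allocated from the
	 * global cache with the lock dropped.
	 */
	spin_lock(&class->lock);
	/* ... link the object into a zspage, record the handle ... */
	spin_unlock(&class->lock);

	return handle;
}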
On Fri, Jan 16, 2026 at 01:48:41PM +0900, Sergey Senozhatsky wrote:
> Currently, zsmalloc creates kmem_cache of handles and zspages
> for each pool, which may be suboptimal from the memory usage
> point of view (extra internal fragmentation per pool). Systems
> that create multiple zsmalloc pools may benefit from shared
> common zsmalloc caches.
I had a similar patch internally when we had 32 zsmalloc pools with
zswap.
You can calculate the savings by using /proc/slabinfo. The unused memory
is (num_objs-active_objs)*objsize. You can sum this across all caches
when you have multiple pools, and compare it to the unused memory with a
single cache.
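
For instance, an untested userspace illustration (not part of the patch;
assumes the usual /proc/slabinfo column order of name, active_objs,
num_objs, objsize) that sums the waste across all zs_handle*/zspage*
caches, whether per-pool or global:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512], name[64];
	unsigned long active, num, objsize, unused = 0;
	FILE *f = fopen("/proc/slabinfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* header and version lines fail to parse and are skipped */
		if (sscanf(line, "%63s %lu %lu %lu",
			   name, &active, &num, &objsize) != 4)
			continue;
		/* matches both per-pool names (zs_handle-foo, zspage-foo)
		 * and the global ones (zs_handle, zspage) */
		if (!strncmp(name, "zs_handle", 9) || !strncmp(name, "zspage", 6))
			unused += (num - active) * objsize;
	}
	fclose(f);
	printf("unused zsmalloc cache memory: %lu bytes\n", unused);
	return 0;
}
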
>
> Make handles and zspages kmem caches global.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
> mm/zsmalloc.c | 95 ++++++++++++++++++++++-----------------------------
> 1 file changed, 40 insertions(+), 55 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 5abb8bc0956a..05ed3539aa1e 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -198,12 +198,13 @@ struct link_free {
> };
> };
>
> +static struct kmem_cache *handle_cachep;
> +static struct kmem_cache *zspage_cachep;
> +
> struct zs_pool {
> const char *name;
>
> struct size_class *size_class[ZS_SIZE_CLASSES];
> - struct kmem_cache *handle_cachep;
> - struct kmem_cache *zspage_cachep;
>
> atomic_long_t pages_allocated;
>
> @@ -376,60 +377,28 @@ static void init_deferred_free(struct zs_pool *pool) {}
> static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
> #endif
>
> -static int create_cache(struct zs_pool *pool)
> +static unsigned long cache_alloc_handle(gfp_t gfp)
> {
> - char *name;
> -
> - name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name);
> - if (!name)
> - return -ENOMEM;
> - pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE,
> - 0, 0, NULL);
> - kfree(name);
> - if (!pool->handle_cachep)
> - return -EINVAL;
> -
> - name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name);
> - if (!name)
> - return -ENOMEM;
> - pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage),
> - 0, 0, NULL);
> - kfree(name);
> - if (!pool->zspage_cachep) {
> - kmem_cache_destroy(pool->handle_cachep);
> - pool->handle_cachep = NULL;
> - return -EINVAL;
> - }
> -
> - return 0;
> -}
> + gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
>
> -static void destroy_cache(struct zs_pool *pool)
> -{
> - kmem_cache_destroy(pool->handle_cachep);
> - kmem_cache_destroy(pool->zspage_cachep);
> + return (unsigned long)kmem_cache_alloc(handle_cachep, gfp);
> }
>
> -static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
> +static void cache_free_handle(unsigned long handle)
> {
> - return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
> - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
> + kmem_cache_free(handle_cachep, (void *)handle);
> }
>
> -static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
> +static struct zspage *cache_alloc_zspage(gfp_t gfp)
> {
> - kmem_cache_free(pool->handle_cachep, (void *)handle);
> -}
> + gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
>
> -static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
> -{
> - return kmem_cache_zalloc(pool->zspage_cachep,
> - flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
> + return kmem_cache_zalloc(zspage_cachep, gfp);
> }
>
> -static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
> +static void cache_free_zspage(struct zspage *zspage)
> {
> - kmem_cache_free(pool->zspage_cachep, zspage);
> + kmem_cache_free(zspage_cachep, zspage);
> }
>
> /* class->lock(which owns the handle) synchronizes races */
> @@ -858,7 +827,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
> zpdesc = next;
> } while (zpdesc != NULL);
>
> - cache_free_zspage(pool, zspage);
> + cache_free_zspage(zspage);
>
> class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
> atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
> @@ -971,7 +940,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
> {
> int i;
> struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
> - struct zspage *zspage = cache_alloc_zspage(pool, gfp);
> + struct zspage *zspage = cache_alloc_zspage(gfp);
>
> if (!zspage)
> return NULL;
> @@ -993,7 +962,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
> zpdesc_dec_zone_page_state(zpdescs[i]);
> free_zpdesc(zpdescs[i]);
> }
> - cache_free_zspage(pool, zspage);
> + cache_free_zspage(zspage);
> return NULL;
> }
> __zpdesc_set_zsmalloc(zpdesc);
> @@ -1346,7 +1315,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
> if (unlikely(size > ZS_MAX_ALLOC_SIZE))
> return (unsigned long)ERR_PTR(-ENOSPC);
>
> - handle = cache_alloc_handle(pool, gfp);
> + handle = cache_alloc_handle(gfp);
> if (!handle)
> return (unsigned long)ERR_PTR(-ENOMEM);
>
> @@ -1370,7 +1339,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
>
> zspage = alloc_zspage(pool, class, gfp, nid);
> if (!zspage) {
> - cache_free_handle(pool, handle);
> + cache_free_handle(handle);
> return (unsigned long)ERR_PTR(-ENOMEM);
> }
>
> @@ -1450,7 +1419,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> free_zspage(pool, class, zspage);
>
> spin_unlock(&class->lock);
> - cache_free_handle(pool, handle);
> + cache_free_handle(handle);
> }
> EXPORT_SYMBOL_GPL(zs_free);
>
> @@ -2112,9 +2081,6 @@ struct zs_pool *zs_create_pool(const char *name)
> if (!pool->name)
> goto err;
>
> - if (create_cache(pool))
> - goto err;
> -
> /*
> * Iterate reversely, because, size of size_class that we want to use
> * for merging should be larger or equal to current size.
> @@ -2236,7 +2202,6 @@ void zs_destroy_pool(struct zs_pool *pool)
> kfree(class);
> }
>
> - destroy_cache(pool);
> kfree(pool->name);
> kfree(pool);
> }
> @@ -2246,10 +2211,28 @@ static int __init zs_init(void)
> {
> int rc __maybe_unused;
>
> + handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 0, 0,
> + NULL);
> + if (!handle_cachep)
> + return -ENOMEM;
> +
> + zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), 0,
> + 0, NULL);
> + if (!zspage_cachep) {
> + kmem_cache_destroy(handle_cachep);
> + handle_cachep = NULL;
> + return -ENOMEM;
> + }
> +
> #ifdef CONFIG_COMPACTION
> rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
> - if (rc)
> + if (rc) {
> + kmem_cache_destroy(zspage_cachep);
> + kmem_cache_destroy(handle_cachep);
> + zspage_cachep = NULL;
> + handle_cachep = NULL;
> return rc;
> + }
> #endif
> zs_stat_init();
> return 0;
> @@ -2261,6 +2244,8 @@ static void __exit zs_exit(void)
> set_movable_ops(NULL, PGTY_zsmalloc);
> #endif
> zs_stat_exit();
> + kmem_cache_destroy(zspage_cachep);
> + kmem_cache_destroy(handle_cachep);
> }
Hmm instead of the repeated kmem_cache_destroy() calls, can we do sth
like this:
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index dccb88d52c07..86e2ca95ac4c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2235,14 +2235,43 @@ void zs_destroy_pool(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
+static void __init zs_destroy_caches(void)
+{
+ kmem_cache_destroy(zs_handle_cache);
+ zs_handle_cache = NULL;
+ kmem_cache_destroy(zspage_cache);
+ zspage_cache = NULL;
+}
+
+static int __init zs_init_caches(void)
+{
+ zs_handle_cache = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ zspage_cache = kmem_cache_create("zspage", sizeof(struct zspage),
+ 0, 0, NULL);
+
+ if (!zs_handle_cache || !zspage_cache) {
+ zs_destroy_caches();
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+
static int __init zs_init(void)
{
- int rc __maybe_unused;
+ int rc;
+
+ rc = zs_init_caches();
+ if (rc)
+ return rc;
#ifdef CONFIG_COMPACTION
rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
- if (rc)
+ if (rc) {
+ zs_destroy_caches();
return rc;
+ }
#endif
zs_stat_init();
return 0;
>
> module_init(zs_init);
> --
> 2.52.0.457.g6b5491de43-goog
>
On (26/01/16 20:49), Yosry Ahmed wrote:
> On Fri, Jan 16, 2026 at 01:48:41PM +0900, Sergey Senozhatsky wrote:
> > Currently, zsmalloc creates kmem_cache of handles and zspages
> > for each pool, which may be suboptimal from the memory usage
> > point of view (extra internal fragmentation per pool). Systems
> > that create multiple zsmalloc pools may benefit from shared
> > common zsmalloc caches.
>
> I had a similar patch internally when we had 32 zsmalloc pools with
> zswap.

Oh, nice.

> You can calculate the savings by using /proc/slabinfo. The unused memory
> is (num_objs-active_objs)*objsize. You can sum this across all caches
> when you have multiple pools, and compare it to the unused memory with a
> single cache.

Right. Just curious, do you recall any numbers?

[..]

> Hmm instead of the repeated kmem_cache_destroy() calls, can we do sth
> like this:

Sure.
On Sat, Jan 17, 2026 at 11:24:01AM +0900, Sergey Senozhatsky wrote:
> On (26/01/16 20:49), Yosry Ahmed wrote:
> > On Fri, Jan 16, 2026 at 01:48:41PM +0900, Sergey Senozhatsky wrote:
> > > Currently, zsmalloc creates kmem_cache of handles and zspages
> > > for each pool, which may be suboptimal from the memory usage
> > > point of view (extra internal fragmentation per pool). Systems
> > > that create multiple zsmalloc pools may benefit from shared
> > > common zsmalloc caches.
> >
> > I had a similar patch internally when we had 32 zsmalloc pools with
> > zswap.
>
> Oh, nice.
>
> > You can calculate the savings by using /proc/slabinfo. The unused memory
> > is (num_objs-active_objs)*objsize. You can sum this across all caches
> > when you have multiple pools, and compare it to the unused memory with a
> > single cache.
>
> Right. Just curious, do you recall any numbers?
I have the exact numbers actually, from /proc/slabinfo while running a
zswap (internal) test:
*** Before:
# name <active_objs> <num_objs> <objsize> ..
zs_handle 35637 35760 16 ...
zs_handle 35577 35760 16 ...
zs_handle 35638 35760 16 ...
zs_handle 35700 35760 16 ...
zs_handle 35937 36240 16 ...
zs_handle 35518 35760 16 ...
zs_handle 35700 36000 16 ...
zs_handle 35517 35760 16 ...
zs_handle 35818 36000 16 ...
zs_handle 35698 35760 16 ...
zs_handle 35536 35760 16 ...
zs_handle 35877 36240 16 ...
zs_handle 35757 36000 16 ...
zs_handle 35760 36000 16 ...
zs_handle 35820 36000 16 ...
zs_handle 35999 36000 16 ...
zs_handle 35700 36000 16 ...
zs_handle 35817 36000 16 ...
zs_handle 35698 36000 16 ...
zs_handle 35699 36000 16 ...
zs_handle 35580 35760 16 ...
zs_handle 35578 35760 16 ...
zs_handle 35820 36000 16 ...
zs_handle 35517 35760 16 ...
zs_handle 35700 36000 16 ...
zs_handle 35640 35760 16 ...
zs_handle 35820 36000 16 ...
zs_handle 35578 35760 16 ...
zs_handle 35578 35760 16 ...
zs_handle 35817 36000 16 ...
zs_handle 35518 35760 16 ...
zs_handle 35940 36240 16 ...
zspage 991 1079 48 ...
zspage 936 996 48 ...
zspage 940 996 48 ...
zspage 1050 1079 48 ...
zspage 973 1079 48 ...
zspage 942 996 48 ...
zspage 1065 1162 48 ...
zspage 885 996 48 ...
zspage 887 913 48 ...
zspage 1053 1079 48 ...
zspage 983 996 48 ...
zspage 966 996 48 ...
zspage 970 1079 48 ...
zspage 880 913 48 ...
zspage 1006 1079 48 ...
zspage 998 1079 48 ...
zspage 1129 1162 48 ...
zspage 903 913 48 ...
zspage 833 996 48 ...
zspage 861 913 48 ...
zspage 764 913 48 ...
zspage 898 913 48 ...
zspage 973 1079 48 ...
zspage 945 996 48 ...
zspage 943 1079 48 ...
zspage 1024 1079 48 ...
zspage 820 913 48 ...
zspage 702 830 48 ...
zspage 1049 1079 48 ...
zspage 990 1162 48 ...
zspage 988 1079 48 ...
zspage 932 996 48 ...
Unused memory = $(awk '{s += $4*($3-$2)} END {print s}') = 218416 bytes
*** After:
# name <active_objs> <num_objs> <objsize> ..
zs_handle 1054440 1054800 16 ...
zspage 5720 5810 48 ...
Unused memory = (1054800-1054440)*16 + (5810-5720)*48 = 10080 bytes
That was about ~20 times reduction in waste when using 32 pools with
zswap. I suspect we wouldn't be using that many pools with zram.
>
> [..]
> > Hmm instead of the repeated kmem_cache_destroy() calls, can we do sth
> > like this:
>
> Sure.
On (26/01/21 01:30), Yosry Ahmed wrote:
>
> That was about ~20 times reduction in waste when using 32 pools with
> zswap.

Nice, thanks!

> I suspect we wouldn't be using that many pools with zram.

Hard to tell. We have users that setup many zram devices, even multiple
swap zram devices. In terms of numbers - I think swap zram users still
setup less devices than users that use zram as a normal block device
(that number in theory can be quite high, depending on use case).
On (26/01/16 13:48), Sergey Senozhatsky wrote:
> Currently, zsmalloc creates kmem_cache of handles and zspages
> for each pool, which may be suboptimal from the memory usage
> point of view (extra internal fragmentation per pool). Systems
> that create multiple zsmalloc pools may benefit from shared
> common zsmalloc caches.

This is step 1.

Step 2 is to look into possibility of sharing zsmalloc pools.
E.g. if there are N zram devices in the system, do we really need
N zsmalloc pools? Can we just share a single pool between them?
On Thu, Jan 15, 2026 at 9:53 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> On (26/01/16 13:48), Sergey Senozhatsky wrote:
> > Currently, zsmalloc creates kmem_cache of handles and zspages
> > for each pool, which may be suboptimal from the memory usage
> > point of view (extra internal fragmentation per pool). Systems
> > that create multiple zsmalloc pools may benefit from shared
> > common zsmalloc caches.
>
> This is step 1.
>
> Step 2 is to look into possibility of sharing zsmalloc pools.
> E.g. if there are N zram devices in the system, do we really need
> N zsmalloc pools? Can we just share a single pool between them?

Ditto for zswap (although here, we almost always only have a single zswap pool).
On (26/01/19 13:44), Nhat Pham wrote:
> On Thu, Jan 15, 2026 at 9:53 PM Sergey Senozhatsky
> <senozhatsky@chromium.org> wrote:
> >
> > On (26/01/16 13:48), Sergey Senozhatsky wrote:
> > > Currently, zsmalloc creates kmem_cache of handles and zspages
> > > for each pool, which may be suboptimal from the memory usage
> > > point of view (extra internal fragmentation per pool). Systems
> > > that create multiple zsmalloc pools may benefit from shared
> > > common zsmalloc caches.
> >
> > This is step 1.
> >
> > Step 2 is to look into possibility of sharing zsmalloc pools.
> > E.g. if there are N zram devices in the system, do we really need
> > N zsmalloc pools? Can we just share a single pool between them?
>
> Ditto for zswap (although here, we almost always only have a single zswap pool).
COMPLETELY UNTESTED (current linux-next doesn't boot for me, hitting
an "Oops: stack guard page: 0000" early during boot).
So I'm thinking of something like below. Basically have a Kconfig
option to turn zsmalloc into a singleton pool mode, transparently
for zsmalloc users.
---
mm/Kconfig | 11 ++++++++
mm/zsmalloc.c | 73 ++++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 72 insertions(+), 12 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4fc1a171dffa..ff6855e74c3d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -132,6 +132,17 @@ menu "Zsmalloc allocator options"
comment "Zsmalloc is a common backend allocator for zswap & zram"
+config ZSMALLOC_SINGLETON_POOL
+ bool "Use a singleton zsmalloc pool"
+ default n
+ help
+ This option enables the use of a single global zsmalloc pool
+ instance for all users of zsmalloc (e.g., zswap, zram). This
+ reduces memory overhead and fragmentation by sharing size class
+ configurations and memory between different users.
+
+ If unsure, say N.
+
config ZSMALLOC_STAT
bool "Export zsmalloc statistics"
select DEBUG_FS
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 8df45aa1b5c8..acd14b001342 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -224,6 +224,10 @@ struct zs_pool {
atomic_t compaction_in_progress;
};
+#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
+static struct zs_pool *zs_singleton_pool;
+#endif
+
static inline void zpdesc_set_first(struct zpdesc *zpdesc)
{
SetPagePrivate(zpdesc_page(zpdesc));
@@ -2051,17 +2055,7 @@ static int calculate_zspage_chain_size(int class_size)
return chain_size;
}
-/**
- * zs_create_pool - Creates an allocation pool to work from.
- * @name: pool name to be created
- *
- * This function must be called before anything when using
- * the zsmalloc allocator.
- *
- * On success, a pointer to the newly created pool is returned,
- * otherwise NULL.
- */
-struct zs_pool *zs_create_pool(const char *name)
+static struct zs_pool *__zs_create_pool(const char *name)
{
int i;
struct zs_pool *pool;
@@ -2170,9 +2164,29 @@ struct zs_pool *zs_create_pool(const char *name)
zs_destroy_pool(pool);
return NULL;
}
+
+/**
+ * zs_create_pool - Creates an allocation pool to work from.
+ * @name: pool name to be created
+ *
+ * This function must be called before anything when using
+ * the zsmalloc allocator.
+ *
+ * On success, a pointer to the newly created pool is returned,
+ * otherwise NULL.
+ */
+struct zs_pool *zs_create_pool(const char *name)
+{
+#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
+ return zs_singleton_pool;
+#else
+ return __zs_create_pool(name);
+#endif
+
+}
EXPORT_SYMBOL_GPL(zs_create_pool);
-void zs_destroy_pool(struct zs_pool *pool)
+static void __zs_destroy_pool(struct zs_pool *pool)
{
int i;
@@ -2203,8 +2217,35 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(pool->name);
kfree(pool);
}
+
+void zs_destroy_pool(struct zs_pool *pool __maybe_unused)
+{
+#ifndef CONFIG_ZSMALLOC_SINGLETON_POOL
+ __zs_destroy_pool(pool);
+#endif
+}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
+static void zs_destroy_singleton_pool(void)
+{
+#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
+ if (zs_singleton_pool) {
+ __zs_destroy_pool(zs_singleton_pool);
+ zs_singleton_pool = NULL;
+ }
+#endif
+}
+
+static int zs_create_singleton_pool(void)
+{
+#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
+ zs_singleton_pool = __zs_create_pool("zsmalloc");
+ if (!zs_singleton_pool)
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
static void zs_destroy_caches(void)
{
kmem_cache_destroy(handle_cachep);
@@ -2235,9 +2276,16 @@ static int __init zs_init(void)
if (rc)
return rc;
+ rc = zs_create_singleton_pool();
+ if (rc) {
+ zs_destroy_caches();
+ return rc;
+ }
+
#ifdef CONFIG_COMPACTION
rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
if (rc) {
+ zs_destroy_singleton_pool();
zs_destroy_caches();
return rc;
}
@@ -2252,6 +2300,7 @@ static void __exit zs_exit(void)
set_movable_ops(NULL, PGTY_zsmalloc);
#endif
zs_stat_exit();
+ zs_destroy_singleton_pool();
zs_destroy_caches();
}
--
2.52.0.457.g6b5491de43-goog
On Wed, Jan 21, 2026 at 12:41:39PM +0900, Sergey Senozhatsky wrote:
> On (26/01/19 13:44), Nhat Pham wrote:
> > On Thu, Jan 15, 2026 at 9:53 PM Sergey Senozhatsky
> > <senozhatsky@chromium.org> wrote:
> > >
> > > On (26/01/16 13:48), Sergey Senozhatsky wrote:
> > > > Currently, zsmalloc creates kmem_cache of handles and zspages
> > > > for each pool, which may be suboptimal from the memory usage
> > > > point of view (extra internal fragmentation per pool). Systems
> > > > that create multiple zsmalloc pools may benefit from shared
> > > > common zsmalloc caches.
> > >
> > > This is step 1.
> > >
> > > Step 2 is to look into possibility of sharing zsmalloc pools.
> > > E.g. if there are N zram devices in the system, do we really need
> > > N zsmalloc pools? Can we just share a single pool between them?
> >
> > Ditto for zswap (although here, we almost always only have a single zswap pool).
>
> COMPLETELY UNTESTED (current linux-next doesn't boot for me, hitting
> an "Oops: stack guard page: 0000" early during boot).
>
> So I'm thinking of something like below. Basically have a Kconfig
> option to turn zsmalloc into a singleton pool mode, transparently
> for zsmalloc users.
Why do we need a config option? Is the main concern with a single pool
lock contention? If yes, we can probably measure it by spawning many
zram devices and stressing them at the same time.
>
> ---
> mm/Kconfig | 11 ++++++++
> mm/zsmalloc.c | 73 ++++++++++++++++++++++++++++++++++++++++++---------
> 2 files changed, 72 insertions(+), 12 deletions(-)
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 4fc1a171dffa..ff6855e74c3d 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -132,6 +132,17 @@ menu "Zsmalloc allocator options"
>
> comment "Zsmalloc is a common backend allocator for zswap & zram"
>
> +config ZSMALLOC_SINGLETON_POOL
> + bool "Use a singleton zsmalloc pool"
> + default n
> + help
> + This option enables the use of a single global zsmalloc pool
> + instance for all users of zsmalloc (e.g., zswap, zram). This
> + reduces memory overhead and fragmentation by sharing size class
> + configurations and memory between different users.
> +
> + If unsure, say N.
> +
> config ZSMALLOC_STAT
> bool "Export zsmalloc statistics"
> select DEBUG_FS
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 8df45aa1b5c8..acd14b001342 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -224,6 +224,10 @@ struct zs_pool {
> atomic_t compaction_in_progress;
> };
>
> +#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
> +static struct zs_pool *zs_singleton_pool;
> +#endif
> +
> static inline void zpdesc_set_first(struct zpdesc *zpdesc)
> {
> SetPagePrivate(zpdesc_page(zpdesc));
> @@ -2051,17 +2055,7 @@ static int calculate_zspage_chain_size(int class_size)
> return chain_size;
> }
>
> -/**
> - * zs_create_pool - Creates an allocation pool to work from.
> - * @name: pool name to be created
> - *
> - * This function must be called before anything when using
> - * the zsmalloc allocator.
> - *
> - * On success, a pointer to the newly created pool is returned,
> - * otherwise NULL.
> - */
> -struct zs_pool *zs_create_pool(const char *name)
> +static struct zs_pool *__zs_create_pool(const char *name)
> {
> int i;
> struct zs_pool *pool;
> @@ -2170,9 +2164,29 @@ struct zs_pool *zs_create_pool(const char *name)
> zs_destroy_pool(pool);
> return NULL;
> }
> +
> +/**
> + * zs_create_pool - Creates an allocation pool to work from.
> + * @name: pool name to be created
> + *
> + * This function must be called before anything when using
> + * the zsmalloc allocator.
> + *
> + * On success, a pointer to the newly created pool is returned,
> + * otherwise NULL.
> + */
> +struct zs_pool *zs_create_pool(const char *name)
> +{
> +#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
> + return zs_singleton_pool;
> +#else
> + return __zs_create_pool(name);
> +#endif
> +
> +}
> EXPORT_SYMBOL_GPL(zs_create_pool);
>
> -void zs_destroy_pool(struct zs_pool *pool)
> +static void __zs_destroy_pool(struct zs_pool *pool)
> {
> int i;
>
> @@ -2203,8 +2217,35 @@ void zs_destroy_pool(struct zs_pool *pool)
> kfree(pool->name);
> kfree(pool);
> }
> +
> +void zs_destroy_pool(struct zs_pool *pool __maybe_unused)
> +{
> +#ifndef CONFIG_ZSMALLOC_SINGLETON_POOL
> + __zs_destroy_pool(pool);
> +#endif
> +}
> EXPORT_SYMBOL_GPL(zs_destroy_pool);
>
> +static void zs_destroy_singleton_pool(void)
> +{
> +#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
> + if (zs_singleton_pool) {
> + __zs_destroy_pool(zs_singleton_pool);
> + zs_singleton_pool = NULL;
> + }
> +#endif
> +}
> +
> +static int zs_create_singleton_pool(void)
> +{
> +#ifdef CONFIG_ZSMALLOC_SINGLETON_POOL
> + zs_singleton_pool = __zs_create_pool("zsmalloc");
> + if (!zs_singleton_pool)
> + return -ENOMEM;
> +#endif
> + return 0;
> +}
> +
> static void zs_destroy_caches(void)
> {
> kmem_cache_destroy(handle_cachep);
> @@ -2235,9 +2276,16 @@ static int __init zs_init(void)
> if (rc)
> return rc;
>
> + rc = zs_create_singleton_pool();
> + if (rc) {
> + zs_destroy_caches();
> + return rc;
> + }
> +
> #ifdef CONFIG_COMPACTION
> rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
> if (rc) {
> + zs_destroy_singleton_pool();
> zs_destroy_caches();
> return rc;
> }
> @@ -2252,6 +2300,7 @@ static void __exit zs_exit(void)
> set_movable_ops(NULL, PGTY_zsmalloc);
> #endif
> zs_stat_exit();
> + zs_destroy_singleton_pool();
> zs_destroy_caches();
> }
>
> --
> 2.52.0.457.g6b5491de43-goog
>
On (26/01/21 23:58), Yosry Ahmed wrote:
> On Wed, Jan 21, 2026 at 12:41:39PM +0900, Sergey Senozhatsky wrote:
> > On (26/01/19 13:44), Nhat Pham wrote:
> > > On Thu, Jan 15, 2026 at 9:53 PM Sergey Senozhatsky
> > > <senozhatsky@chromium.org> wrote:
> > > >
> > > > On (26/01/16 13:48), Sergey Senozhatsky wrote:
> > > > > Currently, zsmalloc creates kmem_cache of handles and zspages
> > > > > for each pool, which may be suboptimal from the memory usage
> > > > > point of view (extra internal fragmentation per pool). Systems
> > > > > that create multiple zsmalloc pools may benefit from shared
> > > > > common zsmalloc caches.
> > > >
> > > > This is step 1.
> > > >
> > > > Step 2 is to look into possibility of sharing zsmalloc pools.
> > > > E.g. if there are N zram devices in the system, do we really need
> > > > N zsmalloc pools? Can we just share a single pool between them?
> > >
> > > Ditto for zswap (although here, we almost always only have a single zswap pool).
> >
> > COMPLETELY UNTESTED (current linux-next doesn't boot for me, hitting
> > an "Oops: stack guard page: 0000" early during boot).
> >
> > So I'm thinking of something like below. Basically have a Kconfig
> > option to turn zsmalloc into a singleton pool mode, transparently
> > for zsmalloc users.
>
> Why do we need a config option? Is the main concern with a single pool
> lock contention? If yes, we can probably measure it by spawning many
> zram devices and stressing them at the same time.

That's a good question. I haven't thought about just converting
zsmalloc to a singleton pool by default. I don't think I'm
concerned with lock contention, the thing is we should have the
same upper boundary contention wise (there are only num_online_cpus()
tasks that can concurrently access any zsmalloc pool, be it a singleton
or not). I certainly will try to measure once I have linux-next booting
again.

What was the reason why you allocated many zsmalloc pool in zswap?
On Thu, Jan 22, 2026 at 12:28:56PM +0900, Sergey Senozhatsky wrote:
> On (26/01/21 23:58), Yosry Ahmed wrote:
> > On Wed, Jan 21, 2026 at 12:41:39PM +0900, Sergey Senozhatsky wrote:
> > > On (26/01/19 13:44), Nhat Pham wrote:
> > > > On Thu, Jan 15, 2026 at 9:53 PM Sergey Senozhatsky
> > > > <senozhatsky@chromium.org> wrote:
> > > > >
> > > > > On (26/01/16 13:48), Sergey Senozhatsky wrote:
> > > > > > Currently, zsmalloc creates kmem_cache of handles and zspages
> > > > > > for each pool, which may be suboptimal from the memory usage
> > > > > > point of view (extra internal fragmentation per pool). Systems
> > > > > > that create multiple zsmalloc pools may benefit from shared
> > > > > > common zsmalloc caches.
> > > > >
> > > > > This is step 1.
> > > > >
> > > > > Step 2 is to look into possibility of sharing zsmalloc pools.
> > > > > E.g. if there are N zram devices in the system, do we really need
> > > > > N zsmalloc pools? Can we just share a single pool between them?
> > > >
> > > > Ditto for zswap (although here, we almost always only have a single zswap pool).
> > >
> > > COMPLETELY UNTESTED (current linux-next doesn't boot for me, hitting
> > > an "Oops: stack guard page: 0000" early during boot).
> > >
> > > So I'm thinking of something like below. Basically have a Kconfig
> > > option to turn zsmalloc into a singleton pool mode, transparently
> > > for zsmalloc users.
> >
> > Why do we need a config option? Is the main concern with a single pool
> > lock contention? If yes, we can probably measure it by spawning many
> > zram devices and stressing them at the same time.
>
> That's a good question. I haven't thought about just converting
> zsmalloc to a singleton pool by default. I don't think I'm
> concerned with lock contention, the thing is we should have the
> same upper boundary contention wise (there are only num_online_cpus()
> tasks that can concurrently access any zsmalloc pool, be it a singleton
> or not). I certainly will try to measure once I have linux-next booting
> again.
>
> What was the reason why you allocated many zsmalloc pool in zswap?

IIRC it was actually lock contention, specifically the pool spinlock.
When the change was made to per-class spinlocks, we dropped the multiple
pools:
http://lore.kernel.org/linux-mm/20240617-zsmalloc-lock-mm-everything-v1-0-5e5081ea11b3@linux.dev/.

So having multiple pools does mitigate lock contention in some cases.
Even though the upper boundary might be the same, the actual number of
CPUs contending on the same lock would go down in practice.

While looking for this, I actually found something more interesting. I
did propose more-or-less the same exact patch back when zswap used
multiple pools:
https://lore.kernel.org/all/20240604175340.218175-1-yosryahmed@google.com/.

Seems like Minchan had some concerns back then. I wonder if those still
apply.
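
For context, the per-class locking that replaced the pool-wide spinlock looks
roughly like the sketch below (field names and layout are approximate, not the
exact upstream structs). Concurrent zs_malloc()/zs_free() callers only contend
when they hit the same size class, which is why splitting traffic across pools
can still reduce contention in practice even though the global upper bound on
concurrent callers stays the same.

/* approximate layout, for illustration only */
struct size_class {
	spinlock_t lock;	/* protects this class's zspage lists */
	struct list_head fullness_list[NR_FULLNESS_GROUPS];
	int size;		/* object size this class serves */
	int objs_per_zspage;
	int pages_per_zspage;
};

struct zs_pool {
	const char *name;
	struct size_class *size_class[ZS_SIZE_CLASSES];
	atomic_long_t pages_allocated;
	/* no pool-wide allocation lock since the per-class lock conversion */
};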
On (26/01/22 03:39), Yosry Ahmed wrote:
[..]
> > That's a good question. I haven't thought about just converting
> > zsmalloc to a singleton pool by default. I don't think I'm
> > concerned with lock contention, the thing is we should have the
> > same upper boundary contention wise (there are only num_online_cpus()
> > tasks that can concurrently access any zsmalloc pool, be it a singleton
> > or not). I certainly will try to measure once I have linux-next booting
> > again.
> >
> > What was the reason why you allocated many zsmalloc pool in zswap?
>
> IIRC it was actually lock contention, specifically the pool spinlock.
> When the change was made to per-class spinlocks, we dropped the multiple
> pools:
> http://lore.kernel.org/linux-mm/20240617-zsmalloc-lock-mm-everything-v1-0-5e5081ea11b3@linux.dev/.
>
> So having multiple pools does mitigate lock contention in some cases.
> Even though the upper boundary might be the same, the actual number of
> CPUs contending on the same lock would go down in practice.
>
> While looking for this, I actually found something more interesting. I
> did propose more-or-less the same exact patch back when zswap used
> multiple pools:
> https://lore.kernel.org/all/20240604175340.218175-1-yosryahmed@google.com/.
>
> Seems like Minchan had some concerns back then. I wonder if those still
> apply.

Interesting. Lifecycles are completely random, I don't see how we can
make any assumptions about them and how we can rely on them to
avoid/control fragmentation. I think we should have global caches.