This patch sets up zswap to allocate per-CPU resources optimally for
both batching and non-batching compressors.
A new ZSWAP_MAX_BATCH_SIZE constant is defined as 8U, to set an upper
limit on the number of pages in large folios that will be batch
compressed.
As per Herbert's comments in [2], in response to the
crypto_acomp_batch_compress() and crypto_acomp_batch_decompress() API
proposed in [1], this series does not create a new crypto_acomp batching
API. Instead, zswap compression batching uses the existing
crypto_acomp_compress() API in combination with the "void *kernel_data"
member added to "struct acomp_req" earlier in this series.
It is up to the compressor to manage multiple requests, as needed, to
accomplish batch parallelism. zswap only needs to allocate the per-CPU
dst buffers according to the batch size supported by the compressor.
A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
proceeds to allocate the necessary compression dst buffers in the
per-CPU acomp_ctx.
Another "u8 batch_size" member is added to "struct zswap_pool" to store
the unit for batching large folio stores: for batching compressors, this
is the pool->compr_batch_size. For non-batching compressors, this is
ZSWAP_MAX_BATCH_SIZE.
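To make the selection rule concrete, a minimal standalone sketch follows
(an illustration only, not kernel code; compressor_batch_size() is a
hypothetical stand-in for crypto_acomp_batch_size()):

#include <stdio.h>

#define ZSWAP_MAX_BATCH_SIZE 8U

/* Hypothetical stand-in for crypto_acomp_batch_size(): a sequential
 * compressor reports 1; a batching compressor reports its own batch size. */
static unsigned int compressor_batch_size(void)
{
	return 1;
}

int main(void)
{
	unsigned int compr_batch_size, batch_size;

	/* pool->compr_batch_size: how many dst buffers to allocate per CPU,
	 * capped at ZSWAP_MAX_BATCH_SIZE. */
	compr_batch_size = compressor_batch_size();
	if (compr_batch_size > ZSWAP_MAX_BATCH_SIZE)
		compr_batch_size = ZSWAP_MAX_BATCH_SIZE;

	/* pool->batch_size: the unit for carving up large folio stores.
	 * Sequential compressors still batch the store path in units of
	 * ZSWAP_MAX_BATCH_SIZE for cache locality. */
	batch_size = compr_batch_size > 1 ? compr_batch_size
					  : ZSWAP_MAX_BATCH_SIZE;

	printf("compr_batch_size=%u batch_size=%u\n",
	       compr_batch_size, batch_size);
	return 0;
}

With a sequential compressor this prints compr_batch_size=1 batch_size=8;
a compressor reporting 8 yields 8 for both.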
zswap does not use more than one dst buffer yet. Follow-up patches will
utilize the multiple acomp_ctx buffers for batch
compression/decompression of multiple pages.
Thus, ZSWAP_MAX_BATCH_SIZE limits the amount of extra memory used for
batching. There is a small extra memory overhead of allocating
the acomp_ctx->buffers array for compressors that do not support
batching: on x86_64, the overhead is one pointer per CPU (i.e., 8 bytes).
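As a worked example of the batching-side cost (assuming 4KB pages): a
compressor with a batch size of 8 gets 8 dst buffers of 2 * PAGE_SIZE
each, i.e. 8 * 8KB = 64KB per CPU, plus the 8-entry pointer array,
versus the single 8KB buffer allocated today.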
[1]: https://patchwork.kernel.org/project/linux-mm/patch/20250508194134.28392-11-kanchana.p.sridhar@intel.com/
[2]: https://patchwork.kernel.org/comment/26382610
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
mm/zswap.c | 82 +++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 63 insertions(+), 19 deletions(-)
diff --git a/mm/zswap.c b/mm/zswap.c
index efd501a7fe294..63a997b999537 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -80,6 +80,9 @@ static bool zswap_pool_reached_full;
#define ZSWAP_PARAM_UNSET ""
+/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
+#define ZSWAP_MAX_BATCH_SIZE 8U
+
static int zswap_setup(void);
/* Enable/disable zswap */
@@ -147,7 +150,7 @@ struct crypto_acomp_ctx {
struct crypto_acomp *acomp;
struct acomp_req *req;
struct crypto_wait wait;
- u8 *buffer;
+ u8 **buffers;
struct mutex mutex;
bool is_sleepable;
};
@@ -166,6 +169,8 @@ struct zswap_pool {
struct work_struct release_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
+ u8 compr_batch_size;
+ u8 batch_size;
};
/* Global LRU lists shared by all zswap pools. */
@@ -258,8 +263,10 @@ static void __zswap_pool_empty(struct percpu_ref *ref);
* zswap_cpu_comp_prepare(), not others.
* - Cleanup acomp_ctx resources on all cores in zswap_pool_destroy().
*/
-static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
+static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
{
+ u8 i;
+
if (IS_ERR_OR_NULL(acomp_ctx))
return;
@@ -269,7 +276,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
+ if (acomp_ctx->buffers) {
+ for (i = 0; i < nr_buffers; ++i)
+ kfree(acomp_ctx->buffers[i]);
+ kfree(acomp_ctx->buffers);
+ }
}
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
@@ -290,6 +301,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
return NULL;
}
+ /* Many things rely on the zero-initialization. */
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool)
return NULL;
@@ -352,13 +364,28 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto ref_fail;
INIT_LIST_HEAD(&pool->list);
+ /*
+ * Set the unit of compress batching for large folios, for quick
+ * retrieval in the zswap_compress() fast path:
+ * If the compressor is sequential (@pool->compr_batch_size is 1),
+ * large folios will be compressed in batches of ZSWAP_MAX_BATCH_SIZE
+ * pages, where each page in the batch is compressed sequentially.
+ * We see better performance by processing the folio in batches of
+ * ZSWAP_MAX_BATCH_SIZE, due to cache locality of working set
+ * structures.
+ */
+ pool->batch_size = (pool->compr_batch_size > 1) ?
+ pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
+
zswap_pool_debug("created", pool);
return pool;
ref_fail:
for_each_possible_cpu(cpu)
- acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
+ acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
+ pool->compr_batch_size);
+
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
@@ -417,7 +444,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
zswap_pool_debug("destroying", pool);
for_each_possible_cpu(cpu)
- acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
+ acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
+ pool->compr_batch_size);
free_percpu(pool->acomp_ctx);
@@ -876,6 +904,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
int ret = -ENOMEM;
+ u8 i;
/*
* The per-CPU pool->acomp_ctx is zero-initialized on allocation.
@@ -888,10 +917,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
return 0;
- acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return ret;
-
acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
pr_err("could not alloc crypto acomp %s : %ld\n",
@@ -904,17 +929,36 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
if (IS_ERR_OR_NULL(acomp_ctx->req)) {
pr_err("could not alloc crypto acomp_request %s\n",
- pool->tfm_name);
+ pool->tfm_name);
goto fail;
}
- crypto_init_wait(&acomp_ctx->wait);
+ /*
+ * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
+ * compressor supports batching.
+ */
+ pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
+ crypto_acomp_batch_size(acomp_ctx->acomp));
+
+ acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!acomp_ctx->buffers)
+ goto fail;
+
+ for (i = 0; i < pool->compr_batch_size; ++i) {
+ acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!acomp_ctx->buffers[i])
+ goto fail;
+ }
/*
* if the backend of acomp is async zip, crypto_req_done() will wakeup
* crypto_wait_req(); if the backend of acomp is scomp, the callback
* won't be called, crypto_wait_req() will return without blocking.
*/
+ crypto_init_wait(&acomp_ctx->wait);
+
acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
@@ -922,7 +966,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
return 0;
fail:
- acomp_ctx_dealloc(acomp_ctx);
+ acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
return ret;
}
@@ -942,7 +986,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
mutex_lock(&acomp_ctx->mutex);
- dst = acomp_ctx->buffer;
+ dst = acomp_ctx->buffers[0];
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -1003,19 +1047,19 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
mutex_lock(&acomp_ctx->mutex);
- obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
+ obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffers[0]);
/*
* zpool_obj_read_begin() might return a kmap address of highmem when
- * acomp_ctx->buffer is not used. However, sg_init_one() does not
- * handle highmem addresses, so copy the object to acomp_ctx->buffer.
+ * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
+ * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
*/
if (virt_addr_valid(obj)) {
src = obj;
} else {
- WARN_ON_ONCE(obj == acomp_ctx->buffer);
- memcpy(acomp_ctx->buffer, obj, entry->length);
- src = acomp_ctx->buffer;
+ WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
+ memcpy(acomp_ctx->buffers[0], obj, entry->length);
+ src = acomp_ctx->buffers[0];
}
sg_init_one(&input, src, entry->length);
--
2.27.0
Hi Kanchana,

[...]

> +	/*
> +	 * Set the unit of compress batching for large folios, for quick
> +	 * retrieval in the zswap_compress() fast path:
> +	 * If the compressor is sequential (@pool->compr_batch_size is 1),
> +	 * large folios will be compressed in batches of ZSWAP_MAX_BATCH_SIZE
> +	 * pages, where each page in the batch is compressed sequentially.
> +	 * We see better performance by processing the folio in batches of
> +	 * ZSWAP_MAX_BATCH_SIZE, due to cache locality of working set
> +	 * structures.
> +	 */
> +	pool->batch_size = (pool->compr_batch_size > 1) ?
> +			   pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
> +
> 	zswap_pool_debug("created", pool);
>
> 	return pool;

It’s hard to follow — you add batch_size and compr_batch_size in this
patch, but only use them in another. Could we merge the related changes
into one patch instead of splitting them into several that don’t work
independently?

> -
> 	acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> 	if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
> 		pr_err("could not alloc crypto acomp %s : %ld\n",
> @@ -904,17 +929,36 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> 	acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
> 	if (IS_ERR_OR_NULL(acomp_ctx->req)) {
> 		pr_err("could not alloc crypto acomp_request %s\n",
> -		       pool->tfm_name);
> +			pool->tfm_name);
> 		goto fail;
> 	}
>
> -	crypto_init_wait(&acomp_ctx->wait);
> +	/*
> +	 * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
> +	 * compressor supports batching.
> +	 */
> +	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> +				     crypto_acomp_batch_size(acomp_ctx->acomp));
> +
> +	acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
> +					  GFP_KERNEL, cpu_to_node(cpu));
> +	if (!acomp_ctx->buffers)
> +		goto fail;
> +
> +	for (i = 0; i < pool->compr_batch_size; ++i) {
> +		acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
> +						     cpu_to_node(cpu));
> +		if (!acomp_ctx->buffers[i])
> +			goto fail;
> +	}

It’s hard to follow — memory is allocated here but only used in another
patch. Could we merge the related changes into a single patch instead of
splitting them into several that don’t work independently?

>
> 	/*
> 	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
> 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
> 	 * won't be called, crypto_wait_req() will return without blocking.
> 	 */
> +	crypto_init_wait(&acomp_ctx->wait);
> +
> 	acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> 				   crypto_req_done, &acomp_ctx->wait);
>
> @@ -922,7 +966,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> 	return 0;
>
> fail:
> -	acomp_ctx_dealloc(acomp_ctx);
> +	acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
> 	return ret;
> }
>
> @@ -942,7 +986,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>
> 	mutex_lock(&acomp_ctx->mutex);
>
> -	dst = acomp_ctx->buffer;
> +	dst = acomp_ctx->buffers[0];
> 	sg_init_table(&input, 1);
> 	sg_set_page(&input, page, PAGE_SIZE, 0);
>
> @@ -1003,19 +1047,19 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>
> 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> 	mutex_lock(&acomp_ctx->mutex);
> -	obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
> +	obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffers[0]);
>
> 	/*
> 	 * zpool_obj_read_begin() might return a kmap address of highmem when
> -	 * acomp_ctx->buffer is not used. However, sg_init_one() does not
> -	 * handle highmem addresses, so copy the object to acomp_ctx->buffer.
> +	 * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
> +	 * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
> 	 */
> 	if (virt_addr_valid(obj)) {
> 		src = obj;
> 	} else {
> -		WARN_ON_ONCE(obj == acomp_ctx->buffer);
> -		memcpy(acomp_ctx->buffer, obj, entry->length);
> -		src = acomp_ctx->buffer;
> +		WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
> +		memcpy(acomp_ctx->buffers[0], obj, entry->length);
> +		src = acomp_ctx->buffers[0];

Hard to understand what is going on if related changes are not kept in
one self-contained patch.

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Monday, August 25, 2025 8:48 PM
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> Hi Kanchana,
>
> [...]
>
> > +	pool->batch_size = (pool->compr_batch_size > 1) ?
> > +			   pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
>
> It’s hard to follow — you add batch_size and compr_batch_size in this
> patch, but only use them in another. Could we merge the related changes
> into one patch instead of splitting them into several that don’t work
> independently?

Hi Barry,

Thanks for reviewing the code and for your comments! Sure, I can merge
this patch with the next one. I was trying to keep the changes modularized
to a) zswap_cpu_comp_prepare(), b) zswap_store() and c) zswap_compress()
so the changes are broken into smaller parts, but I can see how this can
make the changes appear disjointed.

One thing though: the commit logs for each of the patches will
also probably need to be merged, since I have tried to explain the
changes in detail.

Thanks,
Kanchana

> [...]
>
> It’s hard to follow — memory is allocated here but only used in another
> patch. Could we merge the related changes into a single patch instead of
> splitting them into several that don’t work independently?
>
> [...]
>
> Hard to understand what is going on if related changes are not kept in
> one self-contained patch.
>
> Thanks
> Barry
On Tue, Aug 26, 2025 at 12:27 PM Sridhar, Kanchana P
<kanchana.p.sridhar@intel.com> wrote:
>
> [...]
>
> Hi Barry,
>
> Thanks for reviewing the code and for your comments! Sure, I can merge
> this patch with the next one. I was trying to keep the changes modularized
> to a) zswap_cpu_comp_prepare(), b) zswap_store() and c) zswap_compress()
> so the changes are broken into smaller parts, but I can see how this can
> make the changes appear disjointed.
>
> One thing though: the commit logs for each of the patches will
> also probably need to be merged, since I have tried to explain the
> changes in detail.

It’s fine to merge the changelog and present the story as a whole. Do we
really need both pool->batch_size and pool->compr_batch_size? I assume
pool->batch_size = pool->compr_batch_size if HW supports batch; otherwise
pool->compr_batch_size = 1. It seems pool->compr_batch_size should either
be a bool or be dropped. If we drop it, you can still check whether HW
supports batch via crypto_acomp_batch_size() when doing compression:

if (crypto_acomp_batch_size() > 1)
	compress in steps of PAGE_SIZE;
else
	compress in steps of batch_size;

no ?

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Monday, August 25, 2025 9:42 PM
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> [...]
>
> It’s fine to merge the changelog and present the story as a whole. Do we

Sure.

> really need both pool->batch_size and pool->compr_batch_size? I assume
> pool->batch_size = pool->compr_batch_size if HW supports batch; otherwise
> pool->compr_batch_size = 1.

Actually not exactly. We have found value in compressing in batches of
ZSWAP_MAX_BATCH_SIZE even for software compressors. Latency benefits
from cache locality of working-set data. Hence the approach that we have
settled on is pool->batch_size = ZSWAP_MAX_BATCH_SIZE if the compressor
does not support batching (i.e., if pool->compr_batch_size is 1). If it
does, then pool->batch_size = pool->compr_batch_size.

Besides this, pool->compr_batch_size gives zswap_compress() the number of
acomp_ctx resources available, as distinct from the batch size.

> It seems pool->compr_batch_size should either
> be a bool or be dropped. If we drop it, you can still check whether HW
> supports batch via crypto_acomp_batch_size() when doing compression:
>
> if (crypto_acomp_batch_size() > 1)
> 	compress in steps of PAGE_SIZE;
> else
> 	compress in steps of batch_size;
>
> no ?

I could do this, but it will impact latency. As I was mentioning in an
earlier response to Nhat, this (keeping compr_batch_size and batch_size
distinct) was a small memory overhead per zswap_pool (one more u8 data
member), given that there are very few zswap pools. For this trade-off,
we are able to minimize computes in zswap_compress().

Thanks,
Kanchana
> > [...]
>
> > really need both pool->batch_size and pool->compr_batch_size? I assume
> > pool->batch_size = pool->compr_batch_size if HW supports batch; otherwise
> > pool->compr_batch_size = 1.
>
> Actually not exactly. We have found value in compressing in batches of
> ZSWAP_MAX_BATCH_SIZE even for software compressors. Latency benefits
> from cache locality of working-set data. Hence the approach that we have
> settled on is pool->batch_size = ZSWAP_MAX_BATCH_SIZE if the compressor
> does not support batching (i.e., if pool->compr_batch_size is 1). If it
> does, then pool->batch_size = pool->compr_batch_size.

I understand that even without a hardware batch, you can still
have some software batching that excludes compression.

However, based on the code below, it looks like
pool->compr_batch_size is almost always either equal to
pool->batch_size or 1:

+	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
+				     crypto_acomp_batch_size(acomp_ctx->acomp));

+	pool->batch_size = (pool->compr_batch_size > 1) ?
+			   pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;

It seems one of these two variables may be redundant.
For instance, no matter if pool->compr_batch_size > 1, could we always
treat batch_size as ZSWAP_MAX_BATCH_SIZE? If we remove pool->batch_size,
could we just use pool->compr_batch_size as the step size for compression,
no matter if pool->compr_batch_size > 1?

For example:

	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
				     crypto_acomp_batch_size(acomp_ctx->acomp));

Then batch the zswap store using ZSWAP_MAX_BATCH_SIZE, but set the
compression step to pool->compr_batch_size.

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Monday, August 25, 2025 10:17 PM
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> [...]
>
> I understand that even without a hardware batch, you can still
> have some software batching that excludes compression.
>
> However, based on the code below, it looks like
> pool->compr_batch_size is almost always either equal to
> pool->batch_size or 1:
>
> +	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> +				     crypto_acomp_batch_size(acomp_ctx->acomp));

I would like to explain some of the considerations in coming up with this
approach:

1) The compression algorithm gets to decide an optimal batch-size.
   For a hardware accelerator such as IAA, this value could be different
   than ZSWAP_MAX_BATCH_SIZE.

2) ZSWAP_MAX_BATCH_SIZE acts as a limiting factor to the # of acomp_ctx
   per-CPU resources that will be allocated in zswap_cpu_comp_prepare(),
   as per Yosry's suggestion. This helps limit the memory overhead for
   batching algorithms.

3) If a batching algorithm works with a batch size "X", where
   1 < X < ZSWAP_MAX_BATCH_SIZE, two things need to happen:

   a) We want to only allocate "X" per-CPU resources.

   b) We want to process the folio in batches of "X", not
      ZSWAP_MAX_BATCH_SIZE, to avail of the benefits of hardware
      parallelism. This is the compression step size you also mention.

   In particular, we cannot treat batch_size as ZSWAP_MAX_BATCH_SIZE,
   and send a batch of ZSWAP_MAX_BATCH_SIZE pages to zswap_compress()
   in this case. For example, what if the compress step-size is 6, but
   the new zswap_store_pages() introduced in patch 23 sends 8 pages to
   zswap_compress() because ZSWAP_MAX_BATCH_SIZE is set to 8?
   The code in zswap_compress() could get quite messy, which will impact
   latency.

> +	pool->batch_size = (pool->compr_batch_size > 1) ?
> +			   pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
>
> It seems one of these two variables may be redundant.
> For instance, no matter if pool->compr_batch_size > 1, could we always
> treat batch_size as ZSWAP_MAX_BATCH_SIZE? If we remove pool->batch_size,
> could we just use pool->compr_batch_size as the step size for compression,
> no matter if pool->compr_batch_size > 1?

To further explain the rationale for keeping these two distinct, we
statically compute the compress step-size by querying the algorithm in
zswap_cpu_comp_prepare() after the acomp_ctx->acomp has been created.
We store it in pool->compr_batch_size.

Next, in zswap_pool_create(), we do a one-time computation to determine
whether pool->batch_size needs to align with a batching acomp's
compr_batch_size, or, for a non-batching compressor, to be
ZSWAP_MAX_BATCH_SIZE.

This enables further code simplification/unification in zswap_compress(),
and quick retrieval of the # of available acomp_ctx batching resources to
set up the SG lists per call to crypto_acomp_compress(). IOW, memoization
for latency gains and unified code paths.

I hope this clarifies things further. Thanks again, these are all good
questions.

Best regards,
Kanchana

> for example:
> 	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> 				     crypto_acomp_batch_size(acomp_ctx->acomp));
>
> Then batch the zswap store using ZSWAP_MAX_BATCH_SIZE, but set the
> compression step to pool->compr_batch_size.
>
> Thanks
> Barry
On Wed, Aug 27, 2025 at 12:06 PM Sridhar, Kanchana P
<kanchana.p.sridhar@intel.com> wrote:
>
> [...]
>
> I would like to explain some of the considerations in coming up with this
> approach:
>
> 1) The compression algorithm gets to decide an optimal batch-size.
>    For a hardware accelerator such as IAA, this value could be different
>    than ZSWAP_MAX_BATCH_SIZE.
>
> 2) ZSWAP_MAX_BATCH_SIZE acts as a limiting factor to the # of acomp_ctx
>    per-CPU resources that will be allocated in zswap_cpu_comp_prepare(),
>    as per Yosry's suggestion. This helps limit the memory overhead for
>    batching algorithms.
>
> 3) If a batching algorithm works with a batch size "X", where
>    1 < X < ZSWAP_MAX_BATCH_SIZE, two things need to happen:
>
>    a) We want to only allocate "X" per-CPU resources.
>
>    b) We want to process the folio in batches of "X", not
>       ZSWAP_MAX_BATCH_SIZE, to avail of the benefits of hardware
>       parallelism. This is the compression step size you also mention.
>
>    In particular, we cannot treat batch_size as ZSWAP_MAX_BATCH_SIZE,
>    and send a batch of ZSWAP_MAX_BATCH_SIZE pages to zswap_compress()
>    in this case. For example, what if the compress step-size is 6, but
>    the new zswap_store_pages() introduced in patch 23 sends 8 pages to
>    zswap_compress() because ZSWAP_MAX_BATCH_SIZE is set to 8?
>    The code in zswap_compress() could get quite messy, which will impact
>    latency.

If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
compression is done with a step size of 1. If the hardware step size is 4,
compression occurs in two steps. If the hardware step size is 6, the first
compression uses a step size of 6, and the second uses a step size of 2.
Do you think this will work?

I don’t quite understand why you want to save
ZSWAP_MAX_BATCH_SIZE - X resources, since even without hardware batching
you are still allocating all ZSWAP_MAX_BATCH_SIZE resources. This is the
case for all platforms except yours.

Thanks
Barry
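For concreteness, the splitting Barry describes could look like the
following standalone sketch (an illustration only; compress_batch() is a
hypothetical stand-in for one zswap_compress() call covering that many
pages):

#include <stdio.h>

#define ZSWAP_MAX_BATCH_SIZE 8U

/* Hypothetical stand-in for one zswap_compress() call over 'chunk'
 * pages starting at 'start'. */
static void compress_batch(unsigned int start, unsigned int chunk)
{
	printf("compress pages [%u, %u)\n", start, start + chunk);
}

int main(void)
{
	unsigned int step = 6;	/* assumed hardware step size */
	unsigned int done = 0;

	/* Walk a fixed ZSWAP_MAX_BATCH_SIZE window in chunks of the
	 * compressor's step size: step 6 gives chunks of 6 and 2. */
	while (done < ZSWAP_MAX_BATCH_SIZE) {
		unsigned int chunk = ZSWAP_MAX_BATCH_SIZE - done;

		if (chunk > step)
			chunk = step;
		compress_batch(done, chunk);
		done += chunk;
	}
	return 0;
}

Kanchana's reply below argues that this uneven split (6 then 2) loses some
of the hardware's batch amortization, which is why the series instead
carves folios by the compressor's own batch size.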
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Thursday, August 28, 2025 2:40 PM
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> [...]
>
> If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
> compression is done with a step size of 1. If the hardware step size is 4,
> compression occurs in two steps. If the hardware step size is 6, the first
> compression uses a step size of 6, and the second uses a step size of 2.
> Do you think this will work?

Hi Barry,

This would be non-optimal from code simplicity and latency perspectives.
One of the benefits of using the hardware accelerator's "batch parallelism"
is cost amortization across the batch. We might lose this benefit if we make
multiple calls to zswap_compress() to ask the hardware accelerator to
batch compress in smaller batches. Compression throughput would also
be sub-optimal.

In my patch-series, the rule is simple: if an algorithm has specified a
batch-size, carve out the folio by that "batch-size" # of pages to be
compressed as a batch in zswap_compress(). This custom batch-size
is capped at ZSWAP_MAX_BATCH_SIZE.

If an algorithm has not specified a batch-size, the default batch-size
is 1. In this case, carve out the folio by ZSWAP_MAX_BATCH_SIZE
# of pages to be compressed as a batch in zswap_compress().

> I don’t quite understand why you want to save
> ZSWAP_MAX_BATCH_SIZE - X resources, since even without hardware batching
> you are still allocating all ZSWAP_MAX_BATCH_SIZE resources. This is the
> case for all platforms except yours.

Not sure I understand. Just to clarify, this is not done to save on
resources, rather for the reasons stated above.

We are already saving on resources by allocating only
"pool->compr_batch_size" number of resources
(*not* ZSWAP_MAX_BATCH_SIZE resources):

	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
				     crypto_acomp_batch_size(acomp_ctx->acomp));

For non-Intel platforms, this means only 1 dst buffer is allocated, as
explained in the commit log for this patch:

"A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
proceeds to allocate the necessary compression dst buffers in the
per-CPU acomp_ctx."

Thanks,
Kanchana
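A standalone sketch of the carve-out rule described above (illustrative
only; store_pages() is a hypothetical stand-in for the zswap_store_pages()
introduced in patch 23 of the series):

#include <stdio.h>

#define ZSWAP_MAX_BATCH_SIZE 8U

/* Hypothetical stand-in for zswap_store_pages(): store 'nr' pages of the
 * folio starting at 'index' as one compress batch. */
static void store_pages(unsigned int index, unsigned int nr)
{
	printf("batch: pages [%u, %u)\n", index, index + nr);
}

int main(void)
{
	unsigned int compr_batch_size = 1;	/* assumed: sequential compressor */
	unsigned int nr_pages = 20;		/* assumed: a 20-page folio */
	unsigned int batch_size, index;

	/* The rule: a batching compressor's own batch size is the unit;
	 * a sequential compressor still uses ZSWAP_MAX_BATCH_SIZE. */
	batch_size = compr_batch_size > 1 ? compr_batch_size
					  : ZSWAP_MAX_BATCH_SIZE;

	for (index = 0; index < nr_pages; index += batch_size) {
		unsigned int nr = nr_pages - index;

		if (nr > batch_size)
			nr = batch_size;
		store_pages(index, nr);
	}
	return 0;
}

A batching compressor then compresses each chunk in parallel, while a
sequential one walks it page by page, as described above.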
> > If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
> > compression is done with a step size of 1. If the hardware step size is 4,
> > compression occurs in two steps. If the hardware step size is 6, the first
> > compression uses a step size of 6, and the second uses a step size of 2.
> > Do you think this will work?
>
> Hi Barry,
>
> This would be non-optimal from code simplicity and latency perspectives.
> One of the benefits of using the hardware accelerator's "batch parallelism"
> is cost amortization across the batch. We might lose this benefit if we make
> multiple calls to zswap_compress() to ask the hardware accelerator to
> batch compress in smaller batches. Compression throughput would also
> be sub-optimal.

I guess it wouldn’t be an issue if both ZSWAP_MAX_BATCH_SIZE and
pool->compr_batch_size are powers of two. As you mentioned, we still
gain improvement with ZSWAP_MAX_BATCH_SIZE batching even when
pool->compr_batch_size == 1, by compressing pages one by one but
batching other work such as zswap_entries_cache_alloc_batch()?

> In my patch-series, the rule is simple: if an algorithm has specified a
> batch-size, carve out the folio by that "batch-size" # of pages to be
> compressed as a batch in zswap_compress(). This custom batch-size
> is capped at ZSWAP_MAX_BATCH_SIZE.
>
> If an algorithm has not specified a batch-size, the default batch-size
> is 1. In this case, carve out the folio by ZSWAP_MAX_BATCH_SIZE
> # of pages to be compressed as a batch in zswap_compress().

Yes, I understand your rule. However, having two global variables is still
somewhat confusing. It might be clearer to use a single variable with a
comment, since one variable can clearly determine the value of the other.

Can we get the batch_size at runtime based on pool->compr_batch_size?

/*
 * If hardware compression supports batching, we use the hardware step size.
 * Otherwise, we use ZSWAP_MAX_BATCH_SIZE for batching, but still compress
 * one page at a time.
 */
batch_size = pool->compr_batch_size > 1 ? pool->compr_batch_size :
	     ZSWAP_MAX_BATCH_SIZE;

We probably don’t need this if both pool->compr_batch_size and
ZSWAP_MAX_BATCH_SIZE are powers of two?

> Not sure I understand. Just to clarify, this is not done to save on
> resources, rather for the reasons stated above.
>
> We are already saving on resources by allocating only
> "pool->compr_batch_size" number of resources
> (*not* ZSWAP_MAX_BATCH_SIZE resources):
>
> 	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> 				     crypto_acomp_batch_size(acomp_ctx->acomp));
>
> For non-Intel platforms, this means only 1 dst buffer is allocated, as
> explained in the commit log for this patch.

you are right. I misunderstood your code :-)

> "A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
> Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
> compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
> proceeds to allocate the necessary compression dst buffers in the
> per-CPU acomp_ctx."

This is fine, but it still doesn’t provide a strong reason for having
two global variables when one can fully determine the value of the other.

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Thursday, August 28, 2025 4:29 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; nphamcs@gmail.com;
> chengming.zhou@linux.dev; usamaarif642@gmail.com;
> ryan.roberts@arm.com; ying.huang@linux.alibaba.com;
> akpm@linux-foundation.org; senozhatsky@chromium.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com;
> Accardi, Kristen C <kristen.c.accardi@intel.com>;
> Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> > > If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
> > > compression is done with a step size of 1. If the hardware step size is 4,
> > > compression occurs in two steps. If the hardware step size is 6, the first
> > > compression uses a step size of 6, and the second uses a step size of 2.
> > > Do you think this will work?
> >
> > Hi Barry,
> >
> > This would be non-optimal from code simplicity and latency perspectives.
> > One of the benefits of using the hardware accelerator's "batch parallelism"
> > is cost amortization across the batch. We might lose this benefit if we make
> > multiple calls to zswap_compress() to ask the hardware accelerator to
> > batch compress in smaller batches. Compression throughput would also
> > be sub-optimal.
>
> I guess it wouldn’t be an issue if both ZSWAP_MAX_BATCH_SIZE and
> pool->compr_batch_size are powers of two. As you mentioned, we still
> gain improvement with ZSWAP_MAX_BATCH_SIZE batching even when
> pool->compr_batch_size == 1, by compressing pages one by one but
> batching other work such as zswap_entries_cache_alloc_batch()?
>
> > In my patch-series, the rule is simple: if an algorithm has specified a
> > batch-size, carve out the folio by that "batch-size" # of pages to be
> > compressed as a batch in zswap_compress(). This custom batch-size
> > is capped at ZSWAP_MAX_BATCH_SIZE.
> >
> > If an algorithm has not specified a batch-size, the default batch-size
> > is 1. In this case, carve out the folio by ZSWAP_MAX_BATCH_SIZE
> > # of pages to be compressed as a batch in zswap_compress().
>
> Yes, I understand your rule. However, having two global variables is still
> somewhat confusing. It might be clearer to use a single variable with a
> comment, since one variable can clearly determine the value of the other.
>
> Can we get the batch_size at runtime based on pool->compr_batch_size?
>
> /*
>  * If hardware compression supports batching, we use the hardware step size.
>  * Otherwise, we use ZSWAP_MAX_BATCH_SIZE for batching, but still compress
>  * one page at a time.
>  */
> batch_size = pool->compr_batch_size > 1 ? pool->compr_batch_size :
>              ZSWAP_MAX_BATCH_SIZE;
>
> We probably don’t need this if both pool->compr_batch_size and
> ZSWAP_MAX_BATCH_SIZE are powers of two?

I am not sure I understand this rationale, but I do want to reiterate
that the patch-set implements a simple set of rules/design choices
to provide a batching framework for software and hardware compressors,
that has shown good performance improvements with both, while
unifying zswap_store()/zswap_compress() code paths for both.

As explained before, keeping the two variables as distinct u8 members
of struct zswap_pool is a design choice with these benefits:

1) Saves computes by avoiding computing this in performance-critical
   zswap_store() code. I have verified that dynamically computing the
   batch_size based on pool->compr_batch_size impacts latency.

2) The memory overhead is minimal: there is at most one zswap_pool
   active at any given time, other than at compressor transitions. The
   additional overhead is one u8, i.e., 1 byte for 1 runtime struct.

> > > I don’t quite understand why you want to save
> > > ZSWAP_MAX_BATCH_SIZE - X resources, since even without hardware
> > > batching you are still allocating all ZSWAP_MAX_BATCH_SIZE resources.
> > > This is the case for all platforms except yours.
> >
> > Not sure I understand... Just to clarify, this is not done to save on
> > resources, rather for the reasons stated above.
> >
> > We are already saving on resources by allocating only
> > "pool->compr_batch_size" number of resources
> > (*not* ZSWAP_MAX_BATCH_SIZE resources):
> >
> > pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> >                              crypto_acomp_batch_size(acomp_ctx->acomp));
> >
> > For non-Intel platforms, this means only 1 dst buffer is allocated, as
> > explained in the commit log for this patch.
>
> you are right. I misunderstood your code :-)
>
> > "A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
> > Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
> > compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
> > proceeds to allocate the necessary compression dst buffers in the
> > per-CPU acomp_ctx."
>
> This is fine, but it still doesn’t provide a strong reason for having
> two global variables when one can fully determine the value of the other.

Hopefully the above provides clarity.

Thanks,
Kanchana

>
> Thanks
> Barry
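To make the trade-off being debated concrete, these are the two
assignments from the patch itself (quoted in full later in this thread);
both are computed once at pool creation, which is the per-call work
Kanchana says she is avoiding in zswap_store():

/* From zswap_cpu_comp_prepare(): dst buffers actually allocated per CPU. */
pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
			     crypto_acomp_batch_size(acomp_ctx->acomp));

/* From zswap_pool_create(): the unit used to carve large folio stores. */
pool->batch_size = (pool->compr_batch_size > 1) ?
		   pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;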
On Fri, Aug 29, 2025 at 10:57 AM Sridhar, Kanchana P
<kanchana.p.sridhar@intel.com> wrote:
>
> > -----Original Message-----
> > From: Barry Song <21cnbao@gmail.com>
> > Sent: Thursday, August 28, 2025 4:29 PM
> > To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> > Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> > hannes@cmpxchg.org; yosry.ahmed@linux.dev; nphamcs@gmail.com;
> > chengming.zhou@linux.dev; usamaarif642@gmail.com;
> > ryan.roberts@arm.com; ying.huang@linux.alibaba.com;
> > akpm@linux-foundation.org; senozhatsky@chromium.org;
> > linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> > davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> > ebiggers@google.com; surenb@google.com;
> > Accardi, Kristen C <kristen.c.accardi@intel.com>;
> > Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> > <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> > Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> > if the compressor supports batching.
> >
> > > > If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
> > > > compression is done with a step size of 1. If the hardware step size is 4,
> > > > compression occurs in two steps. If the hardware step size is 6, the first
> > > > compression uses a step size of 6, and the second uses a step size of 2.
> > > > Do you think this will work?
> > >
> > > Hi Barry,
> > >
> > > This would be non-optimal from code simplicity and latency perspectives.
> > > One of the benefits of using the hardware accelerator's "batch parallelism"
> > > is cost amortization across the batch. We might lose this benefit if we make
> > > multiple calls to zswap_compress() to ask the hardware accelerator to
> > > batch compress in smaller batches. Compression throughput would also
> > > be sub-optimal.
> >
> > I guess it wouldn’t be an issue if both ZSWAP_MAX_BATCH_SIZE and
> > pool->compr_batch_size are powers of two. As you mentioned, we still
> > gain improvement with ZSWAP_MAX_BATCH_SIZE batching even when
> > pool->compr_batch_size == 1, by compressing pages one by one but
> > batching other work such as zswap_entries_cache_alloc_batch()?
> >
> > > In my patch-series, the rule is simple: if an algorithm has specified a
> > > batch-size, carve out the folio by that "batch-size" # of pages to be
> > > compressed as a batch in zswap_compress(). This custom batch-size
> > > is capped at ZSWAP_MAX_BATCH_SIZE.
> > >
> > > If an algorithm has not specified a batch-size, the default batch-size
> > > is 1. In this case, carve out the folio by ZSWAP_MAX_BATCH_SIZE
> > > # of pages to be compressed as a batch in zswap_compress().
> >
> > Yes, I understand your rule. However, having two global variables is still
> > somewhat confusing. It might be clearer to use a single variable with a
> > comment, since one variable can clearly determine the value of the other.
> >
> > Can we get the batch_size at runtime based on pool->compr_batch_size?
> >
> > /*
> >  * If hardware compression supports batching, we use the hardware step size.
> >  * Otherwise, we use ZSWAP_MAX_BATCH_SIZE for batching, but still compress
> >  * one page at a time.
> >  */
> > batch_size = pool->compr_batch_size > 1 ? pool->compr_batch_size :
> >              ZSWAP_MAX_BATCH_SIZE;
> >
> > We probably don’t need this if both pool->compr_batch_size and
> > ZSWAP_MAX_BATCH_SIZE are powers of two?
>
> I am not sure I understand this rationale, but I do want to reiterate
> that the patch-set implements a simple set of rules/design choices
> to provide a batching framework for software and hardware compressors,
> that has shown good performance improvements with both, while
> unifying zswap_store()/zswap_compress() code paths for both.

I’m really curious: if ZSWAP_MAX_BATCH_SIZE = 8 and
compr_batch_size = 4, why wouldn’t batch_size = 8 and
compr_batch_size = 4 perform better than batch_size = 4 and
compr_batch_size = 4?

In short, I’d like the case of compr_batch_size == 1 to be treated the same
as compr_batch_size == 2, 4, etc., since you can still see performance
improvements when ZSWAP_MAX_BATCH_SIZE = 8 and compr_batch_size == 1,
as batching occurs even outside compression.

Therefore, I would expect batch_size == 8 and compr_batch_size == 2 to
perform better than when both are 2.

The only thing preventing this from happening is that compr_batch_size
might be 5, 6, or 7, which are not powers of two?

> As explained before, keeping the two variables as distinct u8 members
> of struct zswap_pool is a design choice with these benefits:
>
> 1) Saves computes by avoiding computing this in performance-critical
>    zswap_store() code. I have verified that dynamically computing the
>    batch_size based on pool->compr_batch_size impacts latency.

Ok, I’m a bit surprised, since this small computation wouldn’t
cause a branch misprediction at all.

In any case, if you want to keep both variables, that’s fine.
But can we at least rename one of them? For example, use
pool->store_batch_size and pool->compr_batch_size instead of
pool->batch_size and pool->compr_batch_size, since pool->batch_size
generally has a broader semantic scope than compr_batch_size.

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Thursday, August 28, 2025 8:42 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; nphamcs@gmail.com;
> chengming.zhou@linux.dev; usamaarif642@gmail.com;
> ryan.roberts@arm.com; ying.huang@linux.alibaba.com;
> akpm@linux-foundation.org; senozhatsky@chromium.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com;
> Accardi, Kristen C <kristen.c.accardi@intel.com>;
> Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> On Fri, Aug 29, 2025 at 10:57 AM Sridhar, Kanchana P
> <kanchana.p.sridhar@intel.com> wrote:
> >
> > > -----Original Message-----
> > > From: Barry Song <21cnbao@gmail.com>
> > > Sent: Thursday, August 28, 2025 4:29 PM
> > > To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> > > Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> > > hannes@cmpxchg.org; yosry.ahmed@linux.dev; nphamcs@gmail.com;
> > > chengming.zhou@linux.dev; usamaarif642@gmail.com;
> > > ryan.roberts@arm.com; ying.huang@linux.alibaba.com;
> > > akpm@linux-foundation.org; senozhatsky@chromium.org;
> > > linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> > > davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> > > ebiggers@google.com; surenb@google.com;
> > > Accardi, Kristen C <kristen.c.accardi@intel.com>;
> > > Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> > > <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> > > Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching
> > > resources if the compressor supports batching.
> > >
> > > > > If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
> > > > > compression is done with a step size of 1. If the hardware step size is 4,
> > > > > compression occurs in two steps. If the hardware step size is 6, the first
> > > > > compression uses a step size of 6, and the second uses a step size of 2.
> > > > > Do you think this will work?
> > > >
> > > > Hi Barry,
> > > >
> > > > This would be non-optimal from code simplicity and latency perspectives.
> > > > One of the benefits of using the hardware accelerator's "batch parallelism"
> > > > is cost amortization across the batch. We might lose this benefit if we make
> > > > multiple calls to zswap_compress() to ask the hardware accelerator to
> > > > batch compress in smaller batches. Compression throughput would also
> > > > be sub-optimal.
> > >
> > > I guess it wouldn’t be an issue if both ZSWAP_MAX_BATCH_SIZE and
> > > pool->compr_batch_size are powers of two. As you mentioned, we still
> > > gain improvement with ZSWAP_MAX_BATCH_SIZE batching even when
> > > pool->compr_batch_size == 1, by compressing pages one by one but
> > > batching other work such as zswap_entries_cache_alloc_batch()?
> > >
> > > > In my patch-series, the rule is simple: if an algorithm has specified a
> > > > batch-size, carve out the folio by that "batch-size" # of pages to be
> > > > compressed as a batch in zswap_compress(). This custom batch-size
> > > > is capped at ZSWAP_MAX_BATCH_SIZE.
> > > >
> > > > If an algorithm has not specified a batch-size, the default batch-size
> > > > is 1. In this case, carve out the folio by ZSWAP_MAX_BATCH_SIZE
> > > > # of pages to be compressed as a batch in zswap_compress().
> > >
> > > Yes, I understand your rule. However, having two global variables is still
> > > somewhat confusing. It might be clearer to use a single variable with a
> > > comment, since one variable can clearly determine the value of the other.
> > >
> > > Can we get the batch_size at runtime based on pool->compr_batch_size?
> > >
> > > /*
> > >  * If hardware compression supports batching, we use the hardware step size.
> > >  * Otherwise, we use ZSWAP_MAX_BATCH_SIZE for batching, but still compress
> > >  * one page at a time.
> > >  */
> > > batch_size = pool->compr_batch_size > 1 ? pool->compr_batch_size :
> > >              ZSWAP_MAX_BATCH_SIZE;
> > >
> > > We probably don’t need this if both pool->compr_batch_size and
> > > ZSWAP_MAX_BATCH_SIZE are powers of two?
> >
> > I am not sure I understand this rationale, but I do want to reiterate
> > that the patch-set implements a simple set of rules/design choices
> > to provide a batching framework for software and hardware compressors,
> > that has shown good performance improvements with both, while
> > unifying zswap_store()/zswap_compress() code paths for both.
>
> I’m really curious: if ZSWAP_MAX_BATCH_SIZE = 8 and
> compr_batch_size = 4, why wouldn’t batch_size = 8 and
> compr_batch_size = 4 perform better than batch_size = 4 and
> compr_batch_size = 4?
>
> In short, I’d like the case of compr_batch_size == 1 to be treated the same
> as compr_batch_size == 2, 4, etc., since you can still see performance
> improvements when ZSWAP_MAX_BATCH_SIZE = 8 and compr_batch_size == 1,
> as batching occurs even outside compression.
>
> Therefore, I would expect batch_size == 8 and compr_batch_size == 2 to
> perform better than when both are 2.
>
> The only thing preventing this from happening is that compr_batch_size
> might be 5, 6, or 7, which are not powers of two?

It would be interesting to see if a generalization of pool->compr_batch_size
being a factor "N" (where N > 1) of ZSWAP_MAX_BATCH_SIZE yields better
performance than the current set of rules. However, we would still need to
handle the case where it is not, as you mention, which might still necessitate
the use of a distinct pool->batch_size to avoid re-calculating this
dynamically, when this information doesn't change after pool creation.

The current implementation gives preference to the algorithm to determine
not just the batch compression step-size, but also the working-set size for
other zswap processing for the batch, i.e., bulk allocation of entries,
zpool writes, etc. The algorithm's batch-size is what zswap uses for the
latter (the zswap_store_pages() in my patch-set). This has been shown to
work well.

To change this design to be driven instead by ZSWAP_MAX_BATCH_SIZE
always (while handling non-factor pool->compr_batch_size) requires more
data gathering. I am inclined to keep the existing implementation and
we can continue to improve upon this if it's OK with you.

> > As explained before, keeping the two variables as distinct u8 members
> > of struct zswap_pool is a design choice with these benefits:
> >
> > 1) Saves computes by avoiding computing this in performance-critical
> >    zswap_store() code. I have verified that dynamically computing the
> >    batch_size based on pool->compr_batch_size impacts latency.
>
> Ok, I’m a bit surprised, since this small computation wouldn’t
> cause a branch misprediction at all.
>
> In any case, if you want to keep both variables, that’s fine.
> But can we at least rename one of them? For example, use
> pool->store_batch_size and pool->compr_batch_size instead of
> pool->batch_size and pool->compr_batch_size, since pool->batch_size
> generally has a broader semantic scope than compr_batch_size.

Sure. I will change pool->batch_size to be pool->store_batch_size.

Thanks,
Kanchana

>
> Thanks
> Barry
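For reference, the store-side carving rule the two are converging on would
look roughly like this. It is a condensed sketch, not the literal patch
code: zswap_store_pages() is the name Kanchana mentions above, but its
signature and the error handling here are assumptions.

/*
 * zswap_store() processes the folio in units of pool->batch_size
 * (pool->store_batch_size after the rename); each unit is handed to
 * zswap_store_pages() for entry allocation, compression and zpool writes.
 */
long nr_folio_pages = folio_nr_pages(folio);
long index;

for (index = 0; index < nr_folio_pages; index += pool->batch_size) {
	unsigned int nr_pages = min_t(long, pool->batch_size,
				      nr_folio_pages - index);

	if (!zswap_store_pages(folio, index, nr_pages, pool))
		goto store_failed;	/* unwind, assumed label */
}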
> > > I am not sure I understand this rationale, but I do want to reiterate
> > > that the patch-set implements a simple set of rules/design choices
> > > to provide a batching framework for software and hardware compressors,
> > > that has shown good performance improvements with both, while
> > > unifying zswap_store()/zswap_compress() code paths for both.
> >
> > I’m really curious: if ZSWAP_MAX_BATCH_SIZE = 8 and
> > compr_batch_size = 4, why wouldn’t batch_size = 8 and
> > compr_batch_size = 4 perform better than batch_size = 4 and
> > compr_batch_size = 4?
> >
> > In short, I’d like the case of compr_batch_size == 1 to be treated the same
> > as compr_batch_size == 2, 4, etc., since you can still see performance
> > improvements when ZSWAP_MAX_BATCH_SIZE = 8 and compr_batch_size == 1,
> > as batching occurs even outside compression.
> >
> > Therefore, I would expect batch_size == 8 and compr_batch_size == 2 to
> > perform better than when both are 2.
> >
> > The only thing preventing this from happening is that compr_batch_size
> > might be 5, 6, or 7, which are not powers of two?
>
> It would be interesting to see if a generalization of pool->compr_batch_size
> being a factor "N" (where N > 1) of ZSWAP_MAX_BATCH_SIZE yields better
> performance than the current set of rules. However, we would still need to
> handle the case where it is not, as you mention, which might still necessitate
> the use of a distinct pool->batch_size to avoid re-calculating this
> dynamically, when this information doesn't change after pool creation.
>
> The current implementation gives preference to the algorithm to determine
> not just the batch compression step-size, but also the working-set size for
> other zswap processing for the batch, i.e., bulk allocation of entries,
> zpool writes, etc. The algorithm's batch-size is what zswap uses for the
> latter (the zswap_store_pages() in my patch-set). This has been shown to
> work well.
>
> To change this design to be driven instead by ZSWAP_MAX_BATCH_SIZE
> always (while handling non-factor pool->compr_batch_size) requires more
> data gathering. I am inclined to keep the existing implementation and
> we can continue to improve upon this if it's OK with you.

Right, I have no objection at this stage. I’m just curious whether
batch_size == compr_batch_size is always the best rule, since some hardware
now supports HW compression with only one queue, and in the future may
increase to two or four queues but not many overall.

BTW, is HW compression always better than software? For example, when
kswapd, proactive reclamation, and direct reclamation all run simultaneously,
the CPU-based approach can leverage multiple CPUs to perform compression
in parallel. But if the hardware only provides a limited number of queues,
software might actually perform better. An extreme case is when multiple
threads are running MADV_PAGEOUT at the same time.

I’m not opposing your current patchset, just sharing some side thoughts :-)

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Saturday, August 30, 2025 1:41 AM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; nphamcs@gmail.com;
> chengming.zhou@linux.dev; usamaarif642@gmail.com;
> ryan.roberts@arm.com; ying.huang@linux.alibaba.com;
> akpm@linux-foundation.org; senozhatsky@chromium.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com;
> Accardi, Kristen C <kristen.c.accardi@intel.com>;
> Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> > > > I am not sure I understand this rationale, but I do want to reiterate
> > > > that the patch-set implements a simple set of rules/design choices
> > > > to provide a batching framework for software and hardware compressors,
> > > > that has shown good performance improvements with both, while
> > > > unifying zswap_store()/zswap_compress() code paths for both.
> > >
> > > I’m really curious: if ZSWAP_MAX_BATCH_SIZE = 8 and
> > > compr_batch_size = 4, why wouldn’t batch_size = 8 and
> > > compr_batch_size = 4 perform better than batch_size = 4 and
> > > compr_batch_size = 4?
> > >
> > > In short, I’d like the case of compr_batch_size == 1 to be treated the same
> > > as compr_batch_size == 2, 4, etc., since you can still see performance
> > > improvements when ZSWAP_MAX_BATCH_SIZE = 8 and compr_batch_size == 1,
> > > as batching occurs even outside compression.
> > >
> > > Therefore, I would expect batch_size == 8 and compr_batch_size == 2 to
> > > perform better than when both are 2.
> > >
> > > The only thing preventing this from happening is that compr_batch_size
> > > might be 5, 6, or 7, which are not powers of two?
> >
> > It would be interesting to see if a generalization of pool->compr_batch_size
> > being a factor "N" (where N > 1) of ZSWAP_MAX_BATCH_SIZE yields better
> > performance than the current set of rules. However, we would still need to
> > handle the case where it is not, as you mention, which might still
> > necessitate the use of a distinct pool->batch_size to avoid re-calculating
> > this dynamically, when this information doesn't change after pool creation.
> >
> > The current implementation gives preference to the algorithm to determine
> > not just the batch compression step-size, but also the working-set size for
> > other zswap processing for the batch, i.e., bulk allocation of entries,
> > zpool writes, etc. The algorithm's batch-size is what zswap uses for the
> > latter (the zswap_store_pages() in my patch-set). This has been shown to
> > work well.
> >
> > To change this design to be driven instead by ZSWAP_MAX_BATCH_SIZE
> > always (while handling non-factor pool->compr_batch_size) requires more
> > data gathering. I am inclined to keep the existing implementation and
> > we can continue to improve upon this if it's OK with you.
>
> Right, I have no objection at this stage. I’m just curious whether
> batch_size == compr_batch_size is always the best rule, since some hardware
> now supports HW compression with only one queue, and in the future may
> increase to two or four queues but not many overall.
>
> BTW, is HW compression always better than software? For example, when
> kswapd, proactive reclamation, and direct reclamation all run simultaneously,
> the CPU-based approach can leverage multiple CPUs to perform compression
> in parallel. But if the hardware only provides a limited number of queues,
> software might actually perform better. An extreme case is when multiple
> threads are running MADV_PAGEOUT at the same time.

These are great questions that we'll need to run more experiments to
answer and understand the trade-offs. The good thing is the zswap
architecture proposed in this patch-set is flexible enough to allow us
to do so with minor changes in how we set up these two zswap_pool data
members (pool->compr_batch_size and pool->store_batch_size).

One of the next steps that we plan to explore is integrating batching
for hardware parallelism with the kcompressd work, as per Nhat's
suggestion. I believe hardware can be used with multiple compression
threads from all these sources. There are many details to be worked out,
but I think this can be done by striking the right balance between cost
amortization, hardware parallelism, and overlapping computes between the
CPU and the accelerator.

Another important point is that our hardware roadmap continues to
evolve, consistently improving compression ratios and lowering both
compression and decompression latency while boosting overall throughput.

To summarize: with hardware compression acceleration, and using batching
to avail of parallel hardware compress/decompress, we can improve the
reclaim latency for a given CPU thread. When combined with compression
ratio improvements, we save more memory for equivalent performance,
and/or need to reclaim less via proactive or direct reclaim (reclaim
latency improvements result in less memory pressure), which improves
workload performance. This can make a significant impact on contended
systems where CPU threads for reclaim come at a cost, and hardware
compression can help offset this. Hence, I can only see upside, but
we'll need to prove this out :).

Thanks,
Kanchana

>
> I’m not opposing your current patchset, just sharing some side thoughts :-)
>
> Thanks
> Barry
On Thu, Jul 31, 2025 at 9:36 PM Kanchana P Sridhar
<kanchana.p.sridhar@intel.com> wrote:
>
> This patch sets up zswap for allocating per-CPU resources optimally for
> non-batching and batching compressors.
>
> A new ZSWAP_MAX_BATCH_SIZE constant is defined as 8U, to set an upper
> limit on the number of pages in large folios that will be batch
> compressed.
>
> As per Herbert's comments in [2] in response to the
> crypto_acomp_batch_compress() and crypto_acomp_batch_decompress() API
> proposed in [1], this series does not create new crypto_acomp batching
> API. Instead, zswap compression batching uses the existing
> crypto_acomp_compress() API in combination with the "void *kernel_data"
> member added to "struct acomp_req" earlier in this series.
>
> It is up to the compressor to manage multiple requests, as needed, to
> accomplish batch parallelism. zswap only needs to allocate the per-CPU
> dst buffers according to the batch size supported by the compressor.
>
> A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
> Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
> compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
> proceeds to allocate the necessary compression dst buffers in the
> per-CPU acomp_ctx.
>
> Another "u8 batch_size" member is added to "struct zswap_pool" to store
> the unit for batching large folio stores: for batching compressors, this
> is the pool->compr_batch_size. For non-batching compressors, this is
> ZSWAP_MAX_BATCH_SIZE.
>
> zswap does not use more than one dst buffer yet. Follow-up patches will
> actually utilize the multiple acomp_ctx buffers for batch
> compression/decompression of multiple pages.
>
> Thus, ZSWAP_MAX_BATCH_SIZE limits the amount of extra memory used for
> batching. There is a small extra memory overhead of allocating
> the acomp_ctx->buffers array for compressors that do not support
> batching: On x86_64, the overhead is 1 pointer per-CPU (i.e. 8 bytes).
>
> [1]: https://patchwork.kernel.org/project/linux-mm/patch/20250508194134.28392-11-kanchana.p.sridhar@intel.com/
> [2]: https://patchwork.kernel.org/comment/26382610
>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>

Mostly LGTM. Just a couple of questions below:

> ---
>  mm/zswap.c | 82 +++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 63 insertions(+), 19 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index efd501a7fe294..63a997b999537 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -80,6 +80,9 @@ static bool zswap_pool_reached_full;
>
>  #define ZSWAP_PARAM_UNSET ""
>
> +/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
> +#define ZSWAP_MAX_BATCH_SIZE 8U
> +
>  static int zswap_setup(void);
>
>  /* Enable/disable zswap */
> @@ -147,7 +150,7 @@ struct crypto_acomp_ctx {
>         struct crypto_acomp *acomp;
>         struct acomp_req *req;
>         struct crypto_wait wait;
> -       u8 *buffer;
> +       u8 **buffers;
>         struct mutex mutex;
>         bool is_sleepable;
>  };
> @@ -166,6 +169,8 @@ struct zswap_pool {
>         struct work_struct release_work;
>         struct hlist_node node;
>         char tfm_name[CRYPTO_MAX_ALG_NAME];
> +       u8 compr_batch_size;
> +       u8 batch_size;

Apologies if this is explained elsewhere, but I'm very confused - why
do we need both of these two fields?

Seems like batch_size is defined below, and never changed:

pool->batch_size = (pool->compr_batch_size > 1) ?
pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;

Can we just determine this in zswap_store() as a local variable?

>  };
>
>  /* Global LRU lists shared by all zswap pools. */
> @@ -258,8 +263,10 @@ static void __zswap_pool_empty(struct percpu_ref *ref);
>   * zswap_cpu_comp_prepare(), not others.
>   * - Cleanup acomp_ctx resources on all cores in zswap_pool_destroy().
>   */
> -static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> +static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
>  {
> +       u8 i;
> +
>         if (IS_ERR_OR_NULL(acomp_ctx))
>                 return;
>
> @@ -269,7 +276,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
>         if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
>                 crypto_free_acomp(acomp_ctx->acomp);
>
> -       kfree(acomp_ctx->buffer);
> +       if (acomp_ctx->buffers) {
> +               for (i = 0; i < nr_buffers; ++i)
> +                       kfree(acomp_ctx->buffers[i]);
> +               kfree(acomp_ctx->buffers);
> +       }
>  }
>
>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> @@ -290,6 +301,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>                 return NULL;
>         }
>
> +       /* Many things rely on the zero-initialization. */
>         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
>         if (!pool)
>                 return NULL;
> @@ -352,13 +364,28 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>                 goto ref_fail;
>         INIT_LIST_HEAD(&pool->list);
>
> +       /*
> +        * Set the unit of compress batching for large folios, for quick
> +        * retrieval in the zswap_compress() fast path:
> +        * If the compressor is sequential (@pool->compr_batch_size is 1),
> +        * large folios will be compressed in batches of ZSWAP_MAX_BATCH_SIZE
> +        * pages, where each page in the batch is compressed sequentially.
> +        * We see better performance by processing the folio in batches of
> +        * ZSWAP_MAX_BATCH_SIZE, due to cache locality of working set
> +        * structures.
> +        */
> +       pool->batch_size = (pool->compr_batch_size > 1) ?
> +                          pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
> +
>         zswap_pool_debug("created", pool);
>
>         return pool;
>
>  ref_fail:
>         for_each_possible_cpu(cpu)
> -               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> +               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> +                                 pool->compr_batch_size);
> +
>  error:
>         if (pool->acomp_ctx)
>                 free_percpu(pool->acomp_ctx);
> @@ -417,7 +444,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
>         zswap_pool_debug("destroying", pool);
>
>         for_each_possible_cpu(cpu)
> -               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> +               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> +                                 pool->compr_batch_size);
>
>         free_percpu(pool->acomp_ctx);
>
> @@ -876,6 +904,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
>         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
>         int ret = -ENOMEM;
> +       u8 i;
>
>         /*
>          * The per-CPU pool->acomp_ctx is zero-initialized on allocation.
> @@ -888,10 +917,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>         if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
>                 return 0;
>
> -       acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
> -       if (!acomp_ctx->buffer)
> -               return ret;
> -
>         acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
>         if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
>                 pr_err("could not alloc crypto acomp %s : %ld\n",
> @@ -904,17 +929,36 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>         acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
>         if (IS_ERR_OR_NULL(acomp_ctx->req)) {
>                 pr_err("could not alloc crypto acomp_request %s\n",
> -                      pool->tfm_name);
> +                       pool->tfm_name);

Is this intentional? :)

>                 goto fail;
>         }
>
> -       crypto_init_wait(&acomp_ctx->wait);
> +       /*
> +        * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
> +        * compressor supports batching.
> +        */
> +       pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> +                                    crypto_acomp_batch_size(acomp_ctx->acomp));
> +
> +       acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
> +                                         GFP_KERNEL, cpu_to_node(cpu));
> +       if (!acomp_ctx->buffers)
> +               goto fail;
> +
> +       for (i = 0; i < pool->compr_batch_size; ++i) {
> +               acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
> +                                                    cpu_to_node(cpu));
> +               if (!acomp_ctx->buffers[i])
> +                       goto fail;
> +       }
>
>         /*
>          * if the backend of acomp is async zip, crypto_req_done() will wakeup
>          * crypto_wait_req(); if the backend of acomp is scomp, the callback
>          * won't be called, crypto_wait_req() will return without blocking.
>          */
> +       crypto_init_wait(&acomp_ctx->wait);
> +
>         acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
>                                    crypto_req_done, &acomp_ctx->wait);
>
> @@ -922,7 +966,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>         return 0;
>
>  fail:
> -       acomp_ctx_dealloc(acomp_ctx);
> +       acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
>         return ret;
>  }
>
> @@ -942,7 +986,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>
>         mutex_lock(&acomp_ctx->mutex);
>
> -       dst = acomp_ctx->buffer;
> +       dst = acomp_ctx->buffers[0];
>         sg_init_table(&input, 1);
>         sg_set_page(&input, page, PAGE_SIZE, 0);
>
> @@ -1003,19 +1047,19 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>
>         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>         mutex_lock(&acomp_ctx->mutex);
> -       obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
> +       obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffers[0]);
>
>         /*
>          * zpool_obj_read_begin() might return a kmap address of highmem when
> -        * acomp_ctx->buffer is not used. However, sg_init_one() does not
> -        * handle highmem addresses, so copy the object to acomp_ctx->buffer.
> +        * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
> +        * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
>          */
>         if (virt_addr_valid(obj)) {
>                 src = obj;
>         } else {
> -               WARN_ON_ONCE(obj == acomp_ctx->buffer);
> -               memcpy(acomp_ctx->buffer, obj, entry->length);
> -               src = acomp_ctx->buffer;
> +               WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
> +               memcpy(acomp_ctx->buffers[0], obj, entry->length);
> +               src = acomp_ctx->buffers[0];
>         }
>
>         sg_init_one(&input, src, entry->length);
> --
> 2.27.0
>
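For concreteness, the alternative Nhat floats above (deriving the store
unit per call instead of caching it) would look roughly like the sketch
below. zswap_store_unit() is a hypothetical helper name, not code from
the series:

/* Hypothetical sketch of Nhat's suggestion. */
static inline u8 zswap_store_unit(const struct zswap_pool *pool)
{
	return pool->compr_batch_size > 1 ? pool->compr_batch_size :
					    ZSWAP_MAX_BATCH_SIZE;
}

Kanchana's reply below explains why the series caches the result in
struct zswap_pool instead: the value never changes after pool creation,
and she measured the per-call computation in zswap_store() as a latency
hit.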
> -----Original Message-----
> From: Nhat Pham <nphamcs@gmail.com>
> Sent: Thursday, August 14, 2025 1:58 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; chengming.zhou@linux.dev;
> usamaarif642@gmail.com; ryan.roberts@arm.com; 21cnbao@gmail.com;
> ying.huang@linux.alibaba.com; akpm@linux-foundation.org;
> senozhatsky@chromium.org; linux-crypto@vger.kernel.org;
> herbert@gondor.apana.org.au; davem@davemloft.net;
> clabbe@baylibre.com; ardb@kernel.org; ebiggers@google.com;
> surenb@google.com; Accardi, Kristen C <kristen.c.accardi@intel.com>;
> Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> On Thu, Jul 31, 2025 at 9:36 PM Kanchana P Sridhar
> <kanchana.p.sridhar@intel.com> wrote:
> >
> > This patch sets up zswap for allocating per-CPU resources optimally for
> > non-batching and batching compressors.
> >
> > A new ZSWAP_MAX_BATCH_SIZE constant is defined as 8U, to set an upper
> > limit on the number of pages in large folios that will be batch
> > compressed.
> >
> > As per Herbert's comments in [2] in response to the
> > crypto_acomp_batch_compress() and crypto_acomp_batch_decompress() API
> > proposed in [1], this series does not create new crypto_acomp batching
> > API. Instead, zswap compression batching uses the existing
> > crypto_acomp_compress() API in combination with the "void *kernel_data"
> > member added to "struct acomp_req" earlier in this series.
> >
> > It is up to the compressor to manage multiple requests, as needed, to
> > accomplish batch parallelism. zswap only needs to allocate the per-CPU
> > dst buffers according to the batch size supported by the compressor.
> >
> > A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
> > Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
> > compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
> > proceeds to allocate the necessary compression dst buffers in the
> > per-CPU acomp_ctx.
> >
> > Another "u8 batch_size" member is added to "struct zswap_pool" to store
> > the unit for batching large folio stores: for batching compressors, this
> > is the pool->compr_batch_size. For non-batching compressors, this is
> > ZSWAP_MAX_BATCH_SIZE.
> >
> > zswap does not use more than one dst buffer yet. Follow-up patches will
> > actually utilize the multiple acomp_ctx buffers for batch
> > compression/decompression of multiple pages.
> >
> > Thus, ZSWAP_MAX_BATCH_SIZE limits the amount of extra memory used for
> > batching. There is a small extra memory overhead of allocating
> > the acomp_ctx->buffers array for compressors that do not support
> > batching: On x86_64, the overhead is 1 pointer per-CPU (i.e. 8 bytes).
> >
> > [1]: https://patchwork.kernel.org/project/linux-mm/patch/20250508194134.28392-11-kanchana.p.sridhar@intel.com/
> > [2]: https://patchwork.kernel.org/comment/26382610
> >
> > Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
>
> Mostly LGTM. Just a couple of questions below:

Hi Nhat,

Thanks for taking the time to review the patches! Sure, these are great
questions, responses are inline.

> > ---
> >  mm/zswap.c | 82 +++++++++++++++++++++++++++++++++++++++++-------------
> >  1 file changed, 63 insertions(+), 19 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index efd501a7fe294..63a997b999537 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -80,6 +80,9 @@ static bool zswap_pool_reached_full;
> >
> >  #define ZSWAP_PARAM_UNSET ""
> >
> > +/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
> > +#define ZSWAP_MAX_BATCH_SIZE 8U
> > +
> >  static int zswap_setup(void);
> >
> >  /* Enable/disable zswap */
> > @@ -147,7 +150,7 @@ struct crypto_acomp_ctx {
> >         struct crypto_acomp *acomp;
> >         struct acomp_req *req;
> >         struct crypto_wait wait;
> > -       u8 *buffer;
> > +       u8 **buffers;
> >         struct mutex mutex;
> >         bool is_sleepable;
> >  };
> > @@ -166,6 +169,8 @@ struct zswap_pool {
> >         struct work_struct release_work;
> >         struct hlist_node node;
> >         char tfm_name[CRYPTO_MAX_ALG_NAME];
> > +       u8 compr_batch_size;
> > +       u8 batch_size;
>
> Apologies if this is explained elsewhere, but I'm very confused - why
> do we need both of these two fields?

No worries. This was my thinking in keeping these separate:

"compr_batch_size" is indicative of the number of batching resources
allocated per-CPU. Hence, zswap_compress() uses this to determine if we
need to compress one page at a time in the input batch of pages.

"batch_size" represents the number of pages that will be sent to
zswap_compress() as a batch.

> Seems like batch_size is defined below, and never changed:
>
> pool->batch_size = (pool->compr_batch_size > 1) ?
> pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
>
> Can we just determine this in zswap_store() as a local variable?

I figured since the number of zswap_pools at any given time is less than
or equal to 2 (IIRC), it should be a good compromise to add these two u8
members for latency reasons, so that this doesn't have to be computed
per call to zswap_store().

> >  };
> >
> >  /* Global LRU lists shared by all zswap pools. */
> > @@ -258,8 +263,10 @@ static void __zswap_pool_empty(struct percpu_ref *ref);
> >   * zswap_cpu_comp_prepare(), not others.
> >   * - Cleanup acomp_ctx resources on all cores in zswap_pool_destroy().
> >   */
> > -static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> > +static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
> >  {
> > +       u8 i;
> > +
> >         if (IS_ERR_OR_NULL(acomp_ctx))
> >                 return;
> >
> > @@ -269,7 +276,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> >         if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> >                 crypto_free_acomp(acomp_ctx->acomp);
> >
> > -       kfree(acomp_ctx->buffer);
> > +       if (acomp_ctx->buffers) {
> > +               for (i = 0; i < nr_buffers; ++i)
> > +                       kfree(acomp_ctx->buffers[i]);
> > +               kfree(acomp_ctx->buffers);
> > +       }
> >  }
> >
> >  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> > @@ -290,6 +301,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >                 return NULL;
> >         }
> >
> > +       /* Many things rely on the zero-initialization. */
> >         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
> >         if (!pool)
> >                 return NULL;
> > @@ -352,13 +364,28 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >                 goto ref_fail;
> >         INIT_LIST_HEAD(&pool->list);
> >
> > +       /*
> > +        * Set the unit of compress batching for large folios, for quick
> > +        * retrieval in the zswap_compress() fast path:
> > +        * If the compressor is sequential (@pool->compr_batch_size is 1),
> > +        * large folios will be compressed in batches of ZSWAP_MAX_BATCH_SIZE
> > +        * pages, where each page in the batch is compressed sequentially.
> > +        * We see better performance by processing the folio in batches of
> > +        * ZSWAP_MAX_BATCH_SIZE, due to cache locality of working set
> > +        * structures.
> > +        */
> > +       pool->batch_size = (pool->compr_batch_size > 1) ?
> > +                          pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
> > +
> >         zswap_pool_debug("created", pool);
> >
> >         return pool;
> >
> >  ref_fail:
> >         for_each_possible_cpu(cpu)
> > -               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> > +               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> > +                                 pool->compr_batch_size);
> > +
> >  error:
> >         if (pool->acomp_ctx)
> >                 free_percpu(pool->acomp_ctx);
> > @@ -417,7 +444,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> >         zswap_pool_debug("destroying", pool);
> >
> >         for_each_possible_cpu(cpu)
> > -               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> > +               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> > +                                 pool->compr_batch_size);
> >
> >         free_percpu(pool->acomp_ctx);
> >
> > @@ -876,6 +904,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> >         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> >         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> >         int ret = -ENOMEM;
> > +       u8 i;
> >
> >         /*
> >          * The per-CPU pool->acomp_ctx is zero-initialized on allocation.
> > @@ -888,10 +917,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> >         if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> >                 return 0;
> >
> > -       acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
> > -       if (!acomp_ctx->buffer)
> > -               return ret;
> > -
> >         acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> >         if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
> >                 pr_err("could not alloc crypto acomp %s : %ld\n",
> > @@ -904,17 +929,36 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> >         acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
> >         if (IS_ERR_OR_NULL(acomp_ctx->req)) {
> >                 pr_err("could not alloc crypto acomp_request %s\n",
> > -                      pool->tfm_name);
> > +                       pool->tfm_name);
>
> Is this intentional? :)

Yes, it is indeed :). I can revert it if you prefer.

Thanks,
Kanchana

> >                 goto fail;
> >         }
> >
> > -       crypto_init_wait(&acomp_ctx->wait);
> > +       /*
> > +        * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
> > +        * compressor supports batching.
> > +        */
> > +       pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> > +                                    crypto_acomp_batch_size(acomp_ctx->acomp));
> > +
> > +       acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
> > +                                         GFP_KERNEL, cpu_to_node(cpu));
> > +       if (!acomp_ctx->buffers)
> > +               goto fail;
> > +
> > +       for (i = 0; i < pool->compr_batch_size; ++i) {
> > +               acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
> > +                                                    cpu_to_node(cpu));
> > +               if (!acomp_ctx->buffers[i])
> > +                       goto fail;
> > +       }
> >
> >         /*
> >          * if the backend of acomp is async zip, crypto_req_done() will wakeup
> >          * crypto_wait_req(); if the backend of acomp is scomp, the callback
> >          * won't be called, crypto_wait_req() will return without blocking.
> >          */
> > +       crypto_init_wait(&acomp_ctx->wait);
> > +
> >         acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> >                                    crypto_req_done, &acomp_ctx->wait);
> >
> > @@ -922,7 +966,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> >         return 0;
> >
> >  fail:
> > -       acomp_ctx_dealloc(acomp_ctx);
> > +       acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
> >         return ret;
> >  }
> >
> > @@ -942,7 +986,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> >
> >         mutex_lock(&acomp_ctx->mutex);
> >
> > -       dst = acomp_ctx->buffer;
> > +       dst = acomp_ctx->buffers[0];
> >         sg_init_table(&input, 1);
> >         sg_set_page(&input, page, PAGE_SIZE, 0);
> >
> > @@ -1003,19 +1047,19 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> >
> >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> >         mutex_lock(&acomp_ctx->mutex);
> > -       obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
> > +       obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffers[0]);
> >
> >         /*
> >          * zpool_obj_read_begin() might return a kmap address of highmem when
> > -        * acomp_ctx->buffer is not used. However, sg_init_one() does not
> > -        * handle highmem addresses, so copy the object to acomp_ctx->buffer.
> > +        * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
> > +        * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
> >          */
> >         if (virt_addr_valid(obj)) {
> >                 src = obj;
> >         } else {
> > -               WARN_ON_ONCE(obj == acomp_ctx->buffer);
> > -               memcpy(acomp_ctx->buffer, obj, entry->length);
> > -               src = acomp_ctx->buffer;
> > +               WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
> > +               memcpy(acomp_ctx->buffers[0], obj, entry->length);
> > +               src = acomp_ctx->buffers[0];
> >         }
> >
> >         sg_init_one(&input, src, entry->length);
> > --
> > 2.27.0
> >
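One detail worth pulling out of the hunks quoted above: acomp_ctx_dealloc()
takes an explicit nr_buffers argument because the buffers array is sized
from pool->compr_batch_size at setup, so teardown must free exactly that
many entries. Condensed from the patch:

/* Setup (zswap_cpu_comp_prepare()): the array length is the batch size. */
acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
				  GFP_KERNEL, cpu_to_node(cpu));

/* Teardown (zswap_pool_destroy()): pass the same count back in. */
for_each_possible_cpu(cpu)
	acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
			  pool->compr_batch_size);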