This patch sets up zswap to allocate per-CPU resources optimally for
both batching and non-batching compressors.
A new ZSWAP_MAX_BATCH_SIZE constant is defined as 8U, to set an upper
limit on the number of pages in large folios that will be batch
compressed.
As per Herbert's comments in [2], in response to the
crypto_acomp_batch_compress() and crypto_acomp_batch_decompress() API
proposed in [1], this series does not create a new crypto_acomp batching
API. Instead, zswap compression batching uses the existing
crypto_acomp_compress() API in combination with the "void *kernel_data"
member added to "struct acomp_req" earlier in this series.
It is up to the compressor to manage multiple requests, as needed, to
accomplish batch parallelism. zswap only needs to allocate the per-CPU
dst buffers according to the batch size supported by the compressor.
A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
proceeds to allocate the necessary compression dst buffers in the
per-CPU acomp_ctx.
Another "u8 batch_size" member is added to "struct zswap_pool" to store
the unit for batching large folio stores: for batching compressors, this
is the pool->compr_batch_size. For non-batching compressors, this is
ZSWAP_MAX_BATCH_SIZE.
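To make the selection rule concrete, a minimal standalone sketch follows
(an illustration only, not kernel code; compressor_batch_size() is a
hypothetical stand-in for crypto_acomp_batch_size()):

#include <stdio.h>

#define ZSWAP_MAX_BATCH_SIZE 8U

/* Hypothetical stand-in for crypto_acomp_batch_size(): a sequential
 * compressor reports 1; a batching compressor reports its own batch size. */
static unsigned int compressor_batch_size(void)
{
	return 1;
}

int main(void)
{
	unsigned int compr_batch_size, batch_size;

	/* pool->compr_batch_size: how many dst buffers to allocate per CPU,
	 * capped at ZSWAP_MAX_BATCH_SIZE. */
	compr_batch_size = compressor_batch_size();
	if (compr_batch_size > ZSWAP_MAX_BATCH_SIZE)
		compr_batch_size = ZSWAP_MAX_BATCH_SIZE;

	/* pool->batch_size: the unit for carving up large folio stores.
	 * Sequential compressors still batch the store path in units of
	 * ZSWAP_MAX_BATCH_SIZE for cache locality. */
	batch_size = compr_batch_size > 1 ? compr_batch_size
					  : ZSWAP_MAX_BATCH_SIZE;

	printf("compr_batch_size=%u batch_size=%u\n",
	       compr_batch_size, batch_size);
	return 0;
}

With a sequential compressor this prints compr_batch_size=1 batch_size=8;
a compressor reporting 8 yields 8 for both.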
zswap does not use more than one dst buffer yet. Follow-up patches will
utilize the multiple acomp_ctx buffers for batch
compression/decompression of multiple pages.
Thus, ZSWAP_MAX_BATCH_SIZE limits the amount of extra memory used for
batching. There is a small extra memory overhead of allocating
the acomp_ctx->buffers array for compressors that do not support
batching: on x86_64, the overhead is one pointer per CPU (i.e., 8 bytes).
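As a worked example of the batching-side cost (assuming 4KB pages): a
compressor with a batch size of 8 gets 8 dst buffers of 2 * PAGE_SIZE
each, i.e. 8 * 8KB = 64KB per CPU, plus the 8-entry pointer array,
versus the single 8KB buffer allocated today.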
[1]: https://patchwork.kernel.org/project/linux-mm/patch/20250508194134.28392-11-kanchana.p.sridhar@intel.com/
[2]: https://patchwork.kernel.org/comment/26382610
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
mm/zswap.c | 82 +++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 63 insertions(+), 19 deletions(-)
diff --git a/mm/zswap.c b/mm/zswap.c
index efd501a7fe294..63a997b999537 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -80,6 +80,9 @@ static bool zswap_pool_reached_full;
#define ZSWAP_PARAM_UNSET ""
+/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
+#define ZSWAP_MAX_BATCH_SIZE 8U
+
static int zswap_setup(void);
/* Enable/disable zswap */
@@ -147,7 +150,7 @@ struct crypto_acomp_ctx {
struct crypto_acomp *acomp;
struct acomp_req *req;
struct crypto_wait wait;
- u8 *buffer;
+ u8 **buffers;
struct mutex mutex;
bool is_sleepable;
};
@@ -166,6 +169,8 @@ struct zswap_pool {
struct work_struct release_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
+ u8 compr_batch_size;
+ u8 batch_size;
};
/* Global LRU lists shared by all zswap pools. */
@@ -258,8 +263,10 @@ static void __zswap_pool_empty(struct percpu_ref *ref);
* zswap_cpu_comp_prepare(), not others.
* - Cleanup acomp_ctx resources on all cores in zswap_pool_destroy().
*/
-static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
+static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
{
+ u8 i;
+
if (IS_ERR_OR_NULL(acomp_ctx))
return;
@@ -269,7 +276,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
+ if (acomp_ctx->buffers) {
+ for (i = 0; i < nr_buffers; ++i)
+ kfree(acomp_ctx->buffers[i]);
+ kfree(acomp_ctx->buffers);
+ }
}
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
@@ -290,6 +301,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
return NULL;
}
+ /* Many things rely on the zero-initialization. */
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool)
return NULL;
@@ -352,13 +364,28 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto ref_fail;
INIT_LIST_HEAD(&pool->list);
+ /*
+ * Set the unit of compress batching for large folios, for quick
+ * retrieval in the zswap_compress() fast path:
+ * If the compressor is sequential (@pool->compr_batch_size is 1),
+ * large folios will be compressed in batches of ZSWAP_MAX_BATCH_SIZE
+ * pages, where each page in the batch is compressed sequentially.
+ * We see better performance by processing the folio in batches of
+ * ZSWAP_MAX_BATCH_SIZE, due to cache locality of working set
+ * structures.
+ */
+ pool->batch_size = (pool->compr_batch_size > 1) ?
+ pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
+
zswap_pool_debug("created", pool);
return pool;
ref_fail:
for_each_possible_cpu(cpu)
- acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
+ acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
+ pool->compr_batch_size);
+
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
@@ -417,7 +444,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
zswap_pool_debug("destroying", pool);
for_each_possible_cpu(cpu)
- acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
+ acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
+ pool->compr_batch_size);
free_percpu(pool->acomp_ctx);
@@ -876,6 +904,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
int ret = -ENOMEM;
+ u8 i;
/*
* The per-CPU pool->acomp_ctx is zero-initialized on allocation.
@@ -888,10 +917,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
return 0;
- acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return ret;
-
acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
pr_err("could not alloc crypto acomp %s : %ld\n",
@@ -904,17 +929,36 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
if (IS_ERR_OR_NULL(acomp_ctx->req)) {
pr_err("could not alloc crypto acomp_request %s\n",
- pool->tfm_name);
+ pool->tfm_name);
goto fail;
}
- crypto_init_wait(&acomp_ctx->wait);
+ /*
+ * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
+ * compressor supports batching.
+ */
+ pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
+ crypto_acomp_batch_size(acomp_ctx->acomp));
+
+ acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!acomp_ctx->buffers)
+ goto fail;
+
+ for (i = 0; i < pool->compr_batch_size; ++i) {
+ acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!acomp_ctx->buffers[i])
+ goto fail;
+ }
/*
* if the backend of acomp is async zip, crypto_req_done() will wakeup
* crypto_wait_req(); if the backend of acomp is scomp, the callback
* won't be called, crypto_wait_req() will return without blocking.
*/
+ crypto_init_wait(&acomp_ctx->wait);
+
acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
@@ -922,7 +966,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
return 0;
fail:
- acomp_ctx_dealloc(acomp_ctx);
+ acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
return ret;
}
@@ -942,7 +986,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
mutex_lock(&acomp_ctx->mutex);
- dst = acomp_ctx->buffer;
+ dst = acomp_ctx->buffers[0];
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -1003,19 +1047,19 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
mutex_lock(&acomp_ctx->mutex);
- obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
+ obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffers[0]);
/*
* zpool_obj_read_begin() might return a kmap address of highmem when
- * acomp_ctx->buffer is not used. However, sg_init_one() does not
- * handle highmem addresses, so copy the object to acomp_ctx->buffer.
+ * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
+ * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
*/
if (virt_addr_valid(obj)) {
src = obj;
} else {
- WARN_ON_ONCE(obj == acomp_ctx->buffer);
- memcpy(acomp_ctx->buffer, obj, entry->length);
- src = acomp_ctx->buffer;
+ WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
+ memcpy(acomp_ctx->buffers[0], obj, entry->length);
+ src = acomp_ctx->buffers[0];
}
sg_init_one(&input, src, entry->length);
--
2.27.0
Hi Kanchana,

[...]

> +	/*
> +	 * Set the unit of compress batching for large folios, for quick
> +	 * retrieval in the zswap_compress() fast path:
> +	 * If the compressor is sequential (@pool->compr_batch_size is 1),
> +	 * large folios will be compressed in batches of ZSWAP_MAX_BATCH_SIZE
> +	 * pages, where each page in the batch is compressed sequentially.
> +	 * We see better performance by processing the folio in batches of
> +	 * ZSWAP_MAX_BATCH_SIZE, due to cache locality of working set
> +	 * structures.
> +	 */
> +	pool->batch_size = (pool->compr_batch_size > 1) ?
> +			   pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
> +
> 	zswap_pool_debug("created", pool);
>
> 	return pool;

It’s hard to follow — you add batch_size and compr_batch_size in this
patch, but only use them in another. Could we merge the related changes
into one patch instead of splitting them into several that don’t work
independently?

> -
> 	acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> 	if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
> 		pr_err("could not alloc crypto acomp %s : %ld\n",
> @@ -904,17 +929,36 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> 	acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
> 	if (IS_ERR_OR_NULL(acomp_ctx->req)) {
> 		pr_err("could not alloc crypto acomp_request %s\n",
> -		       pool->tfm_name);
> +			pool->tfm_name);
> 		goto fail;
> 	}
>
> -	crypto_init_wait(&acomp_ctx->wait);
> +	/*
> +	 * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
> +	 * compressor supports batching.
> +	 */
> +	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> +				     crypto_acomp_batch_size(acomp_ctx->acomp));
> +
> +	acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
> +					  GFP_KERNEL, cpu_to_node(cpu));
> +	if (!acomp_ctx->buffers)
> +		goto fail;
> +
> +	for (i = 0; i < pool->compr_batch_size; ++i) {
> +		acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
> +						     cpu_to_node(cpu));
> +		if (!acomp_ctx->buffers[i])
> +			goto fail;
> +	}

It’s hard to follow — memory is allocated here but only used in another
patch. Could we merge the related changes into a single patch instead of
splitting them into several that don’t work independently?

>
> 	/*
> 	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
> 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
> 	 * won't be called, crypto_wait_req() will return without blocking.
> 	 */
> +	crypto_init_wait(&acomp_ctx->wait);
> +
> 	acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> 				   crypto_req_done, &acomp_ctx->wait);
>
> @@ -922,7 +966,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> 	return 0;
>
> fail:
> -	acomp_ctx_dealloc(acomp_ctx);
> +	acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
> 	return ret;
> }
>
> @@ -942,7 +986,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>
> 	mutex_lock(&acomp_ctx->mutex);
>
> -	dst = acomp_ctx->buffer;
> +	dst = acomp_ctx->buffers[0];
> 	sg_init_table(&input, 1);
> 	sg_set_page(&input, page, PAGE_SIZE, 0);
>
> @@ -1003,19 +1047,19 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>
> 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> 	mutex_lock(&acomp_ctx->mutex);
> -	obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
> +	obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffers[0]);
>
> 	/*
> 	 * zpool_obj_read_begin() might return a kmap address of highmem when
> -	 * acomp_ctx->buffer is not used. However, sg_init_one() does not
> -	 * handle highmem addresses, so copy the object to acomp_ctx->buffer.
> +	 * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
> +	 * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
> 	 */
> 	if (virt_addr_valid(obj)) {
> 		src = obj;
> 	} else {
> -		WARN_ON_ONCE(obj == acomp_ctx->buffer);
> -		memcpy(acomp_ctx->buffer, obj, entry->length);
> -		src = acomp_ctx->buffer;
> +		WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
> +		memcpy(acomp_ctx->buffers[0], obj, entry->length);
> +		src = acomp_ctx->buffers[0];

Hard to understand what is going on if related changes are not kept in
one self-contained patch.

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Monday, August 25, 2025 8:48 PM
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> Hi Kanchana,
>
> [...]
>
> > +	pool->batch_size = (pool->compr_batch_size > 1) ?
> > +			   pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
>
> It’s hard to follow — you add batch_size and compr_batch_size in this
> patch, but only use them in another. Could we merge the related changes
> into one patch instead of splitting them into several that don’t work
> independently?

Hi Barry,

Thanks for reviewing the code and for your comments! Sure, I can merge
this patch with the next one. I was trying to keep the changes modularized
to a) zswap_cpu_comp_prepare(), b) zswap_store() and c) zswap_compress()
so the changes are broken into smaller parts, but I can see how this can
make the changes appear disjointed.

One thing though: the commit logs for each of the patches will
also probably need to be merged, since I have tried to explain the
changes in detail.

Thanks,
Kanchana

> [...]
>
> It’s hard to follow — memory is allocated here but only used in another
> patch. Could we merge the related changes into a single patch instead of
> splitting them into several that don’t work independently?
>
> [...]
>
> Hard to understand what is going on if related changes are not kept in
> one self-contained patch.
>
> Thanks
> Barry
On Tue, Aug 26, 2025 at 12:27 PM Sridhar, Kanchana P
<kanchana.p.sridhar@intel.com> wrote:
>
> [...]
>
> Hi Barry,
>
> Thanks for reviewing the code and for your comments! Sure, I can merge
> this patch with the next one. I was trying to keep the changes modularized
> to a) zswap_cpu_comp_prepare(), b) zswap_store() and c) zswap_compress()
> so the changes are broken into smaller parts, but I can see how this can
> make the changes appear disjointed.
>
> One thing though: the commit logs for each of the patches will
> also probably need to be merged, since I have tried to explain the
> changes in detail.

It’s fine to merge the changelog and present the story as a whole. Do we
really need both pool->batch_size and pool->compr_batch_size? I assume
pool->batch_size = pool->compr_batch_size if HW supports batch; otherwise
pool->compr_batch_size = 1. It seems pool->compr_batch_size should either
be a bool or be dropped. If we drop it, you can still check whether HW
supports batch via crypto_acomp_batch_size() when doing compression:

if (crypto_acomp_batch_size() > 1)
	compress in steps of PAGE_SIZE;
else
	compress in steps of batch_size;

no ?

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Monday, August 25, 2025 9:42 PM
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> [...]
>
> It’s fine to merge the changelog and present the story as a whole. Do we

Sure.

> really need both pool->batch_size and pool->compr_batch_size? I assume
> pool->batch_size = pool->compr_batch_size if HW supports batch; otherwise
> pool->compr_batch_size = 1.

Actually not exactly. We have found value in compressing in batches of
ZSWAP_MAX_BATCH_SIZE even for software compressors. Latency benefits
from cache locality of working-set data. Hence the approach that we have
settled on is pool->batch_size = ZSWAP_MAX_BATCH_SIZE if the compressor
does not support batching (i.e., if pool->compr_batch_size is 1). If it
does, then pool->batch_size = pool->compr_batch_size.

Besides this, pool->compr_batch_size gives zswap_compress() the number of
acomp_ctx resources available, as distinct from the batch size.

> It seems pool->compr_batch_size should either
> be a bool or be dropped. If we drop it, you can still check whether HW
> supports batch via crypto_acomp_batch_size() when doing compression:
>
> if (crypto_acomp_batch_size() > 1)
> 	compress in steps of PAGE_SIZE;
> else
> 	compress in steps of batch_size;
>
> no ?

I could do this, but it will impact latency. As I was mentioning in an
earlier response to Nhat, this (keeping compr_batch_size and batch_size
distinct) was a small memory overhead per zswap_pool (one more u8 data
member), given that there are very few zswap pools. For this trade-off,
we are able to minimize computes in zswap_compress().

Thanks,
Kanchana
> > [...]
>
> > really need both pool->batch_size and pool->compr_batch_size? I assume
> > pool->batch_size = pool->compr_batch_size if HW supports batch; otherwise
> > pool->compr_batch_size = 1.
>
> Actually not exactly. We have found value in compressing in batches of
> ZSWAP_MAX_BATCH_SIZE even for software compressors. Latency benefits
> from cache locality of working-set data. Hence the approach that we have
> settled on is pool->batch_size = ZSWAP_MAX_BATCH_SIZE if the compressor
> does not support batching (i.e., if pool->compr_batch_size is 1). If it
> does, then pool->batch_size = pool->compr_batch_size.

I understand that even without a hardware batch, you can still
have some software batching that excludes compression.

However, based on the code below, it looks like
pool->compr_batch_size is almost always either equal to
pool->batch_size or 1:

+	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
+				     crypto_acomp_batch_size(acomp_ctx->acomp));

+	pool->batch_size = (pool->compr_batch_size > 1) ?
+			   pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;

It seems one of these two variables may be redundant.
For instance, no matter if pool->compr_batch_size > 1, could we always
treat batch_size as ZSWAP_MAX_BATCH_SIZE? If we remove pool->batch_size,
could we just use pool->compr_batch_size as the step size for compression,
no matter if pool->compr_batch_size > 1?

For example:

	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
				     crypto_acomp_batch_size(acomp_ctx->acomp));

Then batch the zswap store using ZSWAP_MAX_BATCH_SIZE, but set the
compression step to pool->compr_batch_size.

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Monday, August 25, 2025 10:17 PM
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> [...]
>
> I understand that even without a hardware batch, you can still
> have some software batching that excludes compression.
>
> However, based on the code below, it looks like
> pool->compr_batch_size is almost always either equal to
> pool->batch_size or 1:
>
> +	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> +				     crypto_acomp_batch_size(acomp_ctx->acomp));

I would like to explain some of the considerations in coming up with this
approach:

1) The compression algorithm gets to decide an optimal batch-size.
   For a hardware accelerator such as IAA, this value could be different
   than ZSWAP_MAX_BATCH_SIZE.

2) ZSWAP_MAX_BATCH_SIZE acts as a limiting factor to the # of acomp_ctx
   per-CPU resources that will be allocated in zswap_cpu_comp_prepare(),
   as per Yosry's suggestion. This helps limit the memory overhead for
   batching algorithms.

3) If a batching algorithm works with a batch size "X", where
   1 < X < ZSWAP_MAX_BATCH_SIZE, two things need to happen:

   a) We want to only allocate "X" per-CPU resources.

   b) We want to process the folio in batches of "X", not
      ZSWAP_MAX_BATCH_SIZE, to avail of the benefits of hardware
      parallelism. This is the compression step size you also mention.

   In particular, we cannot treat batch_size as ZSWAP_MAX_BATCH_SIZE,
   and send a batch of ZSWAP_MAX_BATCH_SIZE pages to zswap_compress()
   in this case. For example, what if the compress step-size is 6, but
   the new zswap_store_pages() introduced in patch 23 sends 8 pages to
   zswap_compress() because ZSWAP_MAX_BATCH_SIZE is set to 8?
   The code in zswap_compress() could get quite messy, which will impact
   latency.

> +	pool->batch_size = (pool->compr_batch_size > 1) ?
> +			   pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
>
> It seems one of these two variables may be redundant.
> For instance, no matter if pool->compr_batch_size > 1, could we always
> treat batch_size as ZSWAP_MAX_BATCH_SIZE? If we remove pool->batch_size,
> could we just use pool->compr_batch_size as the step size for compression,
> no matter if pool->compr_batch_size > 1?

To further explain the rationale for keeping these two distinct, we
statically compute the compress step-size by querying the algorithm in
zswap_cpu_comp_prepare() after the acomp_ctx->acomp has been created.
We store it in pool->compr_batch_size.

Next, in zswap_pool_create(), we do a one-time computation to determine
whether pool->batch_size needs to align with a batching acomp's
compr_batch_size, or, for a non-batching compressor, to be
ZSWAP_MAX_BATCH_SIZE.

This enables further code simplification/unification in zswap_compress(),
and quick retrieval of the # of available acomp_ctx batching resources to
set up the SG lists per call to crypto_acomp_compress(). IOW, memoization
for latency gains and unified code paths.

I hope this clarifies things further. Thanks again, these are all good
questions.

Best regards,
Kanchana

> for example:
> 	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> 				     crypto_acomp_batch_size(acomp_ctx->acomp));
>
> Then batch the zswap store using ZSWAP_MAX_BATCH_SIZE, but set the
> compression step to pool->compr_batch_size.
>
> Thanks
> Barry
On Wed, Aug 27, 2025 at 12:06 PM Sridhar, Kanchana P
<kanchana.p.sridhar@intel.com> wrote:
>
> [...]
>
> I would like to explain some of the considerations in coming up with this
> approach:
>
> 1) The compression algorithm gets to decide an optimal batch-size.
>    For a hardware accelerator such as IAA, this value could be different
>    than ZSWAP_MAX_BATCH_SIZE.
>
> 2) ZSWAP_MAX_BATCH_SIZE acts as a limiting factor to the # of acomp_ctx
>    per-CPU resources that will be allocated in zswap_cpu_comp_prepare(),
>    as per Yosry's suggestion. This helps limit the memory overhead for
>    batching algorithms.
>
> 3) If a batching algorithm works with a batch size "X", where
>    1 < X < ZSWAP_MAX_BATCH_SIZE, two things need to happen:
>
>    a) We want to only allocate "X" per-CPU resources.
>
>    b) We want to process the folio in batches of "X", not
>       ZSWAP_MAX_BATCH_SIZE, to avail of the benefits of hardware
>       parallelism. This is the compression step size you also mention.
>
>    In particular, we cannot treat batch_size as ZSWAP_MAX_BATCH_SIZE,
>    and send a batch of ZSWAP_MAX_BATCH_SIZE pages to zswap_compress()
>    in this case. For example, what if the compress step-size is 6, but
>    the new zswap_store_pages() introduced in patch 23 sends 8 pages to
>    zswap_compress() because ZSWAP_MAX_BATCH_SIZE is set to 8?
>    The code in zswap_compress() could get quite messy, which will impact
>    latency.

If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
compression is done with a step size of 1. If the hardware step size is 4,
compression occurs in two steps. If the hardware step size is 6, the first
compression uses a step size of 6, and the second uses a step size of 2.
Do you think this will work?

I don’t quite understand why you want to save
ZSWAP_MAX_BATCH_SIZE - X resources, since even without hardware batching
you are still allocating all ZSWAP_MAX_BATCH_SIZE resources. This is the
case for all platforms except yours.

Thanks
Barry
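For concreteness, the splitting Barry describes could look like the
following standalone sketch (an illustration only; compress_batch() is a
hypothetical stand-in for one zswap_compress() call covering that many
pages):

#include <stdio.h>

#define ZSWAP_MAX_BATCH_SIZE 8U

/* Hypothetical stand-in for one zswap_compress() call over 'chunk'
 * pages starting at 'start'. */
static void compress_batch(unsigned int start, unsigned int chunk)
{
	printf("compress pages [%u, %u)\n", start, start + chunk);
}

int main(void)
{
	unsigned int step = 6;	/* assumed hardware step size */
	unsigned int done = 0;

	/* Walk a fixed ZSWAP_MAX_BATCH_SIZE window in chunks of the
	 * compressor's step size: step 6 gives chunks of 6 and 2. */
	while (done < ZSWAP_MAX_BATCH_SIZE) {
		unsigned int chunk = ZSWAP_MAX_BATCH_SIZE - done;

		if (chunk > step)
			chunk = step;
		compress_batch(done, chunk);
		done += chunk;
	}
	return 0;
}

Kanchana's reply below argues that this uneven split (6 then 2) loses some
of the hardware's batch amortization, which is why the series instead
carves folios by the compressor's own batch size.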
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Thursday, August 28, 2025 2:40 PM
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> [...]
>
> If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
> compression is done with a step size of 1. If the hardware step size is 4,
> compression occurs in two steps. If the hardware step size is 6, the first
> compression uses a step size of 6, and the second uses a step size of 2.
> Do you think this will work?

Hi Barry,

This would be non-optimal from code simplicity and latency perspectives.
One of the benefits of using the hardware accelerator's "batch parallelism"
is cost amortization across the batch. We might lose this benefit if we make
multiple calls to zswap_compress() to ask the hardware accelerator to
batch compress in smaller batches. Compression throughput would also
be sub-optimal.

In my patch-series, the rule is simple: if an algorithm has specified a
batch-size, carve out the folio by that "batch-size" # of pages to be
compressed as a batch in zswap_compress(). This custom batch-size
is capped at ZSWAP_MAX_BATCH_SIZE.

If an algorithm has not specified a batch-size, the default batch-size
is 1. In this case, carve out the folio by ZSWAP_MAX_BATCH_SIZE
# of pages to be compressed as a batch in zswap_compress().

> I don’t quite understand why you want to save
> ZSWAP_MAX_BATCH_SIZE - X resources, since even without hardware batching
> you are still allocating all ZSWAP_MAX_BATCH_SIZE resources. This is the
> case for all platforms except yours.

Not sure I understand. Just to clarify, this is not done to save on
resources, rather for the reasons stated above.

We are already saving on resources by allocating only
"pool->compr_batch_size" number of resources
(*not* ZSWAP_MAX_BATCH_SIZE resources):

	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
				     crypto_acomp_batch_size(acomp_ctx->acomp));

For non-Intel platforms, this means only 1 dst buffer is allocated, as
explained in the commit log for this patch:

"A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
proceeds to allocate the necessary compression dst buffers in the
per-CPU acomp_ctx."

Thanks,
Kanchana
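A standalone sketch of the carve-out rule described above (illustrative
only; store_pages() is a hypothetical stand-in for the zswap_store_pages()
introduced in patch 23 of the series):

#include <stdio.h>

#define ZSWAP_MAX_BATCH_SIZE 8U

/* Hypothetical stand-in for zswap_store_pages(): store 'nr' pages of the
 * folio starting at 'index' as one compress batch. */
static void store_pages(unsigned int index, unsigned int nr)
{
	printf("batch: pages [%u, %u)\n", index, index + nr);
}

int main(void)
{
	unsigned int compr_batch_size = 1;	/* assumed: sequential compressor */
	unsigned int nr_pages = 20;		/* assumed: a 20-page folio */
	unsigned int batch_size, index;

	/* The rule: a batching compressor's own batch size is the unit;
	 * a sequential compressor still uses ZSWAP_MAX_BATCH_SIZE. */
	batch_size = compr_batch_size > 1 ? compr_batch_size
					  : ZSWAP_MAX_BATCH_SIZE;

	for (index = 0; index < nr_pages; index += batch_size) {
		unsigned int nr = nr_pages - index;

		if (nr > batch_size)
			nr = batch_size;
		store_pages(index, nr);
	}
	return 0;
}

A batching compressor then compresses each chunk in parallel, while a
sequential one walks it page by page, as described above.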
> > If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
> > compression is done with a step size of 1. If the hardware step size is 4,
> > compression occurs in two steps. If the hardware step size is 6, the first
> > compression uses a step size of 6, and the second uses a step size of 2.
> > Do you think this will work?
>
> Hi Barry,
>
> This would be non-optimal from code simplicity and latency perspectives.
> One of the benefits of using the hardware accelerator's "batch parallelism"
> is cost amortization across the batch. We might lose this benefit if we make
> multiple calls to zswap_compress() to ask the hardware accelerator to
> batch compress in smaller batches. Compression throughput would also
> be sub-optimal.

I guess it wouldn’t be an issue if both ZSWAP_MAX_BATCH_SIZE and
pool->compr_batch_size are powers of two. As you mentioned, we still
gain improvement with ZSWAP_MAX_BATCH_SIZE batching even when
pool->compr_batch_size == 1, by compressing pages one by one but
batching other work such as zswap_entries_cache_alloc_batch()?

> In my patch-series, the rule is simple: if an algorithm has specified a
> batch-size, carve out the folio by that "batch-size" # of pages to be
> compressed as a batch in zswap_compress(). This custom batch-size
> is capped at ZSWAP_MAX_BATCH_SIZE.
>
> If an algorithm has not specified a batch-size, the default batch-size
> is 1. In this case, carve out the folio by ZSWAP_MAX_BATCH_SIZE
> # of pages to be compressed as a batch in zswap_compress().

Yes, I understand your rule. However, having two global variables is still
somewhat confusing. It might be clearer to use a single variable with a
comment, since one variable can clearly determine the value of the other.

Can we get the batch_size at runtime based on pool->compr_batch_size?

/*
 * If hardware compression supports batching, we use the hardware step size.
 * Otherwise, we use ZSWAP_MAX_BATCH_SIZE for batching, but still compress
 * one page at a time.
 */
batch_size = pool->compr_batch_size > 1 ? pool->compr_batch_size :
	     ZSWAP_MAX_BATCH_SIZE;

We probably don’t need this if both pool->compr_batch_size and
ZSWAP_MAX_BATCH_SIZE are powers of two?

> Not sure I understand. Just to clarify, this is not done to save on
> resources, rather for the reasons stated above.
>
> We are already saving on resources by allocating only
> "pool->compr_batch_size" number of resources
> (*not* ZSWAP_MAX_BATCH_SIZE resources):
>
> 	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> 				     crypto_acomp_batch_size(acomp_ctx->acomp));
>
> For non-Intel platforms, this means only 1 dst buffer is allocated, as
> explained in the commit log for this patch.

you are right. I misunderstood your code :-)

> "A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
> Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
> compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
> proceeds to allocate the necessary compression dst buffers in the
> per-CPU acomp_ctx."

This is fine, but it still doesn’t provide a strong reason for having
two global variables when one can fully determine the value of the other.

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Thursday, August 28, 2025 4:29 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; nphamcs@gmail.com;
> chengming.zhou@linux.dev; usamaarif642@gmail.com;
> ryan.roberts@arm.com; ying.huang@linux.alibaba.com;
> akpm@linux-foundation.org; senozhatsky@chromium.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com;
> Accardi, Kristen C <kristen.c.accardi@intel.com>;
> Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> > > If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
> > > compression is done with a step size of 1. If the hardware step size is 4,
> > > compression occurs in two steps. If the hardware step size is 6, the first
> > > compression uses a step size of 6, and the second uses a step size of 2.
> > > Do you think this will work?
> >
> > Hi Barry,
> >
> > This would be non-optimal from code simplicity and latency perspectives.
> > One of the benefits of using the hardware accelerator's "batch parallelism"
> > is cost amortization across the batch. We might lose this benefit if we make
> > multiple calls to zswap_compress() to ask the hardware accelerator to
> > batch compress in smaller batches. Compression throughput would also
> > be sub-optimal.
>
> I guess it wouldn’t be an issue if both ZSWAP_MAX_BATCH_SIZE and
> pool->compr_batch_size are powers of two. As you mentioned, we still
> gain improvement with ZSWAP_MAX_BATCH_SIZE batching even when
> pool->compr_batch_size == 1, by compressing pages one by one but
> batching other work such as zswap_entries_cache_alloc_batch()?
>
> > In my patch-series, the rule is simple: if an algorithm has specified a
> > batch-size, carve out the folio by that "batch-size" # of pages to be
> > compressed as a batch in zswap_compress(). This custom batch-size
> > is capped at ZSWAP_MAX_BATCH_SIZE.
> >
> > If an algorithm has not specified a batch-size, the default batch-size
> > is 1. In this case, carve out the folio by ZSWAP_MAX_BATCH_SIZE
> > # of pages to be compressed as a batch in zswap_compress().
>
> Yes, I understand your rule. However, having two global variables is still
> somewhat confusing. It might be clearer to use a single variable with a
> comment, since one variable can clearly determine the value of the other.
>
> Can we get the batch_size at runtime based on pool->compr_batch_size?
>
> /*
>  * If hardware compression supports batching, we use the hardware step size.
>  * Otherwise, we use ZSWAP_MAX_BATCH_SIZE for batching, but still compress
>  * one page at a time.
>  */
> batch_size = pool->compr_batch_size > 1 ? pool->compr_batch_size :
>              ZSWAP_MAX_BATCH_SIZE;
>
> We probably don’t need this if both pool->compr_batch_size and
> ZSWAP_MAX_BATCH_SIZE are powers of two?

I am not sure I understand this rationale, but I do want to reiterate
that the patch-set implements a simple set of rules/design choices
to provide a batching framework for software and hardware compressors,
that has shown good performance improvements with both, while
unifying zswap_store()/zswap_compress() code paths for both.

As explained before, keeping the two variables as distinct u8 members
of struct zswap_pool is a design choice with these benefits:

1) Saves computes by avoiding computing this in performance-critical
   zswap_store() code. I have verified that dynamically computing the
   batch_size based on pool->compr_batch_size impacts latency.

2) The memory overhead is minimal: there is at most one zswap_pool
   active at any given time, other than at compressor transitions. The
   additional overhead is one u8, i.e., 1 byte for 1 runtime struct.

> > > I don’t quite understand why you want to save
> > > ZSWAP_MAX_BATCH_SIZE - X resources, since even without hardware
> > > batching you are still allocating all ZSWAP_MAX_BATCH_SIZE resources.
> > > This is the case for all platforms except yours.
> >
> > Not sure I understand... Just to clarify, this is not done to save on
> > resources, rather for the reasons stated above.
> >
> > We are already saving on resources by allocating only
> > "pool->compr_batch_size" number of resources
> > (*not* ZSWAP_MAX_BATCH_SIZE resources):
> >
> > pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> >                              crypto_acomp_batch_size(acomp_ctx->acomp));
> >
> > For non-Intel platforms, this means only 1 dst buffer is allocated, as
> > explained in the commit log for this patch.
>
> you are right. I misunderstood your code :-)
>
> > "A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
> > Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
> > compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
> > proceeds to allocate the necessary compression dst buffers in the
> > per-CPU acomp_ctx."
>
> This is fine, but it still doesn’t provide a strong reason for having
> two global variables when one can fully determine the value of the other.

Hopefully the above provides clarity.

Thanks,
Kanchana

>
> Thanks
> Barry
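To make the trade-off being debated concrete, these are the two
assignments from the patch itself (quoted in full later in this thread);
both are computed once at pool creation, which is the per-call work
Kanchana says she is avoiding in zswap_store():

/* From zswap_cpu_comp_prepare(): dst buffers actually allocated per CPU. */
pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
			     crypto_acomp_batch_size(acomp_ctx->acomp));

/* From zswap_pool_create(): the unit used to carve large folio stores. */
pool->batch_size = (pool->compr_batch_size > 1) ?
		   pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;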
On Fri, Aug 29, 2025 at 10:57 AM Sridhar, Kanchana P
<kanchana.p.sridhar@intel.com> wrote:
>
> > -----Original Message-----
> > From: Barry Song <21cnbao@gmail.com>
> > Sent: Thursday, August 28, 2025 4:29 PM
> > To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> > Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> > hannes@cmpxchg.org; yosry.ahmed@linux.dev; nphamcs@gmail.com;
> > chengming.zhou@linux.dev; usamaarif642@gmail.com;
> > ryan.roberts@arm.com; ying.huang@linux.alibaba.com;
> > akpm@linux-foundation.org; senozhatsky@chromium.org;
> > linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> > davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> > ebiggers@google.com; surenb@google.com;
> > Accardi, Kristen C <kristen.c.accardi@intel.com>;
> > Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> > <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> > Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> > if the compressor supports batching.
> >
> > > > If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
> > > > compression is done with a step size of 1. If the hardware step size is 4,
> > > > compression occurs in two steps. If the hardware step size is 6, the first
> > > > compression uses a step size of 6, and the second uses a step size of 2.
> > > > Do you think this will work?
> > >
> > > Hi Barry,
> > >
> > > This would be non-optimal from code simplicity and latency perspectives.
> > > One of the benefits of using the hardware accelerator's "batch parallelism"
> > > is cost amortization across the batch. We might lose this benefit if we make
> > > multiple calls to zswap_compress() to ask the hardware accelerator to
> > > batch compress in smaller batches. Compression throughput would also
> > > be sub-optimal.
> >
> > I guess it wouldn’t be an issue if both ZSWAP_MAX_BATCH_SIZE and
> > pool->compr_batch_size are powers of two. As you mentioned, we still
> > gain improvement with ZSWAP_MAX_BATCH_SIZE batching even when
> > pool->compr_batch_size == 1, by compressing pages one by one but
> > batching other work such as zswap_entries_cache_alloc_batch()?
> >
> > > In my patch-series, the rule is simple: if an algorithm has specified a
> > > batch-size, carve out the folio by that "batch-size" # of pages to be
> > > compressed as a batch in zswap_compress(). This custom batch-size
> > > is capped at ZSWAP_MAX_BATCH_SIZE.
> > >
> > > If an algorithm has not specified a batch-size, the default batch-size
> > > is 1. In this case, carve out the folio by ZSWAP_MAX_BATCH_SIZE
> > > # of pages to be compressed as a batch in zswap_compress().
> >
> > Yes, I understand your rule. However, having two global variables is still
> > somewhat confusing. It might be clearer to use a single variable with a
> > comment, since one variable can clearly determine the value of the other.
> >
> > Can we get the batch_size at runtime based on pool->compr_batch_size?
> >
> > /*
> >  * If hardware compression supports batching, we use the hardware step size.
> >  * Otherwise, we use ZSWAP_MAX_BATCH_SIZE for batching, but still compress
> >  * one page at a time.
> >  */
> > batch_size = pool->compr_batch_size > 1 ? pool->compr_batch_size :
> >              ZSWAP_MAX_BATCH_SIZE;
> >
> > We probably don’t need this if both pool->compr_batch_size and
> > ZSWAP_MAX_BATCH_SIZE are powers of two?
>
> I am not sure I understand this rationale, but I do want to reiterate
> that the patch-set implements a simple set of rules/design choices
> to provide a batching framework for software and hardware compressors,
> that has shown good performance improvements with both, while
> unifying zswap_store()/zswap_compress() code paths for both.

I’m really curious: if ZSWAP_MAX_BATCH_SIZE = 8 and
compr_batch_size = 4, why wouldn’t batch_size = 8 and
compr_batch_size = 4 perform better than batch_size = 4 and
compr_batch_size = 4?

In short, I’d like the case of compr_batch_size == 1 to be treated the same
as compr_batch_size == 2, 4, etc., since you can still see performance
improvements when ZSWAP_MAX_BATCH_SIZE = 8 and compr_batch_size == 1,
as batching occurs even outside compression.

Therefore, I would expect batch_size == 8 and compr_batch_size == 2 to
perform better than when both are 2.

The only thing preventing this from happening is that compr_batch_size
might be 5, 6, or 7, which are not powers of two?

> As explained before, keeping the two variables as distinct u8 members
> of struct zswap_pool is a design choice with these benefits:
>
> 1) Saves computes by avoiding computing this in performance-critical
>    zswap_store() code. I have verified that dynamically computing the
>    batch_size based on pool->compr_batch_size impacts latency.

Ok, I’m a bit surprised, since this small computation wouldn’t
cause a branch misprediction at all.

In any case, if you want to keep both variables, that’s fine.
But can we at least rename one of them? For example, use
pool->store_batch_size and pool->compr_batch_size instead of
pool->batch_size and pool->compr_batch_size, since pool->batch_size
generally has a broader semantic scope than compr_batch_size.

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Thursday, August 28, 2025 8:42 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; nphamcs@gmail.com;
> chengming.zhou@linux.dev; usamaarif642@gmail.com;
> ryan.roberts@arm.com; ying.huang@linux.alibaba.com;
> akpm@linux-foundation.org; senozhatsky@chromium.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com;
> Accardi, Kristen C <kristen.c.accardi@intel.com>;
> Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> On Fri, Aug 29, 2025 at 10:57 AM Sridhar, Kanchana P
> <kanchana.p.sridhar@intel.com> wrote:
> >
> > > -----Original Message-----
> > > From: Barry Song <21cnbao@gmail.com>
> > > Sent: Thursday, August 28, 2025 4:29 PM
> > > To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> > > Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> > > hannes@cmpxchg.org; yosry.ahmed@linux.dev; nphamcs@gmail.com;
> > > chengming.zhou@linux.dev; usamaarif642@gmail.com;
> > > ryan.roberts@arm.com; ying.huang@linux.alibaba.com;
> > > akpm@linux-foundation.org; senozhatsky@chromium.org;
> > > linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> > > davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> > > ebiggers@google.com; surenb@google.com;
> > > Accardi, Kristen C <kristen.c.accardi@intel.com>;
> > > Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> > > <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> > > Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching
> > > resources if the compressor supports batching.
> > >
> > > > > If ZSWAP_MAX_BATCH_SIZE is set to 8 and there is no hardware batching,
> > > > > compression is done with a step size of 1. If the hardware step size is 4,
> > > > > compression occurs in two steps. If the hardware step size is 6, the first
> > > > > compression uses a step size of 6, and the second uses a step size of 2.
> > > > > Do you think this will work?
> > > >
> > > > Hi Barry,
> > > >
> > > > This would be non-optimal from code simplicity and latency perspectives.
> > > > One of the benefits of using the hardware accelerator's "batch parallelism"
> > > > is cost amortization across the batch. We might lose this benefit if we make
> > > > multiple calls to zswap_compress() to ask the hardware accelerator to
> > > > batch compress in smaller batches. Compression throughput would also
> > > > be sub-optimal.
> > >
> > > I guess it wouldn’t be an issue if both ZSWAP_MAX_BATCH_SIZE and
> > > pool->compr_batch_size are powers of two. As you mentioned, we still
> > > gain improvement with ZSWAP_MAX_BATCH_SIZE batching even when
> > > pool->compr_batch_size == 1, by compressing pages one by one but
> > > batching other work such as zswap_entries_cache_alloc_batch()?
> > >
> > > > In my patch-series, the rule is simple: if an algorithm has specified a
> > > > batch-size, carve out the folio by that "batch-size" # of pages to be
> > > > compressed as a batch in zswap_compress(). This custom batch-size
> > > > is capped at ZSWAP_MAX_BATCH_SIZE.
> > > >
> > > > If an algorithm has not specified a batch-size, the default batch-size
> > > > is 1. In this case, carve out the folio by ZSWAP_MAX_BATCH_SIZE
> > > > # of pages to be compressed as a batch in zswap_compress().
> > >
> > > Yes, I understand your rule. However, having two global variables is still
> > > somewhat confusing. It might be clearer to use a single variable with a
> > > comment, since one variable can clearly determine the value of the other.
> > >
> > > Can we get the batch_size at runtime based on pool->compr_batch_size?
> > >
> > > /*
> > >  * If hardware compression supports batching, we use the hardware step size.
> > >  * Otherwise, we use ZSWAP_MAX_BATCH_SIZE for batching, but still compress
> > >  * one page at a time.
> > >  */
> > > batch_size = pool->compr_batch_size > 1 ? pool->compr_batch_size :
> > >              ZSWAP_MAX_BATCH_SIZE;
> > >
> > > We probably don’t need this if both pool->compr_batch_size and
> > > ZSWAP_MAX_BATCH_SIZE are powers of two?
> >
> > I am not sure I understand this rationale, but I do want to reiterate
> > that the patch-set implements a simple set of rules/design choices
> > to provide a batching framework for software and hardware compressors,
> > that has shown good performance improvements with both, while
> > unifying zswap_store()/zswap_compress() code paths for both.
>
> I’m really curious: if ZSWAP_MAX_BATCH_SIZE = 8 and
> compr_batch_size = 4, why wouldn’t batch_size = 8 and
> compr_batch_size = 4 perform better than batch_size = 4 and
> compr_batch_size = 4?
>
> In short, I’d like the case of compr_batch_size == 1 to be treated the same
> as compr_batch_size == 2, 4, etc., since you can still see performance
> improvements when ZSWAP_MAX_BATCH_SIZE = 8 and compr_batch_size == 1,
> as batching occurs even outside compression.
>
> Therefore, I would expect batch_size == 8 and compr_batch_size == 2 to
> perform better than when both are 2.
>
> The only thing preventing this from happening is that compr_batch_size
> might be 5, 6, or 7, which are not powers of two?

It would be interesting to see if a generalization of pool->compr_batch_size
being a factor "N" (where N > 1) of ZSWAP_MAX_BATCH_SIZE yields better
performance than the current set of rules. However, we would still need to
handle the case where it is not, as you mention, which might still necessitate
the use of a distinct pool->batch_size to avoid re-calculating this
dynamically, when this information doesn't change after pool creation.

The current implementation gives preference to the algorithm to determine
not just the batch compression step-size, but also the working-set size for
other zswap processing for the batch, i.e., bulk allocation of entries,
zpool writes, etc. The algorithm's batch-size is what zswap uses for the
latter (the zswap_store_pages() in my patch-set). This has been shown to
work well.

To change this design to be driven instead by ZSWAP_MAX_BATCH_SIZE
always (while handling non-factor pool->compr_batch_size) requires more
data gathering. I am inclined to keep the existing implementation and
we can continue to improve upon this if it's OK with you.

> > As explained before, keeping the two variables as distinct u8 members
> > of struct zswap_pool is a design choice with these benefits:
> >
> > 1) Saves computes by avoiding computing this in performance-critical
> >    zswap_store() code. I have verified that dynamically computing the
> >    batch_size based on pool->compr_batch_size impacts latency.
>
> Ok, I’m a bit surprised, since this small computation wouldn’t
> cause a branch misprediction at all.
>
> In any case, if you want to keep both variables, that’s fine.
> But can we at least rename one of them? For example, use
> pool->store_batch_size and pool->compr_batch_size instead of
> pool->batch_size and pool->compr_batch_size, since pool->batch_size
> generally has a broader semantic scope than compr_batch_size.

Sure. I will change pool->batch_size to be pool->store_batch_size.

Thanks,
Kanchana

>
> Thanks
> Barry
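For reference, the store-side carving rule the two are converging on would
look roughly like this. It is a condensed sketch, not the literal patch
code: zswap_store_pages() is the name Kanchana mentions above, but its
signature and the error handling here are assumptions.

/*
 * zswap_store() processes the folio in units of pool->batch_size
 * (pool->store_batch_size after the rename); each unit is handed to
 * zswap_store_pages() for entry allocation, compression and zpool writes.
 */
long nr_folio_pages = folio_nr_pages(folio);
long index;

for (index = 0; index < nr_folio_pages; index += pool->batch_size) {
	unsigned int nr_pages = min_t(long, pool->batch_size,
				      nr_folio_pages - index);

	if (!zswap_store_pages(folio, index, nr_pages, pool))
		goto store_failed;	/* unwind, assumed label */
}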
> > > I am not sure I understand this rationale, but I do want to reiterate
> > > that the patch-set implements a simple set of rules/design choices
> > > to provide a batching framework for software and hardware compressors,
> > > that has shown good performance improvements with both, while
> > > unifying zswap_store()/zswap_compress() code paths for both.
> >
> > I’m really curious: if ZSWAP_MAX_BATCH_SIZE = 8 and
> > compr_batch_size = 4, why wouldn’t batch_size = 8 and
> > compr_batch_size = 4 perform better than batch_size = 4 and
> > compr_batch_size = 4?
> >
> > In short, I’d like the case of compr_batch_size == 1 to be treated the same
> > as compr_batch_size == 2, 4, etc., since you can still see performance
> > improvements when ZSWAP_MAX_BATCH_SIZE = 8 and compr_batch_size == 1,
> > as batching occurs even outside compression.
> >
> > Therefore, I would expect batch_size == 8 and compr_batch_size == 2 to
> > perform better than when both are 2.
> >
> > The only thing preventing this from happening is that compr_batch_size
> > might be 5, 6, or 7, which are not powers of two?
>
> It would be interesting to see if a generalization of pool->compr_batch_size
> being a factor "N" (where N > 1) of ZSWAP_MAX_BATCH_SIZE yields better
> performance than the current set of rules. However, we would still need to
> handle the case where it is not, as you mention, which might still necessitate
> the use of a distinct pool->batch_size to avoid re-calculating this
> dynamically, when this information doesn't change after pool creation.
>
> The current implementation gives preference to the algorithm to determine
> not just the batch compression step-size, but also the working-set size for
> other zswap processing for the batch, i.e., bulk allocation of entries,
> zpool writes, etc. The algorithm's batch-size is what zswap uses for the
> latter (the zswap_store_pages() in my patch-set). This has been shown to
> work well.
>
> To change this design to be driven instead by ZSWAP_MAX_BATCH_SIZE
> always (while handling non-factor pool->compr_batch_size) requires more
> data gathering. I am inclined to keep the existing implementation and
> we can continue to improve upon this if it's OK with you.

Right, I have no objection at this stage. I’m just curious whether
batch_size == compr_batch_size is always the best rule, since some hardware
now supports HW compression with only one queue, and in the future may
increase to two or four queues but not many overall.

BTW, is HW compression always better than software? For example, when
kswapd, proactive reclamation, and direct reclamation all run simultaneously,
the CPU-based approach can leverage multiple CPUs to perform compression
in parallel. But if the hardware only provides a limited number of queues,
software might actually perform better. An extreme case is when multiple
threads are running MADV_PAGEOUT at the same time.

I’m not opposing your current patchset, just sharing some side thoughts :-)

Thanks
Barry
> -----Original Message-----
> From: Barry Song <21cnbao@gmail.com>
> Sent: Saturday, August 30, 2025 1:41 AM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; nphamcs@gmail.com;
> chengming.zhou@linux.dev; usamaarif642@gmail.com;
> ryan.roberts@arm.com; ying.huang@linux.alibaba.com;
> akpm@linux-foundation.org; senozhatsky@chromium.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com;
> Accardi, Kristen C <kristen.c.accardi@intel.com>;
> Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> > > > I am not sure I understand this rationale, but I do want to reiterate
> > > > that the patch-set implements a simple set of rules/design choices
> > > > to provide a batching framework for software and hardware compressors,
> > > > that has shown good performance improvements with both, while
> > > > unifying zswap_store()/zswap_compress() code paths for both.
> > >
> > > I’m really curious: if ZSWAP_MAX_BATCH_SIZE = 8 and
> > > compr_batch_size = 4, why wouldn’t batch_size = 8 and
> > > compr_batch_size = 4 perform better than batch_size = 4 and
> > > compr_batch_size = 4?
> > >
> > > In short, I’d like the case of compr_batch_size == 1 to be treated the same
> > > as compr_batch_size == 2, 4, etc., since you can still see performance
> > > improvements when ZSWAP_MAX_BATCH_SIZE = 8 and compr_batch_size == 1,
> > > as batching occurs even outside compression.
> > >
> > > Therefore, I would expect batch_size == 8 and compr_batch_size == 2 to
> > > perform better than when both are 2.
> > >
> > > The only thing preventing this from happening is that compr_batch_size
> > > might be 5, 6, or 7, which are not powers of two?
> >
> > It would be interesting to see if a generalization of pool->compr_batch_size
> > being a factor "N" (where N > 1) of ZSWAP_MAX_BATCH_SIZE yields better
> > performance than the current set of rules. However, we would still need to
> > handle the case where it is not, as you mention, which might still
> > necessitate the use of a distinct pool->batch_size to avoid re-calculating
> > this dynamically, when this information doesn't change after pool creation.
> >
> > The current implementation gives preference to the algorithm to determine
> > not just the batch compression step-size, but also the working-set size for
> > other zswap processing for the batch, i.e., bulk allocation of entries,
> > zpool writes, etc. The algorithm's batch-size is what zswap uses for the
> > latter (the zswap_store_pages() in my patch-set). This has been shown to
> > work well.
> >
> > To change this design to be driven instead by ZSWAP_MAX_BATCH_SIZE
> > always (while handling non-factor pool->compr_batch_size) requires more
> > data gathering. I am inclined to keep the existing implementation and
> > we can continue to improve upon this if it's OK with you.
>
> Right, I have no objection at this stage. I’m just curious whether
> batch_size == compr_batch_size is always the best rule, since some hardware
> now supports HW compression with only one queue, and in the future may
> increase to two or four queues but not many overall.
>
> BTW, is HW compression always better than software? For example, when
> kswapd, proactive reclamation, and direct reclamation all run simultaneously,
> the CPU-based approach can leverage multiple CPUs to perform compression
> in parallel. But if the hardware only provides a limited number of queues,
> software might actually perform better. An extreme case is when multiple
> threads are running MADV_PAGEOUT at the same time.

These are great questions that we'll need to run more experiments to
answer and understand the trade-offs. The good thing is the zswap
architecture proposed in this patch-set is flexible enough to allow us
to do so with minor changes in how we set up these two zswap_pool data
members (pool->compr_batch_size and pool->store_batch_size).

One of the next steps that we plan to explore is integrating batching
for hardware parallelism with the kcompressd work, as per Nhat's
suggestion. I believe hardware can be used with multiple compression
threads from all these sources. There are many details to be worked out,
but I think this can be done by striking the right balance between cost
amortization, hardware parallelism, and overlapping computes between the
CPU and the accelerator.

Another important point is that our hardware roadmap continues to
evolve, consistently improving compression ratios and lowering both
compression and decompression latency while boosting overall throughput.

To summarize: with hardware compression acceleration, and using batching
to avail of parallel hardware compress/decompress, we can improve the
reclaim latency for a given CPU thread. When combined with compression
ratio improvements, we save more memory for equivalent performance,
and/or need to reclaim less via proactive or direct reclaim (reclaim
latency improvements result in less memory pressure), which improves
workload performance. This can make a significant impact on contended
systems where CPU threads for reclaim come at a cost, and hardware
compression can help offset this. Hence, I can only see upside, but
we'll need to prove this out :).

Thanks,
Kanchana

>
> I’m not opposing your current patchset, just sharing some side thoughts :-)
>
> Thanks
> Barry
On Thu, Jul 31, 2025 at 9:36 PM Kanchana P Sridhar
<kanchana.p.sridhar@intel.com> wrote:
>
> This patch sets up zswap for allocating per-CPU resources optimally for
> non-batching and batching compressors.
>
> A new ZSWAP_MAX_BATCH_SIZE constant is defined as 8U, to set an upper
> limit on the number of pages in large folios that will be batch
> compressed.
>
> As per Herbert's comments in [2] in response to the
> crypto_acomp_batch_compress() and crypto_acomp_batch_decompress() API
> proposed in [1], this series does not create new crypto_acomp batching
> API. Instead, zswap compression batching uses the existing
> crypto_acomp_compress() API in combination with the "void *kernel_data"
> member added to "struct acomp_req" earlier in this series.
>
> It is up to the compressor to manage multiple requests, as needed, to
> accomplish batch parallelism. zswap only needs to allocate the per-CPU
> dst buffers according to the batch size supported by the compressor.
>
> A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
> Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
> compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
> proceeds to allocate the necessary compression dst buffers in the
> per-CPU acomp_ctx.
>
> Another "u8 batch_size" member is added to "struct zswap_pool" to store
> the unit for batching large folio stores: for batching compressors, this
> is the pool->compr_batch_size. For non-batching compressors, this is
> ZSWAP_MAX_BATCH_SIZE.
>
> zswap does not use more than one dst buffer yet. Follow-up patches will
> actually utilize the multiple acomp_ctx buffers for batch
> compression/decompression of multiple pages.
>
> Thus, ZSWAP_MAX_BATCH_SIZE limits the amount of extra memory used for
> batching. There is a small extra memory overhead of allocating
> the acomp_ctx->buffers array for compressors that do not support
> batching: On x86_64, the overhead is 1 pointer per-CPU (i.e. 8 bytes).
>
> [1]: https://patchwork.kernel.org/project/linux-mm/patch/20250508194134.28392-11-kanchana.p.sridhar@intel.com/
> [2]: https://patchwork.kernel.org/comment/26382610
>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>

Mostly LGTM. Just a couple of questions below:

> ---
>  mm/zswap.c | 82 +++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 63 insertions(+), 19 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index efd501a7fe294..63a997b999537 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -80,6 +80,9 @@ static bool zswap_pool_reached_full;
>
>  #define ZSWAP_PARAM_UNSET ""
>
> +/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
> +#define ZSWAP_MAX_BATCH_SIZE 8U
> +
>  static int zswap_setup(void);
>
>  /* Enable/disable zswap */
> @@ -147,7 +150,7 @@ struct crypto_acomp_ctx {
>         struct crypto_acomp *acomp;
>         struct acomp_req *req;
>         struct crypto_wait wait;
> -       u8 *buffer;
> +       u8 **buffers;
>         struct mutex mutex;
>         bool is_sleepable;
>  };
> @@ -166,6 +169,8 @@ struct zswap_pool {
>         struct work_struct release_work;
>         struct hlist_node node;
>         char tfm_name[CRYPTO_MAX_ALG_NAME];
> +       u8 compr_batch_size;
> +       u8 batch_size;

Apologies if this is explained elsewhere, but I'm very confused - why
do we need both of these two fields?

Seems like batch_size is defined below, and never changed:

pool->batch_size = (pool->compr_batch_size > 1) ?
pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;

Can we just determine this in zswap_store() as a local variable?

>  };
>
>  /* Global LRU lists shared by all zswap pools. */
> @@ -258,8 +263,10 @@ static void __zswap_pool_empty(struct percpu_ref *ref);
>   * zswap_cpu_comp_prepare(), not others.
>   * - Cleanup acomp_ctx resources on all cores in zswap_pool_destroy().
>   */
> -static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> +static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
>  {
> +       u8 i;
> +
>         if (IS_ERR_OR_NULL(acomp_ctx))
>                 return;
>
> @@ -269,7 +276,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
>         if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
>                 crypto_free_acomp(acomp_ctx->acomp);
>
> -       kfree(acomp_ctx->buffer);
> +       if (acomp_ctx->buffers) {
> +               for (i = 0; i < nr_buffers; ++i)
> +                       kfree(acomp_ctx->buffers[i]);
> +               kfree(acomp_ctx->buffers);
> +       }
>  }
>
>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> @@ -290,6 +301,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>                 return NULL;
>         }
>
> +       /* Many things rely on the zero-initialization. */
>         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
>         if (!pool)
>                 return NULL;
> @@ -352,13 +364,28 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>                 goto ref_fail;
>         INIT_LIST_HEAD(&pool->list);
>
> +       /*
> +        * Set the unit of compress batching for large folios, for quick
> +        * retrieval in the zswap_compress() fast path:
> +        * If the compressor is sequential (@pool->compr_batch_size is 1),
> +        * large folios will be compressed in batches of ZSWAP_MAX_BATCH_SIZE
> +        * pages, where each page in the batch is compressed sequentially.
> +        * We see better performance by processing the folio in batches of
> +        * ZSWAP_MAX_BATCH_SIZE, due to cache locality of working set
> +        * structures.
> +        */
> +       pool->batch_size = (pool->compr_batch_size > 1) ?
> +                          pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
> +
>         zswap_pool_debug("created", pool);
>
>         return pool;
>
>  ref_fail:
>         for_each_possible_cpu(cpu)
> -               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> +               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> +                                 pool->compr_batch_size);
> +
>  error:
>         if (pool->acomp_ctx)
>                 free_percpu(pool->acomp_ctx);
> @@ -417,7 +444,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
>         zswap_pool_debug("destroying", pool);
>
>         for_each_possible_cpu(cpu)
> -               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> +               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> +                                 pool->compr_batch_size);
>
>         free_percpu(pool->acomp_ctx);
>
> @@ -876,6 +904,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
>         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
>         int ret = -ENOMEM;
> +       u8 i;
>
>         /*
>          * The per-CPU pool->acomp_ctx is zero-initialized on allocation.
> @@ -888,10 +917,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>         if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
>                 return 0;
>
> -       acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
> -       if (!acomp_ctx->buffer)
> -               return ret;
> -
>         acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
>         if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
>                 pr_err("could not alloc crypto acomp %s : %ld\n",
> @@ -904,17 +929,36 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>         acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
>         if (IS_ERR_OR_NULL(acomp_ctx->req)) {
>                 pr_err("could not alloc crypto acomp_request %s\n",
> -                      pool->tfm_name);
> +                       pool->tfm_name);

Is this intentional? :)

>                 goto fail;
>         }
>
> -       crypto_init_wait(&acomp_ctx->wait);
> +       /*
> +        * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
> +        * compressor supports batching.
> +        */
> +       pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> +                                    crypto_acomp_batch_size(acomp_ctx->acomp));
> +
> +       acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
> +                                         GFP_KERNEL, cpu_to_node(cpu));
> +       if (!acomp_ctx->buffers)
> +               goto fail;
> +
> +       for (i = 0; i < pool->compr_batch_size; ++i) {
> +               acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
> +                                                    cpu_to_node(cpu));
> +               if (!acomp_ctx->buffers[i])
> +                       goto fail;
> +       }
>
>         /*
>          * if the backend of acomp is async zip, crypto_req_done() will wakeup
>          * crypto_wait_req(); if the backend of acomp is scomp, the callback
>          * won't be called, crypto_wait_req() will return without blocking.
>          */
> +       crypto_init_wait(&acomp_ctx->wait);
> +
>         acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
>                                    crypto_req_done, &acomp_ctx->wait);
>
> @@ -922,7 +966,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>         return 0;
>
>  fail:
> -       acomp_ctx_dealloc(acomp_ctx);
> +       acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
>         return ret;
>  }
>
> @@ -942,7 +986,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>
>         mutex_lock(&acomp_ctx->mutex);
>
> -       dst = acomp_ctx->buffer;
> +       dst = acomp_ctx->buffers[0];
>         sg_init_table(&input, 1);
>         sg_set_page(&input, page, PAGE_SIZE, 0);
>
> @@ -1003,19 +1047,19 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>
>         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>         mutex_lock(&acomp_ctx->mutex);
> -       obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
> +       obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffers[0]);
>
>         /*
>          * zpool_obj_read_begin() might return a kmap address of highmem when
> -        * acomp_ctx->buffer is not used. However, sg_init_one() does not
> -        * handle highmem addresses, so copy the object to acomp_ctx->buffer.
> +        * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
> +        * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
>          */
>         if (virt_addr_valid(obj)) {
>                 src = obj;
>         } else {
> -               WARN_ON_ONCE(obj == acomp_ctx->buffer);
> -               memcpy(acomp_ctx->buffer, obj, entry->length);
> -               src = acomp_ctx->buffer;
> +               WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
> +               memcpy(acomp_ctx->buffers[0], obj, entry->length);
> +               src = acomp_ctx->buffers[0];
>         }
>
>         sg_init_one(&input, src, entry->length);
> --
> 2.27.0
>
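For concreteness, the alternative Nhat floats above (deriving the store
unit per call instead of caching it) would look roughly like the sketch
below. zswap_store_unit() is a hypothetical helper name, not code from
the series:

/* Hypothetical sketch of Nhat's suggestion. */
static inline u8 zswap_store_unit(const struct zswap_pool *pool)
{
	return pool->compr_batch_size > 1 ? pool->compr_batch_size :
					    ZSWAP_MAX_BATCH_SIZE;
}

Kanchana's reply below explains why the series caches the result in
struct zswap_pool instead: the value never changes after pool creation,
and she measured the per-call computation in zswap_store() as a latency
hit.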
> -----Original Message-----
> From: Nhat Pham <nphamcs@gmail.com>
> Sent: Thursday, August 14, 2025 1:58 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; chengming.zhou@linux.dev;
> usamaarif642@gmail.com; ryan.roberts@arm.com; 21cnbao@gmail.com;
> ying.huang@linux.alibaba.com; akpm@linux-foundation.org;
> senozhatsky@chromium.org; linux-crypto@vger.kernel.org;
> herbert@gondor.apana.org.au; davem@davemloft.net;
> clabbe@baylibre.com; ardb@kernel.org; ebiggers@google.com;
> surenb@google.com; Accardi, Kristen C <kristen.c.accardi@intel.com>;
> Gomes, Vinicius <vinicius.gomes@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v11 22/24] mm: zswap: Allocate pool batching resources
> if the compressor supports batching.
>
> On Thu, Jul 31, 2025 at 9:36 PM Kanchana P Sridhar
> <kanchana.p.sridhar@intel.com> wrote:
> >
> > This patch sets up zswap for allocating per-CPU resources optimally for
> > non-batching and batching compressors.
> >
> > A new ZSWAP_MAX_BATCH_SIZE constant is defined as 8U, to set an upper
> > limit on the number of pages in large folios that will be batch
> > compressed.
> >
> > As per Herbert's comments in [2] in response to the
> > crypto_acomp_batch_compress() and crypto_acomp_batch_decompress() API
> > proposed in [1], this series does not create new crypto_acomp batching
> > API. Instead, zswap compression batching uses the existing
> > crypto_acomp_compress() API in combination with the "void *kernel_data"
> > member added to "struct acomp_req" earlier in this series.
> >
> > It is up to the compressor to manage multiple requests, as needed, to
> > accomplish batch parallelism. zswap only needs to allocate the per-CPU
> > dst buffers according to the batch size supported by the compressor.
> >
> > A "u8 compr_batch_size" member is added to "struct zswap_pool", as per
> > Yosry's suggestion. pool->compr_batch_size is set as the minimum of the
> > compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, it
> > proceeds to allocate the necessary compression dst buffers in the
> > per-CPU acomp_ctx.
> >
> > Another "u8 batch_size" member is added to "struct zswap_pool" to store
> > the unit for batching large folio stores: for batching compressors, this
> > is the pool->compr_batch_size. For non-batching compressors, this is
> > ZSWAP_MAX_BATCH_SIZE.
> >
> > zswap does not use more than one dst buffer yet. Follow-up patches will
> > actually utilize the multiple acomp_ctx buffers for batch
> > compression/decompression of multiple pages.
> >
> > Thus, ZSWAP_MAX_BATCH_SIZE limits the amount of extra memory used for
> > batching. There is a small extra memory overhead of allocating
> > the acomp_ctx->buffers array for compressors that do not support
> > batching: On x86_64, the overhead is 1 pointer per-CPU (i.e. 8 bytes).
> >
> > [1]: https://patchwork.kernel.org/project/linux-mm/patch/20250508194134.28392-11-kanchana.p.sridhar@intel.com/
> > [2]: https://patchwork.kernel.org/comment/26382610
> >
> > Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
>
> Mostly LGTM. Just a couple of questions below:

Hi Nhat,

Thanks for taking the time to review the patches! Sure, these are great
questions, responses are inline.

> > ---
> >  mm/zswap.c | 82 +++++++++++++++++++++++++++++++++++++++++-------------
> >  1 file changed, 63 insertions(+), 19 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index efd501a7fe294..63a997b999537 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -80,6 +80,9 @@ static bool zswap_pool_reached_full;
> >
> >  #define ZSWAP_PARAM_UNSET ""
> >
> > +/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
> > +#define ZSWAP_MAX_BATCH_SIZE 8U
> > +
> >  static int zswap_setup(void);
> >
> >  /* Enable/disable zswap */
> > @@ -147,7 +150,7 @@ struct crypto_acomp_ctx {
> >         struct crypto_acomp *acomp;
> >         struct acomp_req *req;
> >         struct crypto_wait wait;
> > -       u8 *buffer;
> > +       u8 **buffers;
> >         struct mutex mutex;
> >         bool is_sleepable;
> >  };
> > @@ -166,6 +169,8 @@ struct zswap_pool {
> >         struct work_struct release_work;
> >         struct hlist_node node;
> >         char tfm_name[CRYPTO_MAX_ALG_NAME];
> > +       u8 compr_batch_size;
> > +       u8 batch_size;
>
> Apologies if this is explained elsewhere, but I'm very confused - why
> do we need both of these two fields?

No worries. This was my thinking in keeping these separate:

"compr_batch_size" is indicative of the number of batching resources
allocated per-CPU. Hence, zswap_compress() uses this to determine if we
need to compress one page at a time in the input batch of pages.

"batch_size" represents the number of pages that will be sent to
zswap_compress() as a batch.

> Seems like batch_size is defined below, and never changed:
>
> pool->batch_size = (pool->compr_batch_size > 1) ?
> pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
>
> Can we just determine this in zswap_store() as a local variable?

I figured since the number of zswap_pools at any given time is less than
or equal to 2 (IIRC), it should be a good compromise to add these two u8
members for latency reasons, so that this doesn't have to be computed
per call to zswap_store().

> >  };
> >
> >  /* Global LRU lists shared by all zswap pools. */
> > @@ -258,8 +263,10 @@ static void __zswap_pool_empty(struct percpu_ref *ref);
> >   * zswap_cpu_comp_prepare(), not others.
> >   * - Cleanup acomp_ctx resources on all cores in zswap_pool_destroy().
> >   */
> > -static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> > +static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
> >  {
> > +       u8 i;
> > +
> >         if (IS_ERR_OR_NULL(acomp_ctx))
> >                 return;
> >
> > @@ -269,7 +276,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> >         if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> >                 crypto_free_acomp(acomp_ctx->acomp);
> >
> > -       kfree(acomp_ctx->buffer);
> > +       if (acomp_ctx->buffers) {
> > +               for (i = 0; i < nr_buffers; ++i)
> > +                       kfree(acomp_ctx->buffers[i]);
> > +               kfree(acomp_ctx->buffers);
> > +       }
> >  }
> >
> >  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> > @@ -290,6 +301,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >                 return NULL;
> >         }
> >
> > +       /* Many things rely on the zero-initialization. */
> >         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
> >         if (!pool)
> >                 return NULL;
> > @@ -352,13 +364,28 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >                 goto ref_fail;
> >         INIT_LIST_HEAD(&pool->list);
> >
> > +       /*
> > +        * Set the unit of compress batching for large folios, for quick
> > +        * retrieval in the zswap_compress() fast path:
> > +        * If the compressor is sequential (@pool->compr_batch_size is 1),
> > +        * large folios will be compressed in batches of ZSWAP_MAX_BATCH_SIZE
> > +        * pages, where each page in the batch is compressed sequentially.
> > +        * We see better performance by processing the folio in batches of
> > +        * ZSWAP_MAX_BATCH_SIZE, due to cache locality of working set
> > +        * structures.
> > +        */
> > +       pool->batch_size = (pool->compr_batch_size > 1) ?
> > +                          pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
> > +
> >         zswap_pool_debug("created", pool);
> >
> >         return pool;
> >
> >  ref_fail:
> >         for_each_possible_cpu(cpu)
> > -               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> > +               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> > +                                 pool->compr_batch_size);
> > +
> >  error:
> >         if (pool->acomp_ctx)
> >                 free_percpu(pool->acomp_ctx);
> > @@ -417,7 +444,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> >         zswap_pool_debug("destroying", pool);
> >
> >         for_each_possible_cpu(cpu)
> > -               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> > +               acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> > +                                 pool->compr_batch_size);
> >
> >         free_percpu(pool->acomp_ctx);
> >
> > @@ -876,6 +904,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> >         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> >         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> >         int ret = -ENOMEM;
> > +       u8 i;
> >
> >         /*
> >          * The per-CPU pool->acomp_ctx is zero-initialized on allocation.
> > @@ -888,10 +917,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> >         if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> >                 return 0;
> >
> > -       acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
> > -       if (!acomp_ctx->buffer)
> > -               return ret;
> > -
> >         acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> >         if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
> >                 pr_err("could not alloc crypto acomp %s : %ld\n",
> > @@ -904,17 +929,36 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> >         acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
> >         if (IS_ERR_OR_NULL(acomp_ctx->req)) {
> >                 pr_err("could not alloc crypto acomp_request %s\n",
> > -                      pool->tfm_name);
> > +                       pool->tfm_name);
>
> Is this intentional? :)

Yes, it is indeed :). I can revert it if you prefer.

Thanks,
Kanchana

> >                 goto fail;
> >         }
> >
> > -       crypto_init_wait(&acomp_ctx->wait);
> > +       /*
> > +        * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
> > +        * compressor supports batching.
> > +        */
> > +       pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> > +                                    crypto_acomp_batch_size(acomp_ctx->acomp));
> > +
> > +       acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
> > +                                         GFP_KERNEL, cpu_to_node(cpu));
> > +       if (!acomp_ctx->buffers)
> > +               goto fail;
> > +
> > +       for (i = 0; i < pool->compr_batch_size; ++i) {
> > +               acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
> > +                                                    cpu_to_node(cpu));
> > +               if (!acomp_ctx->buffers[i])
> > +                       goto fail;
> > +       }
> >
> >         /*
> >          * if the backend of acomp is async zip, crypto_req_done() will wakeup
> >          * crypto_wait_req(); if the backend of acomp is scomp, the callback
> >          * won't be called, crypto_wait_req() will return without blocking.
> >          */
> > +       crypto_init_wait(&acomp_ctx->wait);
> > +
> >         acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> >                                    crypto_req_done, &acomp_ctx->wait);
> >
> > @@ -922,7 +966,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> >         return 0;
> >
> >  fail:
> > -       acomp_ctx_dealloc(acomp_ctx);
> > +       acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
> >         return ret;
> >  }
> >
> > @@ -942,7 +986,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> >
> >         mutex_lock(&acomp_ctx->mutex);
> >
> > -       dst = acomp_ctx->buffer;
> > +       dst = acomp_ctx->buffers[0];
> >         sg_init_table(&input, 1);
> >         sg_set_page(&input, page, PAGE_SIZE, 0);
> >
> > @@ -1003,19 +1047,19 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> >
> >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> >         mutex_lock(&acomp_ctx->mutex);
> > -       obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
> > +       obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffers[0]);
> >
> >         /*
> >          * zpool_obj_read_begin() might return a kmap address of highmem when
> > -        * acomp_ctx->buffer is not used. However, sg_init_one() does not
> > -        * handle highmem addresses, so copy the object to acomp_ctx->buffer.
> > +        * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
> > +        * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
> >          */
> >         if (virt_addr_valid(obj)) {
> >                 src = obj;
> >         } else {
> > -               WARN_ON_ONCE(obj == acomp_ctx->buffer);
> > -               memcpy(acomp_ctx->buffer, obj, entry->length);
> > -               src = acomp_ctx->buffer;
> > +               WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
> > +               memcpy(acomp_ctx->buffers[0], obj, entry->length);
> > +               src = acomp_ctx->buffers[0];
> >         }
> >
> >         sg_init_one(&input, src, entry->length);
> > --
> > 2.27.0
> >
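One detail worth pulling out of the hunks quoted above: acomp_ctx_dealloc()
takes an explicit nr_buffers argument because the buffers array is sized
from pool->compr_batch_size at setup, so teardown must free exactly that
many entries. Condensed from the patch:

/* Setup (zswap_cpu_comp_prepare()): the array length is the batch size. */
acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
				  GFP_KERNEL, cpu_to_node(cpu));

/* Teardown (zswap_pool_destroy()): pass the same count back in. */
for_each_possible_cpu(cpu)
	acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
			  pool->compr_batch_size);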