[v1] zswap IAA compress batching

[RFC PATCH v1 10/13] mm: zswap: Create multiple reqs/buffers in crypto_acomp_ctx if platform has IAA.

Posted by Kanchana P Sridhar 1 year, 3 months ago

Intel IAA hardware acceleration can be used effectively to improve the
zswap_store() performance of large folios by batching multiple pages in a
folio to be compressed in parallel by IAA. Hence, to build compress batching
of zswap large folio stores using IAA, we need to be able to submit a batch
of compress jobs from zswap to the hardware to compress in parallel if the
iaa_crypto "async" mode is used.

The IAA compress batching paradigm works as follows:

 1) Submit N crypto_acomp_compress() jobs using N requests.
 2) Use the iaa_crypto driver async poll() method to check for the jobs
    to complete.
 3) There are no ordering constraints implied by submission, hence we
    could loop through the requests and process any job that has
    completed.
 4) This would repeat until all jobs have completed with success/error
    status.

To facilitate this, we need to provide for multiple acomp_reqs in
"struct crypto_acomp_ctx", each representing a distinct compress
job. Likewise, there needs to be a distinct destination buffer
corresponding to each acomp_req.

If CONFIG_ZSWAP_STORE_BATCHING_ENABLED is enabled, this patch will set the
SWAP_CRYPTO_SUB_BATCH_SIZE constant to 8UL. This implies each per-cpu
crypto_acomp_ctx associated with the zswap_pool can submit up to 8
acomp_reqs at a time to accomplish parallel compressions.

If IAA is not present and/or CONFIG_ZSWAP_STORE_BATCHING_ENABLED is not
set, SWAP_CRYPTO_SUB_BATCH_SIZE will be set to 1UL.

On an Intel Sapphire Rapids server, each socket has 4 IAA, each of which
has 2 compress engines and 8 decompress engines. Experiments modeling a
contended system with say 72 processes running under a cgroup with a fixed
memory-limit, have shown that there is a significant performance
improvement with dispatching compress jobs from all cores to all the
IAA devices on the socket. Hence, SWAP_CRYPTO_SUB_BATCH_SIZE is set to
8 to maximize compression throughput if IAA is available.

The definition of "struct crypto_acomp_ctx" is modified to make the
req/buffer be arrays of size SWAP_CRYPTO_SUB_BATCH_SIZE. Thus, the
added memory footprint cost of this per-cpu structure for batching is
incurred only for platforms that have Intel IAA.

Suggested-by: Ying Huang <ying.huang@intel.com>
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
 mm/swap.h  |  11 ++++++
 mm/zswap.c | 104 ++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 78 insertions(+), 37 deletions(-)

diff --git a/mm/swap.h b/mm/swap.h
index ad2f121de970..566616c971d4 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -8,6 +8,17 @@ struct mempolicy;
 #include <linux/swapops.h> /* for swp_offset */
 #include <linux/blk_types.h> /* for bio_end_io_t */
 
+/*
+ * For IAA compression batching:
+ * Maximum number of IAA acomp compress requests that will be processed
+ * in a sub-batch.
+ */
+#if defined(CONFIG_ZSWAP_STORE_BATCHING_ENABLED)
+#define SWAP_CRYPTO_SUB_BATCH_SIZE 8UL
+#else
+#define SWAP_CRYPTO_SUB_BATCH_SIZE 1UL
+#endif
+
 /* linux/mm/page_io.c */
 int sio_pool_init(void);
 struct swap_iocb;
diff --git a/mm/zswap.c b/mm/zswap.c
index 4893302d8c34..579869d1bdf6 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -152,9 +152,9 @@ bool zswap_never_enabled(void)
 
 struct crypto_acomp_ctx {
 	struct crypto_acomp *acomp;
-	struct acomp_req *req;
+	struct acomp_req *req[SWAP_CRYPTO_SUB_BATCH_SIZE];
+	u8 *buffer[SWAP_CRYPTO_SUB_BATCH_SIZE];
 	struct crypto_wait wait;
-	u8 *buffer;
 	struct mutex mutex;
 	bool is_sleepable;
 };
@@ -832,49 +832,64 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
 	struct crypto_acomp *acomp;
-	struct acomp_req *req;
 	int ret;
+	int i, j;
 
 	mutex_init(&acomp_ctx->mutex);
 
-	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
-	if (!acomp_ctx->buffer)
-		return -ENOMEM;
-
 	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
 	if (IS_ERR(acomp)) {
 		pr_err("could not alloc crypto acomp %s : %ld\n",
 				pool->tfm_name, PTR_ERR(acomp));
-		ret = PTR_ERR(acomp);
-		goto acomp_fail;
+		return PTR_ERR(acomp);
 	}
 	acomp_ctx->acomp = acomp;
 	acomp_ctx->is_sleepable = acomp_is_async(acomp);
 
-	req = acomp_request_alloc(acomp_ctx->acomp);
-	if (!req) {
-		pr_err("could not alloc crypto acomp_request %s\n",
-		       pool->tfm_name);
-		ret = -ENOMEM;
-		goto req_fail;
+	for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i) {
+		acomp_ctx->buffer[i] = kmalloc_node(PAGE_SIZE * 2,
+						GFP_KERNEL, cpu_to_node(cpu));
+		if (!acomp_ctx->buffer[i]) {
+			for (j = 0; j < i; ++j)
+				kfree(acomp_ctx->buffer[j]);
+			ret = -ENOMEM;
+			goto buf_fail;
+		}
+	}
+
+	for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i) {
+		acomp_ctx->req[i] = acomp_request_alloc(acomp_ctx->acomp);
+		if (!acomp_ctx->req[i]) {
+			pr_err("could not alloc crypto acomp_request req[%d] %s\n",
+			       i, pool->tfm_name);
+			for (j = 0; j < i; ++j)
+				acomp_request_free(acomp_ctx->req[j]);
+			ret = -ENOMEM;
+			goto req_fail;
+		}
 	}
-	acomp_ctx->req = req;
 
+	/*
+	 * The crypto_wait is used only in fully synchronous, i.e., with scomp
+	 * or non-poll mode of acomp, hence there is only one "wait" per
+	 * acomp_ctx, with callback set to req[0].
+	 */
 	crypto_init_wait(&acomp_ctx->wait);
 	/*
 	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
 	 * won't be called, crypto_wait_req() will return without blocking.
 	 */
-	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+	acomp_request_set_callback(acomp_ctx->req[0], CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   crypto_req_done, &acomp_ctx->wait);
 
 	return 0;
 
 req_fail:
+	for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
+		kfree(acomp_ctx->buffer[i]);
+buf_fail:
 	crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
-	kfree(acomp_ctx->buffer);
 	return ret;
 }
 
@@ -884,11 +899,17 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
 
 	if (!IS_ERR_OR_NULL(acomp_ctx)) {
-		if (!IS_ERR_OR_NULL(acomp_ctx->req))
-			acomp_request_free(acomp_ctx->req);
+		int i;
+
+		for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
+			if (!IS_ERR_OR_NULL(acomp_ctx->req[i]))
+				acomp_request_free(acomp_ctx->req[i]);
+
+		for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
+			kfree(acomp_ctx->buffer[i]);
+
 		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
 			crypto_free_acomp(acomp_ctx->acomp);
-		kfree(acomp_ctx->buffer);
 	}
 
 	return 0;
@@ -911,7 +932,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 
 	mutex_lock(&acomp_ctx->mutex);
 
-	dst = acomp_ctx->buffer;
+	dst = acomp_ctx->buffer[0];
 	sg_init_table(&input, 1);
 	sg_set_page(&input, page, PAGE_SIZE, 0);
 
@@ -921,7 +942,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	 * giving the dst buffer with enough length to avoid buffer overflow.
 	 */
 	sg_init_one(&output, dst, PAGE_SIZE * 2);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+	acomp_request_set_params(acomp_ctx->req[0], &input, &output, PAGE_SIZE, dlen);
 
 	/*
 	 * If the crypto_acomp provides an asynchronous poll() interface,
@@ -940,19 +961,20 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	 * parallel.
 	 */
 	if (acomp_ctx->acomp->poll) {
-		comp_ret = crypto_acomp_compress(acomp_ctx->req);
+		comp_ret = crypto_acomp_compress(acomp_ctx->req[0]);
 		if (comp_ret == -EINPROGRESS) {
 			do {
-				comp_ret = crypto_acomp_poll(acomp_ctx->req);
+				comp_ret = crypto_acomp_poll(acomp_ctx->req[0]);
 				if (comp_ret && comp_ret != -EAGAIN)
 					break;
 			} while (comp_ret);
 		}
 	} else {
-		comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+		comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req[0]),
+					   &acomp_ctx->wait);
 	}
 
-	dlen = acomp_ctx->req->dlen;
+	dlen = acomp_ctx->req[0]->dlen;
 	if (comp_ret)
 		goto unlock;
 
@@ -1006,31 +1028,39 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	 */
 	if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
 	    !virt_addr_valid(src)) {
-		memcpy(acomp_ctx->buffer, src, entry->length);
-		src = acomp_ctx->buffer;
+		memcpy(acomp_ctx->buffer[0], src, entry->length);
+		src = acomp_ctx->buffer[0];
 		zpool_unmap_handle(zpool, entry->handle);
 	}
 
 	sg_init_one(&input, src, entry->length);
 	sg_init_table(&output, 1);
 	sg_set_folio(&output, folio, PAGE_SIZE, 0);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
+	acomp_request_set_params(acomp_ctx->req[0], &input, &output,
+				 entry->length, PAGE_SIZE);
 	if (acomp_ctx->acomp->poll) {
-		ret = crypto_acomp_decompress(acomp_ctx->req);
+		ret = crypto_acomp_decompress(acomp_ctx->req[0]);
 		if (ret == -EINPROGRESS) {
 			do {
-				ret = crypto_acomp_poll(acomp_ctx->req);
+				ret = crypto_acomp_poll(acomp_ctx->req[0]);
 				BUG_ON(ret && ret != -EAGAIN);
 			} while (ret);
 		}
 	} else {
-		BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
+		BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req[0]),
+				       &acomp_ctx->wait));
 	}
-	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
-	mutex_unlock(&acomp_ctx->mutex);
+	BUG_ON(acomp_ctx->req[0]->dlen != PAGE_SIZE);
 
-	if (src != acomp_ctx->buffer)
+	if (src != acomp_ctx->buffer[0])
 		zpool_unmap_handle(zpool, entry->handle);
+
+	/*
+	 * It is safer to unlock the mutex after the check for
+	 * "src != acomp_ctx->buffer[0]" so that the value of "src"
+	 * does not change.
+	 */
+	mutex_unlock(&acomp_ctx->mutex);
 }
 
 /*********************************
-- 
2.27.0

Re: [RFC PATCH v1 10/13] mm: zswap: Create multiple reqs/buffers in crypto_acomp_ctx if platform has IAA.

Posted by Yosry Ahmed 1 year, 3 months ago

On Thu, Oct 17, 2024 at 11:41 PM Kanchana P Sridhar
<kanchana.p.sridhar@intel.com> wrote:
>
> Intel IAA hardware acceleration can be used effectively to improve the
> zswap_store() performance of large folios by batching multiple pages in a
> folio to be compressed in parallel by IAA. Hence, to build compress batching
> of zswap large folio stores using IAA, we need to be able to submit a batch
> of compress jobs from zswap to the hardware to compress in parallel if the
> iaa_crypto "async" mode is used.
>
> The IAA compress batching paradigm works as follows:
>
>  1) Submit N crypto_acomp_compress() jobs using N requests.
>  2) Use the iaa_crypto driver async poll() method to check for the jobs
>     to complete.
>  3) There are no ordering constraints implied by submission, hence we
>     could loop through the requests and process any job that has
>     completed.
>  4) This would repeat until all jobs have completed with success/error
>     status.
>
> To facilitate this, we need to provide for multiple acomp_reqs in
> "struct crypto_acomp_ctx", each representing a distinct compress
> job. Likewise, there needs to be a distinct destination buffer
> corresponding to each acomp_req.
>
> If CONFIG_ZSWAP_STORE_BATCHING_ENABLED is enabled, this patch will set the
> SWAP_CRYPTO_SUB_BATCH_SIZE constant to 8UL. This implies each per-cpu
> crypto_acomp_ctx associated with the zswap_pool can submit up to 8
> acomp_reqs at a time to accomplish parallel compressions.
>
> If IAA is not present and/or CONFIG_ZSWAP_STORE_BATCHING_ENABLED is not
> set, SWAP_CRYPTO_SUB_BATCH_SIZE will be set to 1UL.
>
> On an Intel Sapphire Rapids server, each socket has 4 IAA, each of which
> has 2 compress engines and 8 decompress engines. Experiments modeling a
> contended system with say 72 processes running under a cgroup with a fixed
> memory-limit, have shown that there is a significant performance
> improvement with dispatching compress jobs from all cores to all the
> IAA devices on the socket. Hence, SWAP_CRYPTO_SUB_BATCH_SIZE is set to
> 8 to maximize compression throughput if IAA is available.
>
> The definition of "struct crypto_acomp_ctx" is modified to make the
> req/buffer be arrays of size SWAP_CRYPTO_SUB_BATCH_SIZE. Thus, the
> added memory footprint cost of this per-cpu structure for batching is
> incurred only for platforms that have Intel IAA.
>
> Suggested-by: Ying Huang <ying.huang@intel.com>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>

Does this really need to be done in zswap? Why can't zswap submit a
single compression request with the supported number of pages, and
have the driver handle it as it sees fit?

> ---
>  mm/swap.h  |  11 ++++++
>  mm/zswap.c | 104 ++++++++++++++++++++++++++++++++++-------------------
>  2 files changed, 78 insertions(+), 37 deletions(-)
>
> diff --git a/mm/swap.h b/mm/swap.h
> index ad2f121de970..566616c971d4 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -8,6 +8,17 @@ struct mempolicy;
>  #include <linux/swapops.h> /* for swp_offset */
>  #include <linux/blk_types.h> /* for bio_end_io_t */
>
> +/*
> + * For IAA compression batching:
> + * Maximum number of IAA acomp compress requests that will be processed
> + * in a sub-batch.
> + */
> +#if defined(CONFIG_ZSWAP_STORE_BATCHING_ENABLED)
> +#define SWAP_CRYPTO_SUB_BATCH_SIZE 8UL
> +#else
> +#define SWAP_CRYPTO_SUB_BATCH_SIZE 1UL
> +#endif
> +
>  /* linux/mm/page_io.c */
>  int sio_pool_init(void);
>  struct swap_iocb;
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 4893302d8c34..579869d1bdf6 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -152,9 +152,9 @@ bool zswap_never_enabled(void)
>
>  struct crypto_acomp_ctx {
>         struct crypto_acomp *acomp;
> -       struct acomp_req *req;
> +       struct acomp_req *req[SWAP_CRYPTO_SUB_BATCH_SIZE];
> +       u8 *buffer[SWAP_CRYPTO_SUB_BATCH_SIZE];
>         struct crypto_wait wait;
> -       u8 *buffer;
>         struct mutex mutex;
>         bool is_sleepable;
>  };
> @@ -832,49 +832,64 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
>         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
>         struct crypto_acomp *acomp;
> -       struct acomp_req *req;
>         int ret;
> +       int i, j;
>
>         mutex_init(&acomp_ctx->mutex);
>
> -       acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
> -       if (!acomp_ctx->buffer)
> -               return -ENOMEM;
> -
>         acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
>         if (IS_ERR(acomp)) {
>                 pr_err("could not alloc crypto acomp %s : %ld\n",
>                                 pool->tfm_name, PTR_ERR(acomp));
> -               ret = PTR_ERR(acomp);
> -               goto acomp_fail;
> +               return PTR_ERR(acomp);
>         }
>         acomp_ctx->acomp = acomp;
>         acomp_ctx->is_sleepable = acomp_is_async(acomp);
>
> -       req = acomp_request_alloc(acomp_ctx->acomp);
> -       if (!req) {
> -               pr_err("could not alloc crypto acomp_request %s\n",
> -                      pool->tfm_name);
> -               ret = -ENOMEM;
> -               goto req_fail;
> +       for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i) {
> +               acomp_ctx->buffer[i] = kmalloc_node(PAGE_SIZE * 2,
> +                                               GFP_KERNEL, cpu_to_node(cpu));
> +               if (!acomp_ctx->buffer[i]) {
> +                       for (j = 0; j < i; ++j)
> +                               kfree(acomp_ctx->buffer[j]);
> +                       ret = -ENOMEM;
> +                       goto buf_fail;
> +               }
> +       }
> +
> +       for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i) {
> +               acomp_ctx->req[i] = acomp_request_alloc(acomp_ctx->acomp);
> +               if (!acomp_ctx->req[i]) {
> +                       pr_err("could not alloc crypto acomp_request req[%d] %s\n",
> +                              i, pool->tfm_name);
> +                       for (j = 0; j < i; ++j)
> +                               acomp_request_free(acomp_ctx->req[j]);
> +                       ret = -ENOMEM;
> +                       goto req_fail;
> +               }
>         }
> -       acomp_ctx->req = req;
>
> +       /*
> +        * The crypto_wait is used only in fully synchronous, i.e., with scomp
> +        * or non-poll mode of acomp, hence there is only one "wait" per
> +        * acomp_ctx, with callback set to req[0].
> +        */
>         crypto_init_wait(&acomp_ctx->wait);
>         /*
>          * if the backend of acomp is async zip, crypto_req_done() will wakeup
>          * crypto_wait_req(); if the backend of acomp is scomp, the callback
>          * won't be called, crypto_wait_req() will return without blocking.
>          */
> -       acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> +       acomp_request_set_callback(acomp_ctx->req[0], CRYPTO_TFM_REQ_MAY_BACKLOG,
>                                    crypto_req_done, &acomp_ctx->wait);
>
>         return 0;
>
>  req_fail:
> +       for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
> +               kfree(acomp_ctx->buffer[i]);
> +buf_fail:
>         crypto_free_acomp(acomp_ctx->acomp);
> -acomp_fail:
> -       kfree(acomp_ctx->buffer);
>         return ret;
>  }
>
> @@ -884,11 +899,17 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
>         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
>
>         if (!IS_ERR_OR_NULL(acomp_ctx)) {
> -               if (!IS_ERR_OR_NULL(acomp_ctx->req))
> -                       acomp_request_free(acomp_ctx->req);
> +               int i;
> +
> +               for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
> +                       if (!IS_ERR_OR_NULL(acomp_ctx->req[i]))
> +                               acomp_request_free(acomp_ctx->req[i]);
> +
> +               for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
> +                       kfree(acomp_ctx->buffer[i]);
> +
>                 if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
>                         crypto_free_acomp(acomp_ctx->acomp);
> -               kfree(acomp_ctx->buffer);
>         }
>
>         return 0;
> @@ -911,7 +932,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>
>         mutex_lock(&acomp_ctx->mutex);
>
> -       dst = acomp_ctx->buffer;
> +       dst = acomp_ctx->buffer[0];
>         sg_init_table(&input, 1);
>         sg_set_page(&input, page, PAGE_SIZE, 0);
>
> @@ -921,7 +942,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>          * giving the dst buffer with enough length to avoid buffer overflow.
>          */
>         sg_init_one(&output, dst, PAGE_SIZE * 2);
> -       acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
> +       acomp_request_set_params(acomp_ctx->req[0], &input, &output, PAGE_SIZE, dlen);
>
>         /*
>          * If the crypto_acomp provides an asynchronous poll() interface,
> @@ -940,19 +961,20 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>          * parallel.
>          */
>         if (acomp_ctx->acomp->poll) {
> -               comp_ret = crypto_acomp_compress(acomp_ctx->req);
> +               comp_ret = crypto_acomp_compress(acomp_ctx->req[0]);
>                 if (comp_ret == -EINPROGRESS) {
>                         do {
> -                               comp_ret = crypto_acomp_poll(acomp_ctx->req);
> +                               comp_ret = crypto_acomp_poll(acomp_ctx->req[0]);
>                                 if (comp_ret && comp_ret != -EAGAIN)
>                                         break;
>                         } while (comp_ret);
>                 }
>         } else {
> -               comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
> +               comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req[0]),
> +                                          &acomp_ctx->wait);
>         }
>
> -       dlen = acomp_ctx->req->dlen;
> +       dlen = acomp_ctx->req[0]->dlen;
>         if (comp_ret)
>                 goto unlock;
>
> @@ -1006,31 +1028,39 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>          */
>         if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
>             !virt_addr_valid(src)) {
> -               memcpy(acomp_ctx->buffer, src, entry->length);
> -               src = acomp_ctx->buffer;
> +               memcpy(acomp_ctx->buffer[0], src, entry->length);
> +               src = acomp_ctx->buffer[0];
>                 zpool_unmap_handle(zpool, entry->handle);
>         }
>
>         sg_init_one(&input, src, entry->length);
>         sg_init_table(&output, 1);
>         sg_set_folio(&output, folio, PAGE_SIZE, 0);
> -       acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
> +       acomp_request_set_params(acomp_ctx->req[0], &input, &output,
> +                                entry->length, PAGE_SIZE);
>         if (acomp_ctx->acomp->poll) {
> -               ret = crypto_acomp_decompress(acomp_ctx->req);
> +               ret = crypto_acomp_decompress(acomp_ctx->req[0]);
>                 if (ret == -EINPROGRESS) {
>                         do {
> -                               ret = crypto_acomp_poll(acomp_ctx->req);
> +                               ret = crypto_acomp_poll(acomp_ctx->req[0]);
>                                 BUG_ON(ret && ret != -EAGAIN);
>                         } while (ret);
>                 }
>         } else {
> -               BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
> +               BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req[0]),
> +                                      &acomp_ctx->wait));
>         }
> -       BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
> -       mutex_unlock(&acomp_ctx->mutex);
> +       BUG_ON(acomp_ctx->req[0]->dlen != PAGE_SIZE);
>
> -       if (src != acomp_ctx->buffer)
> +       if (src != acomp_ctx->buffer[0])
>                 zpool_unmap_handle(zpool, entry->handle);
> +
> +       /*
> +        * It is safer to unlock the mutex after the check for
> +        * "src != acomp_ctx->buffer[0]" so that the value of "src"
> +        * does not change.
> +        */
> +       mutex_unlock(&acomp_ctx->mutex);
>  }
>
>  /*********************************
> --
> 2.27.0
>

RE: [RFC PATCH v1 10/13] mm: zswap: Create multiple reqs/buffers in crypto_acomp_ctx if platform has IAA.

Posted by Sridhar, Kanchana P 1 year, 3 months ago

> -----Original Message-----
> From: Yosry Ahmed <yosryahmed@google.com>
> Sent: Tuesday, October 22, 2024 5:52 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; nphamcs@gmail.com; chengming.zhou@linux.dev;
> usamaarif642@gmail.com; ryan.roberts@arm.com; Huang, Ying
> <ying.huang@intel.com>; 21cnbao@gmail.com; akpm@linux-foundation.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; zanussi@kernel.org; viro@zeniv.linux.org.uk;
> brauner@kernel.org; jack@suse.cz; mcgrof@kernel.org; kees@kernel.org;
> joel.granados@kernel.org; bfoster@redhat.com; willy@infradead.org; linux-
> fsdevel@vger.kernel.org; Feghali, Wajdi K <wajdi.k.feghali@intel.com>; Gopal,
> Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [RFC PATCH v1 10/13] mm: zswap: Create multiple reqs/buffers
> in crypto_acomp_ctx if platform has IAA.
> 
> On Thu, Oct 17, 2024 at 11:41 PM Kanchana P Sridhar
> <kanchana.p.sridhar@intel.com> wrote:
> >
> > Intel IAA hardware acceleration can be used effectively to improve the
> > zswap_store() performance of large folios by batching multiple pages in a
> > folio to be compressed in parallel by IAA. Hence, to build compress batching
> > of zswap large folio stores using IAA, we need to be able to submit a batch
> > of compress jobs from zswap to the hardware to compress in parallel if the
> > iaa_crypto "async" mode is used.
> >
> > The IAA compress batching paradigm works as follows:
> >
> >  1) Submit N crypto_acomp_compress() jobs using N requests.
> >  2) Use the iaa_crypto driver async poll() method to check for the jobs
> >     to complete.
> >  3) There are no ordering constraints implied by submission, hence we
> >     could loop through the requests and process any job that has
> >     completed.
> >  4) This would repeat until all jobs have completed with success/error
> >     status.
> >
> > To facilitate this, we need to provide for multiple acomp_reqs in
> > "struct crypto_acomp_ctx", each representing a distinct compress
> > job. Likewise, there needs to be a distinct destination buffer
> > corresponding to each acomp_req.
> >
> > If CONFIG_ZSWAP_STORE_BATCHING_ENABLED is enabled, this patch will
> set the
> > SWAP_CRYPTO_SUB_BATCH_SIZE constant to 8UL. This implies each per-
> cpu
> > crypto_acomp_ctx associated with the zswap_pool can submit up to 8
> > acomp_reqs at a time to accomplish parallel compressions.
> >
> > If IAA is not present and/or CONFIG_ZSWAP_STORE_BATCHING_ENABLED
> is not
> > set, SWAP_CRYPTO_SUB_BATCH_SIZE will be set to 1UL.
> >
> > On an Intel Sapphire Rapids server, each socket has 4 IAA, each of which
> > has 2 compress engines and 8 decompress engines. Experiments modeling a
> > contended system with say 72 processes running under a cgroup with a
> fixed
> > memory-limit, have shown that there is a significant performance
> > improvement with dispatching compress jobs from all cores to all the
> > IAA devices on the socket. Hence, SWAP_CRYPTO_SUB_BATCH_SIZE is set
> to
> > 8 to maximize compression throughput if IAA is available.
> >
> > The definition of "struct crypto_acomp_ctx" is modified to make the
> > req/buffer be arrays of size SWAP_CRYPTO_SUB_BATCH_SIZE. Thus, the
> > added memory footprint cost of this per-cpu structure for batching is
> > incurred only for platforms that have Intel IAA.
> >
> > Suggested-by: Ying Huang <ying.huang@intel.com>
> > Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> 
> Does this really need to be done in zswap? Why can't zswap submit a
> single compression request with the supported number of pages, and
> have the driver handle it as it sees fit?

For sure, this approach would work well too.

Thanks,
Kanchana

> 
> > ---
> >  mm/swap.h  |  11 ++++++
> >  mm/zswap.c | 104 ++++++++++++++++++++++++++++++++++----------------
> ---
> >  2 files changed, 78 insertions(+), 37 deletions(-)
> >
> > diff --git a/mm/swap.h b/mm/swap.h
> > index ad2f121de970..566616c971d4 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -8,6 +8,17 @@ struct mempolicy;
> >  #include <linux/swapops.h> /* for swp_offset */
> >  #include <linux/blk_types.h> /* for bio_end_io_t */
> >
> > +/*
> > + * For IAA compression batching:
> > + * Maximum number of IAA acomp compress requests that will be
> processed
> > + * in a sub-batch.
> > + */
> > +#if defined(CONFIG_ZSWAP_STORE_BATCHING_ENABLED)
> > +#define SWAP_CRYPTO_SUB_BATCH_SIZE 8UL
> > +#else
> > +#define SWAP_CRYPTO_SUB_BATCH_SIZE 1UL
> > +#endif
> > +
> >  /* linux/mm/page_io.c */
> >  int sio_pool_init(void);
> >  struct swap_iocb;
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 4893302d8c34..579869d1bdf6 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -152,9 +152,9 @@ bool zswap_never_enabled(void)
> >
> >  struct crypto_acomp_ctx {
> >         struct crypto_acomp *acomp;
> > -       struct acomp_req *req;
> > +       struct acomp_req *req[SWAP_CRYPTO_SUB_BATCH_SIZE];
> > +       u8 *buffer[SWAP_CRYPTO_SUB_BATCH_SIZE];
> >         struct crypto_wait wait;
> > -       u8 *buffer;
> >         struct mutex mutex;
> >         bool is_sleepable;
> >  };
> > @@ -832,49 +832,64 @@ static int zswap_cpu_comp_prepare(unsigned int
> cpu, struct hlist_node *node)
> >         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> >         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx,
> cpu);
> >         struct crypto_acomp *acomp;
> > -       struct acomp_req *req;
> >         int ret;
> > +       int i, j;
> >
> >         mutex_init(&acomp_ctx->mutex);
> >
> > -       acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
> cpu_to_node(cpu));
> > -       if (!acomp_ctx->buffer)
> > -               return -ENOMEM;
> > -
> >         acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0,
> cpu_to_node(cpu));
> >         if (IS_ERR(acomp)) {
> >                 pr_err("could not alloc crypto acomp %s : %ld\n",
> >                                 pool->tfm_name, PTR_ERR(acomp));
> > -               ret = PTR_ERR(acomp);
> > -               goto acomp_fail;
> > +               return PTR_ERR(acomp);
> >         }
> >         acomp_ctx->acomp = acomp;
> >         acomp_ctx->is_sleepable = acomp_is_async(acomp);
> >
> > -       req = acomp_request_alloc(acomp_ctx->acomp);
> > -       if (!req) {
> > -               pr_err("could not alloc crypto acomp_request %s\n",
> > -                      pool->tfm_name);
> > -               ret = -ENOMEM;
> > -               goto req_fail;
> > +       for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i) {
> > +               acomp_ctx->buffer[i] = kmalloc_node(PAGE_SIZE * 2,
> > +                                               GFP_KERNEL, cpu_to_node(cpu));
> > +               if (!acomp_ctx->buffer[i]) {
> > +                       for (j = 0; j < i; ++j)
> > +                               kfree(acomp_ctx->buffer[j]);
> > +                       ret = -ENOMEM;
> > +                       goto buf_fail;
> > +               }
> > +       }
> > +
> > +       for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i) {
> > +               acomp_ctx->req[i] = acomp_request_alloc(acomp_ctx->acomp);
> > +               if (!acomp_ctx->req[i]) {
> > +                       pr_err("could not alloc crypto acomp_request req[%d] %s\n",
> > +                              i, pool->tfm_name);
> > +                       for (j = 0; j < i; ++j)
> > +                               acomp_request_free(acomp_ctx->req[j]);
> > +                       ret = -ENOMEM;
> > +                       goto req_fail;
> > +               }
> >         }
> > -       acomp_ctx->req = req;
> >
> > +       /*
> > +        * The crypto_wait is used only in fully synchronous, i.e., with scomp
> > +        * or non-poll mode of acomp, hence there is only one "wait" per
> > +        * acomp_ctx, with callback set to req[0].
> > +        */
> >         crypto_init_wait(&acomp_ctx->wait);
> >         /*
> >          * if the backend of acomp is async zip, crypto_req_done() will wakeup
> >          * crypto_wait_req(); if the backend of acomp is scomp, the callback
> >          * won't be called, crypto_wait_req() will return without blocking.
> >          */
> > -       acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> > +       acomp_request_set_callback(acomp_ctx->req[0],
> CRYPTO_TFM_REQ_MAY_BACKLOG,
> >                                    crypto_req_done, &acomp_ctx->wait);
> >
> >         return 0;
> >
> >  req_fail:
> > +       for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
> > +               kfree(acomp_ctx->buffer[i]);
> > +buf_fail:
> >         crypto_free_acomp(acomp_ctx->acomp);
> > -acomp_fail:
> > -       kfree(acomp_ctx->buffer);
> >         return ret;
> >  }
> >
> > @@ -884,11 +899,17 @@ static int zswap_cpu_comp_dead(unsigned int
> cpu, struct hlist_node *node)
> >         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx,
> cpu);
> >
> >         if (!IS_ERR_OR_NULL(acomp_ctx)) {
> > -               if (!IS_ERR_OR_NULL(acomp_ctx->req))
> > -                       acomp_request_free(acomp_ctx->req);
> > +               int i;
> > +
> > +               for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
> > +                       if (!IS_ERR_OR_NULL(acomp_ctx->req[i]))
> > +                               acomp_request_free(acomp_ctx->req[i]);
> > +
> > +               for (i = 0; i < SWAP_CRYPTO_SUB_BATCH_SIZE; ++i)
> > +                       kfree(acomp_ctx->buffer[i]);
> > +
> >                 if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> >                         crypto_free_acomp(acomp_ctx->acomp);
> > -               kfree(acomp_ctx->buffer);
> >         }
> >
> >         return 0;
> > @@ -911,7 +932,7 @@ static bool zswap_compress(struct page *page,
> struct zswap_entry *entry,
> >
> >         mutex_lock(&acomp_ctx->mutex);
> >
> > -       dst = acomp_ctx->buffer;
> > +       dst = acomp_ctx->buffer[0];
> >         sg_init_table(&input, 1);
> >         sg_set_page(&input, page, PAGE_SIZE, 0);
> >
> > @@ -921,7 +942,7 @@ static bool zswap_compress(struct page *page,
> struct zswap_entry *entry,
> >          * giving the dst buffer with enough length to avoid buffer overflow.
> >          */
> >         sg_init_one(&output, dst, PAGE_SIZE * 2);
> > -       acomp_request_set_params(acomp_ctx->req, &input, &output,
> PAGE_SIZE, dlen);
> > +       acomp_request_set_params(acomp_ctx->req[0], &input, &output,
> PAGE_SIZE, dlen);
> >
> >         /*
> >          * If the crypto_acomp provides an asynchronous poll() interface,
> > @@ -940,19 +961,20 @@ static bool zswap_compress(struct page *page,
> struct zswap_entry *entry,
> >          * parallel.
> >          */
> >         if (acomp_ctx->acomp->poll) {
> > -               comp_ret = crypto_acomp_compress(acomp_ctx->req);
> > +               comp_ret = crypto_acomp_compress(acomp_ctx->req[0]);
> >                 if (comp_ret == -EINPROGRESS) {
> >                         do {
> > -                               comp_ret = crypto_acomp_poll(acomp_ctx->req);
> > +                               comp_ret = crypto_acomp_poll(acomp_ctx->req[0]);
> >                                 if (comp_ret && comp_ret != -EAGAIN)
> >                                         break;
> >                         } while (comp_ret);
> >                 }
> >         } else {
> > -               comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx-
> >req), &acomp_ctx->wait);
> > +               comp_ret =
> crypto_wait_req(crypto_acomp_compress(acomp_ctx->req[0]),
> > +                                          &acomp_ctx->wait);
> >         }
> >
> > -       dlen = acomp_ctx->req->dlen;
> > +       dlen = acomp_ctx->req[0]->dlen;
> >         if (comp_ret)
> >                 goto unlock;
> >
> > @@ -1006,31 +1028,39 @@ static void zswap_decompress(struct
> zswap_entry *entry, struct folio *folio)
> >          */
> >         if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
> >             !virt_addr_valid(src)) {
> > -               memcpy(acomp_ctx->buffer, src, entry->length);
> > -               src = acomp_ctx->buffer;
> > +               memcpy(acomp_ctx->buffer[0], src, entry->length);
> > +               src = acomp_ctx->buffer[0];
> >                 zpool_unmap_handle(zpool, entry->handle);
> >         }
> >
> >         sg_init_one(&input, src, entry->length);
> >         sg_init_table(&output, 1);
> >         sg_set_folio(&output, folio, PAGE_SIZE, 0);
> > -       acomp_request_set_params(acomp_ctx->req, &input, &output, entry-
> >length, PAGE_SIZE);
> > +       acomp_request_set_params(acomp_ctx->req[0], &input, &output,
> > +                                entry->length, PAGE_SIZE);
> >         if (acomp_ctx->acomp->poll) {
> > -               ret = crypto_acomp_decompress(acomp_ctx->req);
> > +               ret = crypto_acomp_decompress(acomp_ctx->req[0]);
> >                 if (ret == -EINPROGRESS) {
> >                         do {
> > -                               ret = crypto_acomp_poll(acomp_ctx->req);
> > +                               ret = crypto_acomp_poll(acomp_ctx->req[0]);
> >                                 BUG_ON(ret && ret != -EAGAIN);
> >                         } while (ret);
> >                 }
> >         } else {
> > -
> BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req),
> &acomp_ctx->wait));
> > +
> BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req[0]),
> > +                                      &acomp_ctx->wait));
> >         }
> > -       BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
> > -       mutex_unlock(&acomp_ctx->mutex);
> > +       BUG_ON(acomp_ctx->req[0]->dlen != PAGE_SIZE);
> >
> > -       if (src != acomp_ctx->buffer)
> > +       if (src != acomp_ctx->buffer[0])
> >                 zpool_unmap_handle(zpool, entry->handle);
> > +
> > +       /*
> > +        * It is safer to unlock the mutex after the check for
> > +        * "src != acomp_ctx->buffer[0]" so that the value of "src"
> > +        * does not change.
> > +        */
> > +       mutex_unlock(&acomp_ctx->mutex);
> >  }
> >
> >  /*********************************
> > --
> > 2.27.0
> >