[PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx to be configurable in nr of acomp_reqs.

Posted by Kanchana P Sridhar 3 weeks ago
Modified the definition of "struct crypto_acomp_ctx" to represent a
configurable number of acomp_reqs and the required number of buffers.

Accordingly, refactored the code that allocates/deallocates the acomp_ctx
resources, so that it can be called to create a regular acomp_ctx with
exactly one acomp_req/buffer, for use in the existing non-batching
zswap_store(), as well as to create a separate "batching acomp_ctx" with
multiple acomp_reqs/buffers for IAA compress batching.

Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
 mm/zswap.c | 149 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 107 insertions(+), 42 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 3e899fa61445..02e031122fdf 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -143,9 +143,10 @@ bool zswap_never_enabled(void)
 
 struct crypto_acomp_ctx {
 	struct crypto_acomp *acomp;
-	struct acomp_req *req;
+	struct acomp_req **reqs;
+	u8 **buffers;
+	unsigned int nr_reqs;
 	struct crypto_wait wait;
-	u8 *buffer;
 	struct mutex mutex;
 	bool is_sleepable;
 };
@@ -241,6 +242,11 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
 		 zpool_get_type((p)->zpool))
 
+static int zswap_create_acomp_ctx(unsigned int cpu,
+				  struct crypto_acomp_ctx *acomp_ctx,
+				  char *tfm_name,
+				  unsigned int nr_reqs);
+
 /*********************************
 * pool functions
 **********************************/
@@ -813,69 +819,128 @@ static void zswap_entry_free(struct zswap_entry *entry)
 /*********************************
 * compressed storage functions
 **********************************/
-static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
+static int zswap_create_acomp_ctx(unsigned int cpu,
+				  struct crypto_acomp_ctx *acomp_ctx,
+				  char *tfm_name,
+				  unsigned int nr_reqs)
 {
-	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
-	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
 	struct crypto_acomp *acomp;
-	struct acomp_req *req;
-	int ret;
+	int ret = -ENOMEM;
+	int i, j;
 
+	acomp_ctx->nr_reqs = 0;
 	mutex_init(&acomp_ctx->mutex);
 
-	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
-	if (!acomp_ctx->buffer)
-		return -ENOMEM;
-
-	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+	acomp = crypto_alloc_acomp_node(tfm_name, 0, 0, cpu_to_node(cpu));
 	if (IS_ERR(acomp)) {
 		pr_err("could not alloc crypto acomp %s : %ld\n",
-				pool->tfm_name, PTR_ERR(acomp));
-		ret = PTR_ERR(acomp);
-		goto acomp_fail;
+				tfm_name, PTR_ERR(acomp));
+		return PTR_ERR(acomp);
 	}
+
 	acomp_ctx->acomp = acomp;
 	acomp_ctx->is_sleepable = acomp_is_async(acomp);
 
-	req = acomp_request_alloc(acomp_ctx->acomp);
-	if (!req) {
-		pr_err("could not alloc crypto acomp_request %s\n",
-		       pool->tfm_name);
-		ret = -ENOMEM;
+	acomp_ctx->buffers = kmalloc_node(nr_reqs * sizeof(u8 *),
+					  GFP_KERNEL, cpu_to_node(cpu));
+	if (!acomp_ctx->buffers)
+		goto buf_fail;
+
+	for (i = 0; i < nr_reqs; ++i) {
+		acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2,
+						     GFP_KERNEL, cpu_to_node(cpu));
+		if (!acomp_ctx->buffers[i]) {
+			for (j = 0; j < i; ++j)
+				kfree(acomp_ctx->buffers[j]);
+			kfree(acomp_ctx->buffers);
+			ret = -ENOMEM;
+			goto buf_fail;
+		}
+	}
+
+	acomp_ctx->reqs = kmalloc_node(nr_reqs * sizeof(struct acomp_req *),
+				       GFP_KERNEL, cpu_to_node(cpu));
+	if (!acomp_ctx->reqs)
 		goto req_fail;
+
+	for (i = 0; i < nr_reqs; ++i) {
+		acomp_ctx->reqs[i] = acomp_request_alloc(acomp_ctx->acomp);
+		if (!acomp_ctx->reqs[i]) {
+			pr_err("could not alloc crypto acomp_request reqs[%d] %s\n",
+			       i, tfm_name);
+			for (j = 0; j < i; ++j)
+				acomp_request_free(acomp_ctx->reqs[j]);
+			kfree(acomp_ctx->reqs);
+			ret = -ENOMEM;
+			goto req_fail;
+		}
 	}
-	acomp_ctx->req = req;
 
+	/*
+	 * The crypto_wait is used only in fully synchronous, i.e., with scomp
+	 * or non-poll mode of acomp, hence there is only one "wait" per
+	 * acomp_ctx, with callback set to reqs[0], under the assumption that
+	 * there is at least 1 request per acomp_ctx.
+	 */
 	crypto_init_wait(&acomp_ctx->wait);
 	/*
 	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
 	 * won't be called, crypto_wait_req() will return without blocking.
 	 */
-	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+	acomp_request_set_callback(acomp_ctx->reqs[0], CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   crypto_req_done, &acomp_ctx->wait);
 
+	acomp_ctx->nr_reqs = nr_reqs;
 	return 0;
 
 req_fail:
+	for (i = 0; i < nr_reqs; ++i)
+		kfree(acomp_ctx->buffers[i]);
+	kfree(acomp_ctx->buffers);
+buf_fail:
 	crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
-	kfree(acomp_ctx->buffer);
 	return ret;
 }
 
-static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
+static void zswap_delete_acomp_ctx(struct crypto_acomp_ctx *acomp_ctx)
 {
-	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
-	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
-
 	if (!IS_ERR_OR_NULL(acomp_ctx)) {
-		if (!IS_ERR_OR_NULL(acomp_ctx->req))
-			acomp_request_free(acomp_ctx->req);
+		int i;
+
+		for (i = 0; i < acomp_ctx->nr_reqs; ++i)
+			if (!IS_ERR_OR_NULL(acomp_ctx->reqs[i]))
+				acomp_request_free(acomp_ctx->reqs[i]);
+		kfree(acomp_ctx->reqs);
+
+		for (i = 0; i < acomp_ctx->nr_reqs; ++i)
+			kfree(acomp_ctx->buffers[i]);
+		kfree(acomp_ctx->buffers);
+
 		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
 			crypto_free_acomp(acomp_ctx->acomp);
-		kfree(acomp_ctx->buffer);
+
+		acomp_ctx->nr_reqs = 0;
+		acomp_ctx = NULL;
 	}
+}
+
+static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
+{
+	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+	struct crypto_acomp_ctx *acomp_ctx;
+
+	acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+	return zswap_create_acomp_ctx(cpu, acomp_ctx, pool->tfm_name, 1);
+}
+
+static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
+{
+	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+	struct crypto_acomp_ctx *acomp_ctx;
+
+	acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+	zswap_delete_acomp_ctx(acomp_ctx);
 
 	return 0;
 }
@@ -898,7 +963,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 
 	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
 
-	dst = acomp_ctx->buffer;
+	dst = acomp_ctx->buffers[0];
 	sg_init_table(&input, 1);
 	sg_set_page(&input, page, PAGE_SIZE, 0);
 
@@ -908,7 +973,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	 * giving the dst buffer with enough length to avoid buffer overflow.
 	 */
 	sg_init_one(&output, dst, PAGE_SIZE * 2);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+	acomp_request_set_params(acomp_ctx->reqs[0], &input, &output, PAGE_SIZE, dlen);
 
 	/*
 	 * it maybe looks a little bit silly that we send an asynchronous request,
@@ -922,8 +987,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	 * but in different threads running on different cpu, we have different
 	 * acomp instance, so multiple threads can do (de)compression in parallel.
 	 */
-	comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
-	dlen = acomp_ctx->req->dlen;
+	comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->reqs[0]), &acomp_ctx->wait);
+	dlen = acomp_ctx->reqs[0]->dlen;
 	if (comp_ret)
 		goto unlock;
 
@@ -975,24 +1040,24 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	 */
 	if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
 	    !virt_addr_valid(src)) {
-		memcpy(acomp_ctx->buffer, src, entry->length);
-		src = acomp_ctx->buffer;
+		memcpy(acomp_ctx->buffers[0], src, entry->length);
+		src = acomp_ctx->buffers[0];
 		zpool_unmap_handle(zpool, entry->handle);
 	}
 
 	sg_init_one(&input, src, entry->length);
 	sg_init_table(&output, 1);
 	sg_set_folio(&output, folio, PAGE_SIZE, 0);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
-	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
-	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
+	acomp_request_set_params(acomp_ctx->reqs[0], &input, &output, entry->length, PAGE_SIZE);
+	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->reqs[0]), &acomp_ctx->wait));
+	BUG_ON(acomp_ctx->reqs[0]->dlen != PAGE_SIZE);
 
-	if (src != acomp_ctx->buffer)
+	if (src != acomp_ctx->buffers[0])
 		zpool_unmap_handle(zpool, entry->handle);
 
 	/*
 	 * It is safer to unlock the mutex after the check for
-	 * "src != acomp_ctx->buffer" so that the value of "src"
+	 * "src != acomp_ctx->buffers[0]" so that the value of "src"
 	 * does not change.
 	 */
 	mutex_unlock(&acomp_ctx->mutex);
-- 
2.27.0
Re: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx to be configurable in nr of acomp_reqs.
Posted by Johannes Weiner 2 weeks, 6 days ago
On Wed, Nov 06, 2024 at 11:21:01AM -0800, Kanchana P Sridhar wrote:
> Modified the definition of "struct crypto_acomp_ctx" to represent a
> configurable number of acomp_reqs and the required number of buffers.
> 
> Accordingly, refactored the code that allocates/deallocates the acomp_ctx
> resources, so that it can be called to create a regular acomp_ctx with
> exactly one acomp_req/buffer, for use in the the existing non-batching
> zswap_store(), as well as to create a separate "batching acomp_ctx" with
> multiple acomp_reqs/buffers for IAA compress batching.
> 
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> ---
>  mm/zswap.c | 149 ++++++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 107 insertions(+), 42 deletions(-)
> 
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 3e899fa61445..02e031122fdf 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -143,9 +143,10 @@ bool zswap_never_enabled(void)
>  
>  struct crypto_acomp_ctx {
>  	struct crypto_acomp *acomp;
> -	struct acomp_req *req;
> +	struct acomp_req **reqs;
> +	u8 **buffers;
> +	unsigned int nr_reqs;
>  	struct crypto_wait wait;
> -	u8 *buffer;
>  	struct mutex mutex;
>  	bool is_sleepable;
>  };
> @@ -241,6 +242,11 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
>  	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
>  		 zpool_get_type((p)->zpool))
>  
> +static int zswap_create_acomp_ctx(unsigned int cpu,
> +				  struct crypto_acomp_ctx *acomp_ctx,
> +				  char *tfm_name,
> +				  unsigned int nr_reqs);

This looks unnecessary.

> +
>  /*********************************
>  * pool functions
>  **********************************/
> @@ -813,69 +819,128 @@ static void zswap_entry_free(struct zswap_entry *entry)
>  /*********************************
>  * compressed storage functions
>  **********************************/
> -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> +static int zswap_create_acomp_ctx(unsigned int cpu,
> +				  struct crypto_acomp_ctx *acomp_ctx,
> +				  char *tfm_name,
> +				  unsigned int nr_reqs)
>  {
> -	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> -	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
>  	struct crypto_acomp *acomp;
> -	struct acomp_req *req;
> -	int ret;
> +	int ret = -ENOMEM;
> +	int i, j;
>  
> +	acomp_ctx->nr_reqs = 0;
>  	mutex_init(&acomp_ctx->mutex);
>  
> -	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
> -	if (!acomp_ctx->buffer)
> -		return -ENOMEM;
> -
> -	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> +	acomp = crypto_alloc_acomp_node(tfm_name, 0, 0, cpu_to_node(cpu));
>  	if (IS_ERR(acomp)) {
>  		pr_err("could not alloc crypto acomp %s : %ld\n",
> -				pool->tfm_name, PTR_ERR(acomp));
> -		ret = PTR_ERR(acomp);
> -		goto acomp_fail;
> +				tfm_name, PTR_ERR(acomp));
> +		return PTR_ERR(acomp);
>  	}
> +
>  	acomp_ctx->acomp = acomp;
>  	acomp_ctx->is_sleepable = acomp_is_async(acomp);
>  
> -	req = acomp_request_alloc(acomp_ctx->acomp);
> -	if (!req) {
> -		pr_err("could not alloc crypto acomp_request %s\n",
> -		       pool->tfm_name);
> -		ret = -ENOMEM;
> +	acomp_ctx->buffers = kmalloc_node(nr_reqs * sizeof(u8 *),
> +					  GFP_KERNEL, cpu_to_node(cpu));
> +	if (!acomp_ctx->buffers)
> +		goto buf_fail;
> +
> +	for (i = 0; i < nr_reqs; ++i) {
> +		acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2,
> +						     GFP_KERNEL, cpu_to_node(cpu));
> +		if (!acomp_ctx->buffers[i]) {
> +			for (j = 0; j < i; ++j)
> +				kfree(acomp_ctx->buffers[j]);
> +			kfree(acomp_ctx->buffers);
> +			ret = -ENOMEM;
> +			goto buf_fail;
> +		}
> +	}
> +
> +	acomp_ctx->reqs = kmalloc_node(nr_reqs * sizeof(struct acomp_req *),
> +				       GFP_KERNEL, cpu_to_node(cpu));
> +	if (!acomp_ctx->reqs)
>  		goto req_fail;
> +
> +	for (i = 0; i < nr_reqs; ++i) {
> +		acomp_ctx->reqs[i] = acomp_request_alloc(acomp_ctx->acomp);
> +		if (!acomp_ctx->reqs[i]) {
> +			pr_err("could not alloc crypto acomp_request reqs[%d] %s\n",
> +			       i, tfm_name);
> +			for (j = 0; j < i; ++j)
> +				acomp_request_free(acomp_ctx->reqs[j]);
> +			kfree(acomp_ctx->reqs);
> +			ret = -ENOMEM;
> +			goto req_fail;
> +		}
>  	}
> -	acomp_ctx->req = req;
>  
> +	/*
> +	 * The crypto_wait is used only in fully synchronous, i.e., with scomp
> +	 * or non-poll mode of acomp, hence there is only one "wait" per
> +	 * acomp_ctx, with callback set to reqs[0], under the assumption that
> +	 * there is at least 1 request per acomp_ctx.
> +	 */
>  	crypto_init_wait(&acomp_ctx->wait);
>  	/*
>  	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
>  	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
>  	 * won't be called, crypto_wait_req() will return without blocking.
>  	 */
> -	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> +	acomp_request_set_callback(acomp_ctx->reqs[0], CRYPTO_TFM_REQ_MAY_BACKLOG,
>  				   crypto_req_done, &acomp_ctx->wait);
>  
> +	acomp_ctx->nr_reqs = nr_reqs;
>  	return 0;
>  
>  req_fail:
> +	for (i = 0; i < nr_reqs; ++i)
> +		kfree(acomp_ctx->buffers[i]);
> +	kfree(acomp_ctx->buffers);
> +buf_fail:
>  	crypto_free_acomp(acomp_ctx->acomp);
> -acomp_fail:
> -	kfree(acomp_ctx->buffer);
>  	return ret;
>  }
>  
> -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
> +static void zswap_delete_acomp_ctx(struct crypto_acomp_ctx *acomp_ctx)
>  {
> -	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> -	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> -
>  	if (!IS_ERR_OR_NULL(acomp_ctx)) {
> -		if (!IS_ERR_OR_NULL(acomp_ctx->req))
> -			acomp_request_free(acomp_ctx->req);
> +		int i;
> +
> +		for (i = 0; i < acomp_ctx->nr_reqs; ++i)
> +			if (!IS_ERR_OR_NULL(acomp_ctx->reqs[i]))
> +				acomp_request_free(acomp_ctx->reqs[i]);
> +		kfree(acomp_ctx->reqs);
> +
> +		for (i = 0; i < acomp_ctx->nr_reqs; ++i)
> +			kfree(acomp_ctx->buffers[i]);
> +		kfree(acomp_ctx->buffers);
> +
>  		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
>  			crypto_free_acomp(acomp_ctx->acomp);
> -		kfree(acomp_ctx->buffer);
> +
> +		acomp_ctx->nr_reqs = 0;
> +		acomp_ctx = NULL;
>  	}
> +}
> +
> +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> +	struct crypto_acomp_ctx *acomp_ctx;
> +
> +	acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> +	return zswap_create_acomp_ctx(cpu, acomp_ctx, pool->tfm_name, 1);
> +}
> +
> +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> +	struct crypto_acomp_ctx *acomp_ctx;
> +
> +	acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> +	zswap_delete_acomp_ctx(acomp_ctx);
>  
>  	return 0;
>  }

There are no other callers to these functions. Just do the work
directly in the cpu callbacks here like it used to be.

Otherwise it looks good to me.
RE: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx to be configurable in nr of acomp_reqs.
Posted by Sridhar, Kanchana P 2 weeks, 6 days ago
Hi Johannes,

> -----Original Message-----
> From: Johannes Weiner <hannes@cmpxchg.org>
> Sent: Thursday, November 7, 2024 9:21 AM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> yosryahmed@google.com; nphamcs@gmail.com;
> chengming.zhou@linux.dev; usamaarif642@gmail.com;
> ryan.roberts@arm.com; Huang, Ying <ying.huang@intel.com>;
> 21cnbao@gmail.com; akpm@linux-foundation.org; linux-
> crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; zanussi@kernel.org; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx
> to be configurable in nr of acomp_reqs.
> 
> On Wed, Nov 06, 2024 at 11:21:01AM -0800, Kanchana P Sridhar wrote:
> > Modified the definition of "struct crypto_acomp_ctx" to represent a
> > configurable number of acomp_reqs and the required number of buffers.
> >
> > Accordingly, refactored the code that allocates/deallocates the acomp_ctx
> > resources, so that it can be called to create a regular acomp_ctx with
> > exactly one acomp_req/buffer, for use in the the existing non-batching
> > zswap_store(), as well as to create a separate "batching acomp_ctx" with
> > multiple acomp_reqs/buffers for IAA compress batching.
> >
> > Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> > ---
> >  mm/zswap.c | 149 ++++++++++++++++++++++++++++++++++++++----------
> -----
> >  1 file changed, 107 insertions(+), 42 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 3e899fa61445..02e031122fdf 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -143,9 +143,10 @@ bool zswap_never_enabled(void)
> >
> >  struct crypto_acomp_ctx {
> >  	struct crypto_acomp *acomp;
> > -	struct acomp_req *req;
> > +	struct acomp_req **reqs;
> > +	u8 **buffers;
> > +	unsigned int nr_reqs;
> >  	struct crypto_wait wait;
> > -	u8 *buffer;
> >  	struct mutex mutex;
> >  	bool is_sleepable;
> >  };
> > @@ -241,6 +242,11 @@ static inline struct xarray
> *swap_zswap_tree(swp_entry_t swp)
> >  	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
> >  		 zpool_get_type((p)->zpool))
> >
> > +static int zswap_create_acomp_ctx(unsigned int cpu,
> > +				  struct crypto_acomp_ctx *acomp_ctx,
> > +				  char *tfm_name,
> > +				  unsigned int nr_reqs);
> 
> This looks unnecessary.

Thanks for the code review comments. I will make sure to avoid the
forward declarations.

> 
> > +
> >  /*********************************
> >  * pool functions
> >  **********************************/
> > @@ -813,69 +819,128 @@ static void zswap_entry_free(struct
> zswap_entry *entry)
> >  /*********************************
> >  * compressed storage functions
> >  **********************************/
> > -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node
> *node)
> > +static int zswap_create_acomp_ctx(unsigned int cpu,
> > +				  struct crypto_acomp_ctx *acomp_ctx,
> > +				  char *tfm_name,
> > +				  unsigned int nr_reqs)
> >  {
> > -	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> node);
> > -	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool-
> >acomp_ctx, cpu);
> >  	struct crypto_acomp *acomp;
> > -	struct acomp_req *req;
> > -	int ret;
> > +	int ret = -ENOMEM;
> > +	int i, j;
> >
> > +	acomp_ctx->nr_reqs = 0;
> >  	mutex_init(&acomp_ctx->mutex);
> >
> > -	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
> cpu_to_node(cpu));
> > -	if (!acomp_ctx->buffer)
> > -		return -ENOMEM;
> > -
> > -	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0,
> cpu_to_node(cpu));
> > +	acomp = crypto_alloc_acomp_node(tfm_name, 0, 0,
> cpu_to_node(cpu));
> >  	if (IS_ERR(acomp)) {
> >  		pr_err("could not alloc crypto acomp %s : %ld\n",
> > -				pool->tfm_name, PTR_ERR(acomp));
> > -		ret = PTR_ERR(acomp);
> > -		goto acomp_fail;
> > +				tfm_name, PTR_ERR(acomp));
> > +		return PTR_ERR(acomp);
> >  	}
> > +
> >  	acomp_ctx->acomp = acomp;
> >  	acomp_ctx->is_sleepable = acomp_is_async(acomp);
> >
> > -	req = acomp_request_alloc(acomp_ctx->acomp);
> > -	if (!req) {
> > -		pr_err("could not alloc crypto acomp_request %s\n",
> > -		       pool->tfm_name);
> > -		ret = -ENOMEM;
> > +	acomp_ctx->buffers = kmalloc_node(nr_reqs * sizeof(u8 *),
> > +					  GFP_KERNEL, cpu_to_node(cpu));
> > +	if (!acomp_ctx->buffers)
> > +		goto buf_fail;
> > +
> > +	for (i = 0; i < nr_reqs; ++i) {
> > +		acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2,
> > +						     GFP_KERNEL,
> cpu_to_node(cpu));
> > +		if (!acomp_ctx->buffers[i]) {
> > +			for (j = 0; j < i; ++j)
> > +				kfree(acomp_ctx->buffers[j]);
> > +			kfree(acomp_ctx->buffers);
> > +			ret = -ENOMEM;
> > +			goto buf_fail;
> > +		}
> > +	}
> > +
> > +	acomp_ctx->reqs = kmalloc_node(nr_reqs * sizeof(struct acomp_req
> *),
> > +				       GFP_KERNEL, cpu_to_node(cpu));
> > +	if (!acomp_ctx->reqs)
> >  		goto req_fail;
> > +
> > +	for (i = 0; i < nr_reqs; ++i) {
> > +		acomp_ctx->reqs[i] = acomp_request_alloc(acomp_ctx-
> >acomp);
> > +		if (!acomp_ctx->reqs[i]) {
> > +			pr_err("could not alloc crypto acomp_request
> reqs[%d] %s\n",
> > +			       i, tfm_name);
> > +			for (j = 0; j < i; ++j)
> > +				acomp_request_free(acomp_ctx->reqs[j]);
> > +			kfree(acomp_ctx->reqs);
> > +			ret = -ENOMEM;
> > +			goto req_fail;
> > +		}
> >  	}
> > -	acomp_ctx->req = req;
> >
> > +	/*
> > +	 * The crypto_wait is used only in fully synchronous, i.e., with scomp
> > +	 * or non-poll mode of acomp, hence there is only one "wait" per
> > +	 * acomp_ctx, with callback set to reqs[0], under the assumption that
> > +	 * there is at least 1 request per acomp_ctx.
> > +	 */
> >  	crypto_init_wait(&acomp_ctx->wait);
> >  	/*
> >  	 * if the backend of acomp is async zip, crypto_req_done() will
> wakeup
> >  	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
> >  	 * won't be called, crypto_wait_req() will return without blocking.
> >  	 */
> > -	acomp_request_set_callback(req,
> CRYPTO_TFM_REQ_MAY_BACKLOG,
> > +	acomp_request_set_callback(acomp_ctx->reqs[0],
> CRYPTO_TFM_REQ_MAY_BACKLOG,
> >  				   crypto_req_done, &acomp_ctx->wait);
> >
> > +	acomp_ctx->nr_reqs = nr_reqs;
> >  	return 0;
> >
> >  req_fail:
> > +	for (i = 0; i < nr_reqs; ++i)
> > +		kfree(acomp_ctx->buffers[i]);
> > +	kfree(acomp_ctx->buffers);
> > +buf_fail:
> >  	crypto_free_acomp(acomp_ctx->acomp);
> > -acomp_fail:
> > -	kfree(acomp_ctx->buffer);
> >  	return ret;
> >  }
> >
> > -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node
> *node)
> > +static void zswap_delete_acomp_ctx(struct crypto_acomp_ctx
> *acomp_ctx)
> >  {
> > -	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> node);
> > -	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool-
> >acomp_ctx, cpu);
> > -
> >  	if (!IS_ERR_OR_NULL(acomp_ctx)) {
> > -		if (!IS_ERR_OR_NULL(acomp_ctx->req))
> > -			acomp_request_free(acomp_ctx->req);
> > +		int i;
> > +
> > +		for (i = 0; i < acomp_ctx->nr_reqs; ++i)
> > +			if (!IS_ERR_OR_NULL(acomp_ctx->reqs[i]))
> > +				acomp_request_free(acomp_ctx->reqs[i]);
> > +		kfree(acomp_ctx->reqs);
> > +
> > +		for (i = 0; i < acomp_ctx->nr_reqs; ++i)
> > +			kfree(acomp_ctx->buffers[i]);
> > +		kfree(acomp_ctx->buffers);
> > +
> >  		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> >  			crypto_free_acomp(acomp_ctx->acomp);
> > -		kfree(acomp_ctx->buffer);
> > +
> > +		acomp_ctx->nr_reqs = 0;
> > +		acomp_ctx = NULL;
> >  	}
> > +}
> > +
> > +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node
> *node)
> > +{
> > +	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> node);
> > +	struct crypto_acomp_ctx *acomp_ctx;
> > +
> > +	acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> > +	return zswap_create_acomp_ctx(cpu, acomp_ctx, pool->tfm_name,
> 1);
> > +}
> > +
> > +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node
> *node)
> > +{
> > +	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> node);
> > +	struct crypto_acomp_ctx *acomp_ctx;
> > +
> > +	acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> > +	zswap_delete_acomp_ctx(acomp_ctx);
> >
> >  	return 0;
> >  }
> 
> There are no other callers to these functions. Just do the work
> directly in the cpu callbacks here like it used to be.

There will be other callers to zswap_create_acomp_ctx() and
zswap_delete_acomp_ctx() in patches 10 and 11 of this series, when the
per-cpu "acomp_batch_ctx" is introduced in struct zswap_pool. I was trying
to modularize the code first, so as to split the changes into smaller commits.

The per-cpu "acomp_batch_ctx" resources are allocated in patch 11 in the
"zswap_pool_can_batch()" function, that allocates batching resources
for this cpu. This was to address Yosry's earlier comment about minimizing
the memory footprint cost of batching.

The way I decided to do this is by reusing the code that allocates the de-facto
pool->acomp_ctx for the selected compressor for all cpu's in zswap_pool_create().
However, I did not want to add the allocation of multiple acomp_batch_ctx
reqs/buffers to the cpuhp_state_add_instance() code path, which would incur
the memory cost on all cpu's.

Instead, the approach I chose is to allocate the batching resources in patch 11
only as needed, on "a given cpu" that actually has to store a large folio; a rough
sketch of what that could look like is below. Hope this explains the purpose of
the modularization better.
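
Something along these lines (outline only, not the actual patch 11 code;
ZSWAP_MAX_BATCH_SIZE stands in for however many reqs/buffers IAA compress
batching needs, and pool->acomp_batch_ctx for the per-cpu batching context
introduced in patch 10):

static bool zswap_pool_can_batch(struct zswap_pool *pool)
{
	struct crypto_acomp_ctx *acomp_batch_ctx =
		raw_cpu_ptr(pool->acomp_batch_ctx);

	/* Batching resources were already allocated on this cpu. */
	if (acomp_batch_ctx->nr_reqs == ZSWAP_MAX_BATCH_SIZE)
		return true;

	/* First large folio on this cpu: allocate the batching resources now. */
	if (!zswap_create_acomp_ctx(raw_smp_processor_id(), acomp_batch_ctx,
				    pool->tfm_name, ZSWAP_MAX_BATCH_SIZE))
		return true;

	return false;
}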

Other ideas towards accomplishing this are very welcome.

Thanks,
Kanchana

> 
> Otherwise it looks good to me.
Re: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx to be configurable in nr of acomp_reqs.
Posted by Yosry Ahmed 2 weeks, 5 days ago
On Thu, Nov 7, 2024 at 2:21 PM Sridhar, Kanchana P
<kanchana.p.sridhar@intel.com> wrote:
>
> Hi Johannes,
>
> > -----Original Message-----
> > From: Johannes Weiner <hannes@cmpxchg.org>
> > Sent: Thursday, November 7, 2024 9:21 AM
> > To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> > Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> > yosryahmed@google.com; nphamcs@gmail.com;
> > chengming.zhou@linux.dev; usamaarif642@gmail.com;
> > ryan.roberts@arm.com; Huang, Ying <ying.huang@intel.com>;
> > 21cnbao@gmail.com; akpm@linux-foundation.org; linux-
> > crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> > davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> > ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> > <kristen.c.accardi@intel.com>; zanussi@kernel.org; Feghali, Wajdi K
> > <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> > Subject: Re: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx
> > to be configurable in nr of acomp_reqs.
> >
> > On Wed, Nov 06, 2024 at 11:21:01AM -0800, Kanchana P Sridhar wrote:
> > > Modified the definition of "struct crypto_acomp_ctx" to represent a
> > > configurable number of acomp_reqs and the required number of buffers.
> > >
> > > Accordingly, refactored the code that allocates/deallocates the acomp_ctx
> > > resources, so that it can be called to create a regular acomp_ctx with
> > > exactly one acomp_req/buffer, for use in the the existing non-batching
> > > zswap_store(), as well as to create a separate "batching acomp_ctx" with
> > > multiple acomp_reqs/buffers for IAA compress batching.
> > >
> > > Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> > > ---
> > >  mm/zswap.c | 149 ++++++++++++++++++++++++++++++++++++++----------
> > -----
> > >  1 file changed, 107 insertions(+), 42 deletions(-)
> > >
> > > diff --git a/mm/zswap.c b/mm/zswap.c
> > > index 3e899fa61445..02e031122fdf 100644
> > > --- a/mm/zswap.c
> > > +++ b/mm/zswap.c
> > > @@ -143,9 +143,10 @@ bool zswap_never_enabled(void)
> > >
> > >  struct crypto_acomp_ctx {
> > >     struct crypto_acomp *acomp;
> > > -   struct acomp_req *req;
> > > +   struct acomp_req **reqs;
> > > +   u8 **buffers;
> > > +   unsigned int nr_reqs;
> > >     struct crypto_wait wait;
> > > -   u8 *buffer;
> > >     struct mutex mutex;
> > >     bool is_sleepable;
> > >  };
> > > @@ -241,6 +242,11 @@ static inline struct xarray
> > *swap_zswap_tree(swp_entry_t swp)
> > >     pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
> > >              zpool_get_type((p)->zpool))
> > >
> > > +static int zswap_create_acomp_ctx(unsigned int cpu,
> > > +                             struct crypto_acomp_ctx *acomp_ctx,
> > > +                             char *tfm_name,
> > > +                             unsigned int nr_reqs);
> >
> > This looks unnecessary.
>
> Thanks for the code review comments. I will make sure to avoid the
> forward declarations.
>
> >
> > > +
> > >  /*********************************
> > >  * pool functions
> > >  **********************************/
> > > @@ -813,69 +819,128 @@ static void zswap_entry_free(struct
> > zswap_entry *entry)
> > >  /*********************************
> > >  * compressed storage functions
> > >  **********************************/
> > > -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node
> > *node)
> > > +static int zswap_create_acomp_ctx(unsigned int cpu,
> > > +                             struct crypto_acomp_ctx *acomp_ctx,
> > > +                             char *tfm_name,
> > > +                             unsigned int nr_reqs)
> > >  {
> > > -   struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> > node);
> > > -   struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool-
> > >acomp_ctx, cpu);
> > >     struct crypto_acomp *acomp;
> > > -   struct acomp_req *req;
> > > -   int ret;
> > > +   int ret = -ENOMEM;
> > > +   int i, j;
> > >
> > > +   acomp_ctx->nr_reqs = 0;
> > >     mutex_init(&acomp_ctx->mutex);
> > >
> > > -   acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
> > cpu_to_node(cpu));
> > > -   if (!acomp_ctx->buffer)
> > > -           return -ENOMEM;
> > > -
> > > -   acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0,
> > cpu_to_node(cpu));
> > > +   acomp = crypto_alloc_acomp_node(tfm_name, 0, 0,
> > cpu_to_node(cpu));
> > >     if (IS_ERR(acomp)) {
> > >             pr_err("could not alloc crypto acomp %s : %ld\n",
> > > -                           pool->tfm_name, PTR_ERR(acomp));
> > > -           ret = PTR_ERR(acomp);
> > > -           goto acomp_fail;
> > > +                           tfm_name, PTR_ERR(acomp));
> > > +           return PTR_ERR(acomp);
> > >     }
> > > +
> > >     acomp_ctx->acomp = acomp;
> > >     acomp_ctx->is_sleepable = acomp_is_async(acomp);
> > >
> > > -   req = acomp_request_alloc(acomp_ctx->acomp);
> > > -   if (!req) {
> > > -           pr_err("could not alloc crypto acomp_request %s\n",
> > > -                  pool->tfm_name);
> > > -           ret = -ENOMEM;
> > > +   acomp_ctx->buffers = kmalloc_node(nr_reqs * sizeof(u8 *),
> > > +                                     GFP_KERNEL, cpu_to_node(cpu));
> > > +   if (!acomp_ctx->buffers)
> > > +           goto buf_fail;
> > > +
> > > +   for (i = 0; i < nr_reqs; ++i) {
> > > +           acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2,
> > > +                                                GFP_KERNEL,
> > cpu_to_node(cpu));
> > > +           if (!acomp_ctx->buffers[i]) {
> > > +                   for (j = 0; j < i; ++j)
> > > +                           kfree(acomp_ctx->buffers[j]);
> > > +                   kfree(acomp_ctx->buffers);
> > > +                   ret = -ENOMEM;
> > > +                   goto buf_fail;
> > > +           }
> > > +   }
> > > +
> > > +   acomp_ctx->reqs = kmalloc_node(nr_reqs * sizeof(struct acomp_req
> > *),
> > > +                                  GFP_KERNEL, cpu_to_node(cpu));
> > > +   if (!acomp_ctx->reqs)
> > >             goto req_fail;
> > > +
> > > +   for (i = 0; i < nr_reqs; ++i) {
> > > +           acomp_ctx->reqs[i] = acomp_request_alloc(acomp_ctx-
> > >acomp);
> > > +           if (!acomp_ctx->reqs[i]) {
> > > +                   pr_err("could not alloc crypto acomp_request
> > reqs[%d] %s\n",
> > > +                          i, tfm_name);
> > > +                   for (j = 0; j < i; ++j)
> > > +                           acomp_request_free(acomp_ctx->reqs[j]);
> > > +                   kfree(acomp_ctx->reqs);
> > > +                   ret = -ENOMEM;
> > > +                   goto req_fail;
> > > +           }
> > >     }
> > > -   acomp_ctx->req = req;
> > >
> > > +   /*
> > > +    * The crypto_wait is used only in fully synchronous, i.e., with scomp
> > > +    * or non-poll mode of acomp, hence there is only one "wait" per
> > > +    * acomp_ctx, with callback set to reqs[0], under the assumption that
> > > +    * there is at least 1 request per acomp_ctx.
> > > +    */
> > >     crypto_init_wait(&acomp_ctx->wait);
> > >     /*
> > >      * if the backend of acomp is async zip, crypto_req_done() will
> > wakeup
> > >      * crypto_wait_req(); if the backend of acomp is scomp, the callback
> > >      * won't be called, crypto_wait_req() will return without blocking.
> > >      */
> > > -   acomp_request_set_callback(req,
> > CRYPTO_TFM_REQ_MAY_BACKLOG,
> > > +   acomp_request_set_callback(acomp_ctx->reqs[0],
> > CRYPTO_TFM_REQ_MAY_BACKLOG,
> > >                                crypto_req_done, &acomp_ctx->wait);
> > >
> > > +   acomp_ctx->nr_reqs = nr_reqs;
> > >     return 0;
> > >
> > >  req_fail:
> > > +   for (i = 0; i < nr_reqs; ++i)
> > > +           kfree(acomp_ctx->buffers[i]);
> > > +   kfree(acomp_ctx->buffers);
> > > +buf_fail:
> > >     crypto_free_acomp(acomp_ctx->acomp);
> > > -acomp_fail:
> > > -   kfree(acomp_ctx->buffer);
> > >     return ret;
> > >  }
> > >
> > > -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node
> > *node)
> > > +static void zswap_delete_acomp_ctx(struct crypto_acomp_ctx
> > *acomp_ctx)
> > >  {
> > > -   struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> > node);
> > > -   struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool-
> > >acomp_ctx, cpu);
> > > -
> > >     if (!IS_ERR_OR_NULL(acomp_ctx)) {
> > > -           if (!IS_ERR_OR_NULL(acomp_ctx->req))
> > > -                   acomp_request_free(acomp_ctx->req);
> > > +           int i;
> > > +
> > > +           for (i = 0; i < acomp_ctx->nr_reqs; ++i)
> > > +                   if (!IS_ERR_OR_NULL(acomp_ctx->reqs[i]))
> > > +                           acomp_request_free(acomp_ctx->reqs[i]);
> > > +           kfree(acomp_ctx->reqs);
> > > +
> > > +           for (i = 0; i < acomp_ctx->nr_reqs; ++i)
> > > +                   kfree(acomp_ctx->buffers[i]);
> > > +           kfree(acomp_ctx->buffers);
> > > +
> > >             if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> > >                     crypto_free_acomp(acomp_ctx->acomp);
> > > -           kfree(acomp_ctx->buffer);
> > > +
> > > +           acomp_ctx->nr_reqs = 0;
> > > +           acomp_ctx = NULL;
> > >     }
> > > +}
> > > +
> > > +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node
> > *node)
> > > +{
> > > +   struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> > node);
> > > +   struct crypto_acomp_ctx *acomp_ctx;
> > > +
> > > +   acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> > > +   return zswap_create_acomp_ctx(cpu, acomp_ctx, pool->tfm_name,
> > 1);
> > > +}
> > > +
> > > +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node
> > *node)
> > > +{
> > > +   struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> > node);
> > > +   struct crypto_acomp_ctx *acomp_ctx;
> > > +
> > > +   acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> > > +   zswap_delete_acomp_ctx(acomp_ctx);
> > >
> > >     return 0;
> > >  }
> >
> > There are no other callers to these functions. Just do the work
> > directly in the cpu callbacks here like it used to be.
>
> There will be other callers to zswap_create_acomp_ctx() and
> zswap_delete_acomp_ctx() in patches 10 and 11 of this series, when the
> per-cpu "acomp_batch_ctx" is introduced in struct zswap_pool. I was trying
> to modularize the code first, so as to split the changes into smaller commits.
>
> The per-cpu "acomp_batch_ctx" resources are allocated in patch 11 in the
> "zswap_pool_can_batch()" function, that allocates batching resources
> for this cpu. This was to address Yosry's earlier comment about minimizing
> the memory footprint cost of batching.
>
> The way I decided to do this is by reusing the code that allocates the de-facto
> pool->acomp_ctx for the selected compressor for all cpu's in zswap_pool_create().
> However, I did not want to add the acomp_batch_ctx multiple reqs/buffers
> allocation to the cpuhp_state_add_instance() code path which would incur the
> memory cost on all cpu's.
>
> Instead, the approach I chose to follow is to allocate the batching resources
> in patch 11 only as needed, on "a given cpu" that has to store a large folio. Hope
> this explains the purpose of the modularization better.
>
> Other ideas towards accomplishing this are very welcome.

If we remove the sysctl as suggested by Johannes, then we can just
allocate the number of buffers based on the compressor and whether it
supports batching during the pool initialization in the cpu callbacks
only.

Right?

>
> Thanks,
> Kanchana
>
> >
> > Otherwise it looks good to me.
RE: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx to be configurable in nr of acomp_reqs.
Posted by Sridhar, Kanchana P 2 weeks, 5 days ago
> -----Original Message-----
> From: Yosry Ahmed <yosryahmed@google.com>
> Sent: Friday, November 8, 2024 12:22 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>; linux-kernel@vger.kernel.org;
> linux-mm@kvack.org; nphamcs@gmail.com; chengming.zhou@linux.dev;
> usamaarif642@gmail.com; ryan.roberts@arm.com; Huang, Ying
> <ying.huang@intel.com>; 21cnbao@gmail.com; akpm@linux-foundation.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; zanussi@kernel.org; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx
> to be configurable in nr of acomp_reqs.
> 
> On Thu, Nov 7, 2024 at 2:21 PM Sridhar, Kanchana P
> <kanchana.p.sridhar@intel.com> wrote:
> >
> > Hi Johannes,
> >
> > > -----Original Message-----
> > > From: Johannes Weiner <hannes@cmpxchg.org>
> > > Sent: Thursday, November 7, 2024 9:21 AM
> > > To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> > > Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> > > yosryahmed@google.com; nphamcs@gmail.com;
> > > chengming.zhou@linux.dev; usamaarif642@gmail.com;
> > > ryan.roberts@arm.com; Huang, Ying <ying.huang@intel.com>;
> > > 21cnbao@gmail.com; akpm@linux-foundation.org; linux-
> > > crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> > > davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> > > ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> > > <kristen.c.accardi@intel.com>; zanussi@kernel.org; Feghali, Wajdi K
> > > <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> > > Subject: Re: [PATCH v3 09/13] mm: zswap: Modify struct
> crypto_acomp_ctx
> > > to be configurable in nr of acomp_reqs.
> > >
> > > On Wed, Nov 06, 2024 at 11:21:01AM -0800, Kanchana P Sridhar wrote:
> > > > Modified the definition of "struct crypto_acomp_ctx" to represent a
> > > > configurable number of acomp_reqs and the required number of buffers.
> > > >
> > > > Accordingly, refactored the code that allocates/deallocates the
> acomp_ctx
> > > > resources, so that it can be called to create a regular acomp_ctx with
> > > > exactly one acomp_req/buffer, for use in the the existing non-batching
> > > > zswap_store(), as well as to create a separate "batching acomp_ctx"
> with
> > > > multiple acomp_reqs/buffers for IAA compress batching.
> > > >
> > > > Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> > > > ---
> > > >  mm/zswap.c | 149 ++++++++++++++++++++++++++++++++++++++-----
> -----
> > > -----
> > > >  1 file changed, 107 insertions(+), 42 deletions(-)
> > > >
> > > > diff --git a/mm/zswap.c b/mm/zswap.c
> > > > index 3e899fa61445..02e031122fdf 100644
> > > > --- a/mm/zswap.c
> > > > +++ b/mm/zswap.c
> > > > @@ -143,9 +143,10 @@ bool zswap_never_enabled(void)
> > > >
> > > >  struct crypto_acomp_ctx {
> > > >     struct crypto_acomp *acomp;
> > > > -   struct acomp_req *req;
> > > > +   struct acomp_req **reqs;
> > > > +   u8 **buffers;
> > > > +   unsigned int nr_reqs;
> > > >     struct crypto_wait wait;
> > > > -   u8 *buffer;
> > > >     struct mutex mutex;
> > > >     bool is_sleepable;
> > > >  };
> > > > @@ -241,6 +242,11 @@ static inline struct xarray
> > > *swap_zswap_tree(swp_entry_t swp)
> > > >     pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
> > > >              zpool_get_type((p)->zpool))
> > > >
> > > > +static int zswap_create_acomp_ctx(unsigned int cpu,
> > > > +                             struct crypto_acomp_ctx *acomp_ctx,
> > > > +                             char *tfm_name,
> > > > +                             unsigned int nr_reqs);
> > >
> > > This looks unnecessary.
> >
> > Thanks for the code review comments. I will make sure to avoid the
> > forward declarations.
> >
> > >
> > > > +
> > > >  /*********************************
> > > >  * pool functions
> > > >  **********************************/
> > > > @@ -813,69 +819,128 @@ static void zswap_entry_free(struct
> > > zswap_entry *entry)
> > > >  /*********************************
> > > >  * compressed storage functions
> > > >  **********************************/
> > > > -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node
> > > *node)
> > > > +static int zswap_create_acomp_ctx(unsigned int cpu,
> > > > +                             struct crypto_acomp_ctx *acomp_ctx,
> > > > +                             char *tfm_name,
> > > > +                             unsigned int nr_reqs)
> > > >  {
> > > > -   struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> > > node);
> > > > -   struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool-
> > > >acomp_ctx, cpu);
> > > >     struct crypto_acomp *acomp;
> > > > -   struct acomp_req *req;
> > > > -   int ret;
> > > > +   int ret = -ENOMEM;
> > > > +   int i, j;
> > > >
> > > > +   acomp_ctx->nr_reqs = 0;
> > > >     mutex_init(&acomp_ctx->mutex);
> > > >
> > > > -   acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL,
> > > cpu_to_node(cpu));
> > > > -   if (!acomp_ctx->buffer)
> > > > -           return -ENOMEM;
> > > > -
> > > > -   acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0,
> > > cpu_to_node(cpu));
> > > > +   acomp = crypto_alloc_acomp_node(tfm_name, 0, 0,
> > > cpu_to_node(cpu));
> > > >     if (IS_ERR(acomp)) {
> > > >             pr_err("could not alloc crypto acomp %s : %ld\n",
> > > > -                           pool->tfm_name, PTR_ERR(acomp));
> > > > -           ret = PTR_ERR(acomp);
> > > > -           goto acomp_fail;
> > > > +                           tfm_name, PTR_ERR(acomp));
> > > > +           return PTR_ERR(acomp);
> > > >     }
> > > > +
> > > >     acomp_ctx->acomp = acomp;
> > > >     acomp_ctx->is_sleepable = acomp_is_async(acomp);
> > > >
> > > > -   req = acomp_request_alloc(acomp_ctx->acomp);
> > > > -   if (!req) {
> > > > -           pr_err("could not alloc crypto acomp_request %s\n",
> > > > -                  pool->tfm_name);
> > > > -           ret = -ENOMEM;
> > > > +   acomp_ctx->buffers = kmalloc_node(nr_reqs * sizeof(u8 *),
> > > > +                                     GFP_KERNEL, cpu_to_node(cpu));
> > > > +   if (!acomp_ctx->buffers)
> > > > +           goto buf_fail;
> > > > +
> > > > +   for (i = 0; i < nr_reqs; ++i) {
> > > > +           acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE * 2,
> > > > +                                                GFP_KERNEL,
> > > cpu_to_node(cpu));
> > > > +           if (!acomp_ctx->buffers[i]) {
> > > > +                   for (j = 0; j < i; ++j)
> > > > +                           kfree(acomp_ctx->buffers[j]);
> > > > +                   kfree(acomp_ctx->buffers);
> > > > +                   ret = -ENOMEM;
> > > > +                   goto buf_fail;
> > > > +           }
> > > > +   }
> > > > +
> > > > +   acomp_ctx->reqs = kmalloc_node(nr_reqs * sizeof(struct acomp_req
> > > *),
> > > > +                                  GFP_KERNEL, cpu_to_node(cpu));
> > > > +   if (!acomp_ctx->reqs)
> > > >             goto req_fail;
> > > > +
> > > > +   for (i = 0; i < nr_reqs; ++i) {
> > > > +           acomp_ctx->reqs[i] = acomp_request_alloc(acomp_ctx-
> > > >acomp);
> > > > +           if (!acomp_ctx->reqs[i]) {
> > > > +                   pr_err("could not alloc crypto acomp_request
> > > reqs[%d] %s\n",
> > > > +                          i, tfm_name);
> > > > +                   for (j = 0; j < i; ++j)
> > > > +                           acomp_request_free(acomp_ctx->reqs[j]);
> > > > +                   kfree(acomp_ctx->reqs);
> > > > +                   ret = -ENOMEM;
> > > > +                   goto req_fail;
> > > > +           }
> > > >     }
> > > > -   acomp_ctx->req = req;
> > > >
> > > > +   /*
> > > > +    * The crypto_wait is used only in fully synchronous, i.e., with scomp
> > > > +    * or non-poll mode of acomp, hence there is only one "wait" per
> > > > +    * acomp_ctx, with callback set to reqs[0], under the assumption that
> > > > +    * there is at least 1 request per acomp_ctx.
> > > > +    */
> > > >     crypto_init_wait(&acomp_ctx->wait);
> > > >     /*
> > > >      * if the backend of acomp is async zip, crypto_req_done() will
> > > wakeup
> > > >      * crypto_wait_req(); if the backend of acomp is scomp, the callback
> > > >      * won't be called, crypto_wait_req() will return without blocking.
> > > >      */
> > > > -   acomp_request_set_callback(req,
> > > CRYPTO_TFM_REQ_MAY_BACKLOG,
> > > > +   acomp_request_set_callback(acomp_ctx->reqs[0],
> > > CRYPTO_TFM_REQ_MAY_BACKLOG,
> > > >                                crypto_req_done, &acomp_ctx->wait);
> > > >
> > > > +   acomp_ctx->nr_reqs = nr_reqs;
> > > >     return 0;
> > > >
> > > >  req_fail:
> > > > +   for (i = 0; i < nr_reqs; ++i)
> > > > +           kfree(acomp_ctx->buffers[i]);
> > > > +   kfree(acomp_ctx->buffers);
> > > > +buf_fail:
> > > >     crypto_free_acomp(acomp_ctx->acomp);
> > > > -acomp_fail:
> > > > -   kfree(acomp_ctx->buffer);
> > > >     return ret;
> > > >  }
> > > >
> > > > -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node
> > > *node)
> > > > +static void zswap_delete_acomp_ctx(struct crypto_acomp_ctx
> > > *acomp_ctx)
> > > >  {
> > > > -   struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> > > node);
> > > > -   struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool-
> > > >acomp_ctx, cpu);
> > > > -
> > > >     if (!IS_ERR_OR_NULL(acomp_ctx)) {
> > > > -           if (!IS_ERR_OR_NULL(acomp_ctx->req))
> > > > -                   acomp_request_free(acomp_ctx->req);
> > > > +           int i;
> > > > +
> > > > +           for (i = 0; i < acomp_ctx->nr_reqs; ++i)
> > > > +                   if (!IS_ERR_OR_NULL(acomp_ctx->reqs[i]))
> > > > +                           acomp_request_free(acomp_ctx->reqs[i]);
> > > > +           kfree(acomp_ctx->reqs);
> > > > +
> > > > +           for (i = 0; i < acomp_ctx->nr_reqs; ++i)
> > > > +                   kfree(acomp_ctx->buffers[i]);
> > > > +           kfree(acomp_ctx->buffers);
> > > > +
> > > >             if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> > > >                     crypto_free_acomp(acomp_ctx->acomp);
> > > > -           kfree(acomp_ctx->buffer);
> > > > +
> > > > +           acomp_ctx->nr_reqs = 0;
> > > > +           acomp_ctx = NULL;
> > > >     }
> > > > +}
> > > > +
> > > > +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node
> > > *node)
> > > > +{
> > > > +   struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> > > node);
> > > > +   struct crypto_acomp_ctx *acomp_ctx;
> > > > +
> > > > +   acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> > > > +   return zswap_create_acomp_ctx(cpu, acomp_ctx, pool->tfm_name,
> > > 1);
> > > > +}
> > > > +
> > > > +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node
> > > *node)
> > > > +{
> > > > +   struct zswap_pool *pool = hlist_entry(node, struct zswap_pool,
> > > node);
> > > > +   struct crypto_acomp_ctx *acomp_ctx;
> > > > +
> > > > +   acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> > > > +   zswap_delete_acomp_ctx(acomp_ctx);
> > > >
> > > >     return 0;
> > > >  }
> > >
> > > There are no other callers to these functions. Just do the work
> > > directly in the cpu callbacks here like it used to be.
> >
> > There will be other callers to zswap_create_acomp_ctx() and
> > zswap_delete_acomp_ctx() in patches 10 and 11 of this series, when the
> > per-cpu "acomp_batch_ctx" is introduced in struct zswap_pool. I was trying
> > to modularize the code first, so as to split the changes into smaller commits.
> >
> > The per-cpu "acomp_batch_ctx" resources are allocated in patch 11 in the
> > "zswap_pool_can_batch()" function, that allocates batching resources
> > for this cpu. This was to address Yosry's earlier comment about minimizing
> > the memory footprint cost of batching.
> >
> > The way I decided to do this is by reusing the code that allocates the de-
> facto
> > pool->acomp_ctx for the selected compressor for all cpu's in
> zswap_pool_create().
> > However, I did not want to add the acomp_batch_ctx multiple reqs/buffers
> > allocation to the cpuhp_state_add_instance() code path which would incur
> the
> > memory cost on all cpu's.
> >
> > Instead, the approach I chose to follow is to allocate the batching resources
> > in patch 11 only as needed, on "a given cpu" that has to store a large folio.
> Hope
> > this explains the purpose of the modularization better.
> >
> > Other ideas towards accomplishing this are very welcome.
> 
> If we remove the sysctl as suggested by Johannes, then we can just
> allocate the number of buffers based on the compressor and whether it
> supports batching during the pool initialization in the cpu callbacks
> only.
> 
> Right?

Yes, we could do that if the sysctl is removed, as suggested by Johannes.
The only "drawback" of allocating the batching resources (assuming the
compressor allows batching) would be that the memory footprint penalty
would be incurred on every cpu. I was trying to further economize this
cost based on whether a given cpu actually needs to zswap_store() a
large folio, and only then allocate the batching resources, though I am
not sure if this would benefit any usage model.

If we agree the pool initialization is the best place to allocate the batching
resources, then I will make this change in v4.
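
Roughly what I have in mind for the cpu callback in that case (sketch only;
zswap_compressor_supports_batching() and ZSWAP_MAX_BATCH_SIZE are
placeholders for whatever check and batch size we settle on, not existing
interfaces):

static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
	unsigned int nr_reqs = zswap_compressor_supports_batching(pool) ?
			       ZSWAP_MAX_BATCH_SIZE : 1;

	return zswap_create_acomp_ctx(cpu, acomp_ctx, pool->tfm_name, nr_reqs);
}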

Thanks,
Kanchana

> 
> >
> > Thanks,
> > Kanchana
> >
> > >
> > > Otherwise it looks good to me.
Re: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx to be configurable in nr of acomp_reqs.
Posted by Yosry Ahmed 2 weeks, 5 days ago
[..]
> > > >
> > > > There are no other callers to these functions. Just do the work
> > > > directly in the cpu callbacks here like it used to be.
> > >
> > > There will be other callers to zswap_create_acomp_ctx() and
> > > zswap_delete_acomp_ctx() in patches 10 and 11 of this series, when the
> > > per-cpu "acomp_batch_ctx" is introduced in struct zswap_pool. I was trying
> > > to modularize the code first, so as to split the changes into smaller commits.
> > >
> > > The per-cpu "acomp_batch_ctx" resources are allocated in patch 11 in the
> > > "zswap_pool_can_batch()" function, that allocates batching resources
> > > for this cpu. This was to address Yosry's earlier comment about minimizing
> > > the memory footprint cost of batching.
> > >
> > > The way I decided to do this is by reusing the code that allocates the de-
> > facto
> > > pool->acomp_ctx for the selected compressor for all cpu's in
> > zswap_pool_create().
> > > However, I did not want to add the acomp_batch_ctx multiple reqs/buffers
> > > allocation to the cpuhp_state_add_instance() code path which would incur
> > the
> > > memory cost on all cpu's.
> > >
> > > Instead, the approach I chose to follow is to allocate the batching resources
> > > in patch 11 only as needed, on "a given cpu" that has to store a large folio.
> > Hope
> > > this explains the purpose of the modularization better.
> > >
> > > Other ideas towards accomplishing this are very welcome.
> >
> > If we remove the sysctl as suggested by Johannes, then we can just
> > allocate the number of buffers based on the compressor and whether it
> > supports batching during the pool initialization in the cpu callbacks
> > only.
> >
> > Right?
>
> Yes, we could do that if the sysctl is removed, as suggested by Johannes.
> The only "drawback" of allocating the batching resources (assuming the
> compressor allows batching) would be that the memory footprint penalty
> would be incurred on every cpu. I was trying to further economize this
> cost based on whether a given cpu actually needs to zswap_store() a
> large folio, and only then allocate the batching resources. Although, I am
> not sure if this would benefit any usage model.
>
> If we agree the pool initialization is the best place to allocate the batching
> resources, then I will make this change in v4.

IIUC the additional cost would apply if someone wants to use
deflate-iaa on hardware that supports batching but does not want to
use batching. I don't think catering to such a use case warrants the
complexity in advance, not until we have a user that genuinely cares.
RE: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx to be configurable in nr of acomp_reqs.
Posted by Sridhar, Kanchana P 2 weeks, 4 days ago
> -----Original Message-----
> From: Yosry Ahmed <yosryahmed@google.com>
> Sent: Friday, November 8, 2024 2:54 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>; linux-kernel@vger.kernel.org;
> linux-mm@kvack.org; nphamcs@gmail.com; chengming.zhou@linux.dev;
> usamaarif642@gmail.com; ryan.roberts@arm.com; Huang, Ying
> <ying.huang@intel.com>; 21cnbao@gmail.com; akpm@linux-foundation.org;
> linux-crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; zanussi@kernel.org; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>; Gopal, Vinodh <vinodh.gopal@intel.com>
> Subject: Re: [PATCH v3 09/13] mm: zswap: Modify struct crypto_acomp_ctx
> to be configurable in nr of acomp_reqs.
> 
> [..]
> > > > >
> > > > > There are no other callers to these functions. Just do the work
> > > > > directly in the cpu callbacks here like it used to be.
> > > >
> > > > There will be other callers to zswap_create_acomp_ctx() and
> > > > zswap_delete_acomp_ctx() in patches 10 and 11 of this series, when the
> > > > per-cpu "acomp_batch_ctx" is introduced in struct zswap_pool. I was
> trying
> > > > to modularize the code first, so as to split the changes into smaller
> commits.
> > > >
> > > > The per-cpu "acomp_batch_ctx" resources are allocated in patch 11 in
> the
> > > > "zswap_pool_can_batch()" function, that allocates batching resources
> > > > for this cpu. This was to address Yosry's earlier comment about
> minimizing
> > > > the memory footprint cost of batching.
> > > >
> > > > The way I decided to do this is by reusing the code that allocates the de-
> > > facto
> > > > pool->acomp_ctx for the selected compressor for all cpu's in
> > > zswap_pool_create().
> > > > However, I did not want to add the acomp_batch_ctx multiple
> reqs/buffers
> > > > allocation to the cpuhp_state_add_instance() code path which would
> incur
> > > the
> > > > memory cost on all cpu's.
> > > >
> > > > Instead, the approach I chose to follow is to allocate the batching
> resources
> > > > in patch 11 only as needed, on "a given cpu" that has to store a large
> folio.
> > > Hope
> > > > this explains the purpose of the modularization better.
> > > >
> > > > Other ideas towards accomplishing this are very welcome.
> > >
> > > If we remove the sysctl as suggested by Johannes, then we can just
> > > allocate the number of buffers based on the compressor and whether it
> > > supports batching during the pool initialization in the cpu callbacks
> > > only.
> > >
> > > Right?
> >
> > Yes, we could do that if the sysctl is removed, as suggested by Johannes.
> > The only "drawback" of allocating the batching resources (assuming the
> > compressor allows batching) would be that the memory footprint penalty
> > would be incurred on every cpu. I was trying to further economize this
> > cost based on whether a given cpu actually needs to zswap_store() a
> > large folio, and only then allocate the batching resources. Although, I am
> > not sure if this would benefit any usage model.
> >
> > If we agree the pool initialization is the best place to allocate the batching
> > resources, then I will make this change in v4.
> 
> IIUC the additional cost would apply if someone wants to use
> deflate-iaa on hardware that supports batching but does not want to
> use batching. I don't think catering to such a use case warrants the
> complexity in advance, not until we have a user that genuinely cares.

Sure, this makes sense. I will address this in v4.

Thanks,
Kanchana