From nobody Thu Oct 2 00:50:47 2025 Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.13]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 72E881D63C2; Fri, 26 Sep 2025 03:35:06 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.13 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857709; cv=none; b=UbnMKYbOxObY9ABFOZHrT4Ud72twJ8AeYMFaCWzGLbWVaUP+MmYW1MfxkNzkfEjfU8Oqzm4rLw47iBXxfoVeJSSmHw93cwicyl8OsCJ21NE1xgqdxrGbkDx1rHS6usNf6jJ6s7FvquwiuJUTs/WXd9GC2DVJD5B74UVW2o5G22Y= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857709; c=relaxed/simple; bh=pev2rWbFSDQItiAHUxDqBId8SFZZt9oAc87aO1i4JWM=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=cDE/1HVbv/7u+IECh4DFWTwS0LW7D1A52soqKE1AON61mJJUxB24u7W9U7WA14Uo/3fXf7VpBoGCjkpAOPJOAA24F8CTpIWV9VhOV7g7eXscHYGpDrRalrlVk0vXSMLHwpskEnwFt2Ex2ZcPZ+oDVodD3JSLa+cDX91rbzKOyaI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=k9Db4b8P; arc=none smtp.client-ip=192.198.163.13 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="k9Db4b8P" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1758857707; x=1790393707; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=pev2rWbFSDQItiAHUxDqBId8SFZZt9oAc87aO1i4JWM=; b=k9Db4b8PTNfQmBoTQtScIABongeM+tiPrb/DwxSzqEJa2gF5aO6llGRB 2QvVwK7TSI0Ph92GItPrtbiLpoO4u7ALcmXJzEZVWp/dD5VWrSw3hsIoL T/Lg37uoBxBJj3qjlv7zakTzIWCSRAeuqaD0IMNAf8CnMpa8S9CbFS+fI jSqFwIjRbZbWfa8PoYYekalESrm1odqf+GOyLmt7I2QEZ7Kt6+7fwZCDM 2yN1leGZ+5sMriOfg4FBEQ8Uy3Up9uiN4o3AD59JY7815/R+WMVdKswEM S6ks6B1xkcd6YI114ZcC4g+8s8F5f5ryWQ5f8R0wtMGpFNu4Mnr648WZv Q==; X-CSE-ConnectionGUID: 4wvqXRvjQ4umrdegmOfJOw== X-CSE-MsgGUID: Aq59FD7HSnq7aqf8MGwNvQ== X-IronPort-AV: E=McAfee;i="6800,10657,11564"; a="63819403" X-IronPort-AV: E=Sophos;i="6.18,294,1751266800"; d="scan'208";a="63819403" Received: from orviesa001.jf.intel.com ([10.64.159.141]) by fmvoesa107.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 25 Sep 2025 20:35:04 -0700 X-CSE-ConnectionGUID: 1T5PToXfQ1a7NhfCz7mXig== X-CSE-MsgGUID: Tu1BiAufQF+RvwZRNZ4/eg== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="6.18,294,1751266800"; d="scan'208";a="214636546" Received: from jf5300-b11a338t.jf.intel.com ([10.242.51.115]) by orviesa001.jf.intel.com with ESMTP; 25 Sep 2025 20:35:02 -0700 From: Kanchana P Sridhar To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, hannes@cmpxchg.org, yosry.ahmed@linux.dev, nphamcs@gmail.com, chengming.zhou@linux.dev, usamaarif642@gmail.com, ryan.roberts@arm.com, 21cnbao@gmail.com, ying.huang@linux.alibaba.com, akpm@linux-foundation.org, senozhatsky@chromium.org, sj@kernel.org, kasong@tencent.com, linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au, davem@davemloft.net, clabbe@baylibre.com, ardb@kernel.org, ebiggers@google.com, surenb@google.com, kristen.c.accardi@intel.com, vinicius.gomes@intel.com Cc: 
wajdi.k.feghali@intel.com, vinodh.gopal@intel.com, kanchana.p.sridhar@intel.com Subject: [PATCH v12 01/23] crypto: iaa - Reorganize the iaa_crypto driver code. Date: Thu, 25 Sep 2025 20:34:40 -0700 Message-Id: <20250926033502.7486-2-kanchana.p.sridhar@intel.com> X-Mailer: git-send-email 2.27.0 In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" This patch merely reorganizes the code in iaa_crypto_main.c, so that the functions are consolidated into logically related sub-sections of code, without requiring forward declarations. This is expected to make the code more maintainable and for it to be easier to replace functional layers and/or add new features. Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/iaa_crypto_main.c | 677 +++++++++++---------- 1 file changed, 350 insertions(+), 327 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index 23f585219fb4..760997eee8fe 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -24,6 +24,10 @@ =20 #define IAA_ALG_PRIORITY 300 =20 +/************************************** + * Driver internal global variables. + **************************************/ + /* number of iaa instances probed */ static unsigned int nr_iaa; static unsigned int nr_cpus; @@ -36,54 +40,6 @@ static unsigned int cpus_per_iaa; /* Per-cpu lookup table for balanced wqs */ static struct wq_table_entry __percpu *wq_table; =20 -static struct idxd_wq *wq_table_next_wq(int cpu) -{ - struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); - - if (++entry->cur_wq >=3D entry->n_wqs) - entry->cur_wq =3D 0; - - if (!entry->wqs[entry->cur_wq]) - return NULL; - - pr_debug("%s: returning wq at idx %d (iaa wq %d.%d) from cpu %d\n", __fun= c__, - entry->cur_wq, entry->wqs[entry->cur_wq]->idxd->id, - entry->wqs[entry->cur_wq]->id, cpu); - - return entry->wqs[entry->cur_wq]; -} - -static void wq_table_add(int cpu, struct idxd_wq *wq) -{ - struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); - - if (WARN_ON(entry->n_wqs =3D=3D entry->max_wqs)) - return; - - entry->wqs[entry->n_wqs++] =3D wq; - - pr_debug("%s: added iaa wq %d.%d to idx %d of cpu %d\n", __func__, - entry->wqs[entry->n_wqs - 1]->idxd->id, - entry->wqs[entry->n_wqs - 1]->id, entry->n_wqs - 1, cpu); -} - -static void wq_table_free_entry(int cpu) -{ - struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); - - kfree(entry->wqs); - memset(entry, 0, sizeof(*entry)); -} - -static void wq_table_clear_entry(int cpu) -{ - struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); - - entry->n_wqs =3D 0; - entry->cur_wq =3D 0; - memset(entry->wqs, 0, entry->max_wqs * sizeof(struct idxd_wq *)); -} - LIST_HEAD(iaa_devices); DEFINE_MUTEX(iaa_devices_lock); =20 @@ -91,36 +47,11 @@ DEFINE_MUTEX(iaa_devices_lock); static bool iaa_crypto_enabled; static bool iaa_crypto_registered; =20 +static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_M= AX]; + /* Verify results of IAA compress or not */ static bool iaa_verify_compress =3D true; =20 -static ssize_t verify_compress_show(struct device_driver *driver, char *bu= f) -{ - return sprintf(buf, "%d\n", iaa_verify_compress); -} - -static ssize_t 
verify_compress_store(struct device_driver *driver, - const char *buf, size_t count) -{ - int ret =3D -EBUSY; - - mutex_lock(&iaa_devices_lock); - - if (iaa_crypto_enabled) - goto out; - - ret =3D kstrtobool(buf, &iaa_verify_compress); - if (ret) - goto out; - - ret =3D count; -out: - mutex_unlock(&iaa_devices_lock); - - return ret; -} -static DRIVER_ATTR_RW(verify_compress); - /* * The iaa crypto driver supports three 'sync' methods determining how * compressions and decompressions are performed: @@ -155,6 +86,37 @@ static bool async_mode; /* Use interrupts */ static bool use_irq; =20 +/************************************************** + * Driver attributes along with get/set functions. + **************************************************/ + +static ssize_t verify_compress_show(struct device_driver *driver, char *bu= f) +{ + return sprintf(buf, "%d\n", iaa_verify_compress); +} + +static ssize_t verify_compress_store(struct device_driver *driver, + const char *buf, size_t count) +{ + int ret =3D -EBUSY; + + mutex_lock(&iaa_devices_lock); + + if (iaa_crypto_enabled) + goto out; + + ret =3D kstrtobool(buf, &iaa_verify_compress); + if (ret) + goto out; + + ret =3D count; +out: + mutex_unlock(&iaa_devices_lock); + + return ret; +} +static DRIVER_ATTR_RW(verify_compress); + /** * set_iaa_sync_mode - Set IAA sync mode * @name: The name of the sync mode @@ -217,7 +179,9 @@ static ssize_t sync_mode_store(struct device_driver *dr= iver, } static DRIVER_ATTR_RW(sync_mode); =20 -static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_M= AX]; +/**************************** + * Driver compression modes. + ****************************/ =20 static int find_empty_iaa_compression_mode(void) { @@ -409,11 +373,6 @@ static void free_device_compression_mode(struct iaa_de= vice *iaa_device, IDXD_OP_FLAG_WR_SRC2_AECS_COMP | \ IDXD_OP_FLAG_AECS_RW_TGLS) =20 -static int check_completion(struct device *dev, - struct iax_completion_record *comp, - bool compress, - bool only_once); - static int init_device_compression_mode(struct iaa_device *iaa_device, struct iaa_compression_mode *mode, int idx, struct idxd_wq *wq) @@ -500,6 +459,11 @@ static void remove_device_compression_modes(struct iaa= _device *iaa_device) } } =20 +/*********************************************************** + * Functions for use in crypto probe and remove interfaces: + * allocate/init/query/deallocate devices/wqs. 
+ ***********************************************************/ + static struct iaa_device *iaa_device_alloc(void) { struct iaa_device *iaa_device; @@ -513,18 +477,6 @@ static struct iaa_device *iaa_device_alloc(void) return iaa_device; } =20 -static bool iaa_has_wq(struct iaa_device *iaa_device, struct idxd_wq *wq) -{ - struct iaa_wq *iaa_wq; - - list_for_each_entry(iaa_wq, &iaa_device->wqs, list) { - if (iaa_wq->wq =3D=3D wq) - return true; - } - - return false; -} - static struct iaa_device *add_iaa_device(struct idxd_device *idxd) { struct iaa_device *iaa_device; @@ -560,6 +512,27 @@ static void del_iaa_device(struct iaa_device *iaa_devi= ce) nr_iaa--; } =20 +static void free_iaa_device(struct iaa_device *iaa_device) +{ + if (!iaa_device) + return; + + remove_device_compression_modes(iaa_device); + kfree(iaa_device); +} + +static bool iaa_has_wq(struct iaa_device *iaa_device, struct idxd_wq *wq) +{ + struct iaa_wq *iaa_wq; + + list_for_each_entry(iaa_wq, &iaa_device->wqs, list) { + if (iaa_wq->wq =3D=3D wq) + return true; + } + + return false; +} + static int add_iaa_wq(struct iaa_device *iaa_device, struct idxd_wq *wq, struct iaa_wq **new_wq) { @@ -612,23 +585,23 @@ static void del_iaa_wq(struct iaa_device *iaa_device,= struct idxd_wq *wq) } } =20 -static void clear_wq_table(void) +static void remove_iaa_wq(struct idxd_wq *wq) { - int cpu; - - for (cpu =3D 0; cpu < nr_cpus; cpu++) - wq_table_clear_entry(cpu); - - pr_debug("cleared wq table\n"); -} + struct iaa_device *iaa_device; =20 -static void free_iaa_device(struct iaa_device *iaa_device) -{ - if (!iaa_device) - return; + list_for_each_entry(iaa_device, &iaa_devices, list) { + if (iaa_has_wq(iaa_device, wq)) { + del_iaa_wq(iaa_device, wq); + break; + } + } =20 - remove_device_compression_modes(iaa_device); - kfree(iaa_device); + if (nr_iaa) { + cpus_per_iaa =3D (nr_nodes * nr_cpus_per_node) / nr_iaa; + if (!cpus_per_iaa) + cpus_per_iaa =3D 1; + } else + cpus_per_iaa =3D 1; } =20 static void __free_iaa_wq(struct iaa_wq *iaa_wq) @@ -655,6 +628,75 @@ static void free_iaa_wq(struct iaa_wq *iaa_wq) idxd_wq_set_private(wq, NULL); } =20 +static int save_iaa_wq(struct idxd_wq *wq) +{ + struct iaa_device *iaa_device, *found =3D NULL; + struct idxd_device *idxd; + struct pci_dev *pdev; + struct device *dev; + int ret =3D 0; + + list_for_each_entry(iaa_device, &iaa_devices, list) { + if (iaa_device->idxd =3D=3D wq->idxd) { + idxd =3D iaa_device->idxd; + pdev =3D idxd->pdev; + dev =3D &pdev->dev; + /* + * Check to see that we don't already have this wq. + * Shouldn't happen but we don't control probing. 
+ */ + if (iaa_has_wq(iaa_device, wq)) { + dev_dbg(dev, "same wq probed multiple times for iaa_device %p\n", + iaa_device); + goto out; + } + + found =3D iaa_device; + + ret =3D add_iaa_wq(iaa_device, wq, NULL); + if (ret) + goto out; + + break; + } + } + + if (!found) { + struct iaa_device *new_device; + struct iaa_wq *new_wq; + + new_device =3D add_iaa_device(wq->idxd); + if (!new_device) { + ret =3D -ENOMEM; + goto out; + } + + ret =3D add_iaa_wq(new_device, wq, &new_wq); + if (ret) { + del_iaa_device(new_device); + free_iaa_device(new_device); + goto out; + } + + ret =3D init_iaa_device(new_device, new_wq); + if (ret) { + del_iaa_wq(new_device, new_wq->wq); + del_iaa_device(new_device); + free_iaa_wq(new_wq); + goto out; + } + } + + if (WARN_ON(nr_iaa =3D=3D 0)) + return -EINVAL; + + cpus_per_iaa =3D (nr_nodes * nr_cpus_per_node) / nr_iaa; + if (!cpus_per_iaa) + cpus_per_iaa =3D 1; +out: + return 0; +} + static int iaa_wq_get(struct idxd_wq *wq) { struct idxd_device *idxd =3D wq->idxd; @@ -702,6 +744,37 @@ static int iaa_wq_put(struct idxd_wq *wq) return ret; } =20 +/*************************************************************** + * Mapping IAA devices and wqs to cores with per-cpu wq_tables. + ***************************************************************/ + +static void wq_table_free_entry(int cpu) +{ + struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); + + kfree(entry->wqs); + memset(entry, 0, sizeof(*entry)); +} + +static void wq_table_clear_entry(int cpu) +{ + struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); + + entry->n_wqs =3D 0; + entry->cur_wq =3D 0; + memset(entry->wqs, 0, entry->max_wqs * sizeof(struct idxd_wq *)); +} + +static void clear_wq_table(void) +{ + int cpu; + + for (cpu =3D 0; cpu < nr_cpus; cpu++) + wq_table_clear_entry(cpu); + + pr_debug("cleared wq table\n"); +} + static void free_wq_table(void) { int cpu; @@ -739,92 +812,18 @@ static int alloc_wq_table(int max_wqs) return 0; } =20 -static int save_iaa_wq(struct idxd_wq *wq) +static void wq_table_add(int cpu, struct idxd_wq *wq) { - struct iaa_device *iaa_device, *found =3D NULL; - struct idxd_device *idxd; - struct pci_dev *pdev; - struct device *dev; - int ret =3D 0; - - list_for_each_entry(iaa_device, &iaa_devices, list) { - if (iaa_device->idxd =3D=3D wq->idxd) { - idxd =3D iaa_device->idxd; - pdev =3D idxd->pdev; - dev =3D &pdev->dev; - /* - * Check to see that we don't already have this wq. - * Shouldn't happen but we don't control probing. 
- */ - if (iaa_has_wq(iaa_device, wq)) { - dev_dbg(dev, "same wq probed multiple times for iaa_device %p\n", - iaa_device); - goto out; - } - - found =3D iaa_device; - - ret =3D add_iaa_wq(iaa_device, wq, NULL); - if (ret) - goto out; - - break; - } - } - - if (!found) { - struct iaa_device *new_device; - struct iaa_wq *new_wq; - - new_device =3D add_iaa_device(wq->idxd); - if (!new_device) { - ret =3D -ENOMEM; - goto out; - } - - ret =3D add_iaa_wq(new_device, wq, &new_wq); - if (ret) { - del_iaa_device(new_device); - free_iaa_device(new_device); - goto out; - } - - ret =3D init_iaa_device(new_device, new_wq); - if (ret) { - del_iaa_wq(new_device, new_wq->wq); - del_iaa_device(new_device); - free_iaa_wq(new_wq); - goto out; - } - } - - if (WARN_ON(nr_iaa =3D=3D 0)) - return -EINVAL; - - cpus_per_iaa =3D (nr_nodes * nr_cpus_per_node) / nr_iaa; - if (!cpus_per_iaa) - cpus_per_iaa =3D 1; -out: - return 0; -} + struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); =20 -static void remove_iaa_wq(struct idxd_wq *wq) -{ - struct iaa_device *iaa_device; + if (WARN_ON(entry->n_wqs =3D=3D entry->max_wqs)) + return; =20 - list_for_each_entry(iaa_device, &iaa_devices, list) { - if (iaa_has_wq(iaa_device, wq)) { - del_iaa_wq(iaa_device, wq); - break; - } - } + entry->wqs[entry->n_wqs++] =3D wq; =20 - if (nr_iaa) { - cpus_per_iaa =3D (nr_nodes * nr_cpus_per_node) / nr_iaa; - if (!cpus_per_iaa) - cpus_per_iaa =3D 1; - } else - cpus_per_iaa =3D 1; + pr_debug("%s: added iaa wq %d.%d to idx %d of cpu %d\n", __func__, + entry->wqs[entry->n_wqs - 1]->idxd->id, + entry->wqs[entry->n_wqs - 1]->id, entry->n_wqs - 1, cpu); } =20 static int wq_table_add_wqs(int iaa, int cpu) @@ -930,6 +929,44 @@ static void rebalance_wq_table(void) pr_debug("could not add any wqs for iaa %d to cpu %d!\n", iaa, cpu); } =20 +/*************************************************************** + * Assign work-queues for driver ops using per-cpu wq_tables. + ***************************************************************/ + +static struct idxd_wq *wq_table_next_wq(int cpu) +{ + struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); + + if (++entry->cur_wq >=3D entry->n_wqs) + entry->cur_wq =3D 0; + + if (!entry->wqs[entry->cur_wq]) + return NULL; + + pr_debug("%s: returning wq at idx %d (iaa wq %d.%d) from cpu %d\n", __fun= c__, + entry->cur_wq, entry->wqs[entry->cur_wq]->idxd->id, + entry->wqs[entry->cur_wq]->id, cpu); + + return entry->wqs[entry->cur_wq]; +} + +/************************************************* + * Core iaa_crypto compress/decompress functions. 
+ *************************************************/ + +static int deflate_generic_decompress(struct acomp_req *req) +{ + ACOMP_FBREQ_ON_STACK(fbreq, req); + int ret; + + ret =3D crypto_acomp_decompress(fbreq); + req->dlen =3D fbreq->dlen; + + update_total_sw_decomp_calls(); + + return ret; +} + static inline int check_completion(struct device *dev, struct iax_completion_record *comp, bool compress, @@ -990,27 +1027,132 @@ static inline int check_completion(struct device *d= ev, return ret; } =20 -static int deflate_generic_decompress(struct acomp_req *req) +static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq, + struct acomp_req *req, + dma_addr_t *src_addr, dma_addr_t *dst_addr) { - ACOMP_FBREQ_ON_STACK(fbreq, req); - int ret; + int ret =3D 0; + int nr_sgs; =20 - ret =3D crypto_acomp_decompress(fbreq); - req->dlen =3D fbreq->dlen; + dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE); + dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); =20 - update_total_sw_decomp_calls(); + nr_sgs =3D dma_map_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE); + if (nr_sgs <=3D 0 || nr_sgs > 1) { + dev_dbg(dev, "verify: couldn't map src sg for iaa device %d," + " wq %d: ret=3D%d\n", iaa_wq->iaa_device->idxd->id, + iaa_wq->wq->id, ret); + ret =3D -EIO; + goto out; + } + *src_addr =3D sg_dma_address(req->src); + dev_dbg(dev, "verify: dma_map_sg, src_addr %llx, nr_sgs %d, req->src %p," + " req->slen %d, sg_dma_len(sg) %d\n", *src_addr, nr_sgs, + req->src, req->slen, sg_dma_len(req->src)); =20 + nr_sgs =3D dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_TO_DEVICE); + if (nr_sgs <=3D 0 || nr_sgs > 1) { + dev_dbg(dev, "verify: couldn't map dst sg for iaa device %d," + " wq %d: ret=3D%d\n", iaa_wq->iaa_device->idxd->id, + iaa_wq->wq->id, ret); + ret =3D -EIO; + dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE); + goto out; + } + *dst_addr =3D sg_dma_address(req->dst); + dev_dbg(dev, "verify: dma_map_sg, dst_addr %llx, nr_sgs %d, req->dst %p," + " req->dlen %d, sg_dma_len(sg) %d\n", *dst_addr, nr_sgs, + req->dst, req->dlen, sg_dma_len(req->dst)); +out: return ret; } =20 -static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq, - struct acomp_req *req, - dma_addr_t *src_addr, dma_addr_t *dst_addr); - static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *r= eq, struct idxd_wq *wq, dma_addr_t src_addr, unsigned int slen, - dma_addr_t dst_addr, unsigned int *dlen); + dma_addr_t dst_addr, unsigned int *dlen) +{ + struct iaa_device_compression_mode *active_compression_mode; + struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); + u32 *compression_crc =3D acomp_request_ctx(req); + struct iaa_device *iaa_device; + struct idxd_desc *idxd_desc; + struct iax_hw_desc *desc; + struct idxd_device *idxd; + struct iaa_wq *iaa_wq; + struct pci_dev *pdev; + struct device *dev; + int ret =3D 0; + + iaa_wq =3D idxd_wq_get_private(wq); + iaa_device =3D iaa_wq->iaa_device; + idxd =3D iaa_device->idxd; + pdev =3D idxd->pdev; + dev =3D &pdev->dev; + + active_compression_mode =3D get_iaa_device_compression_mode(iaa_device, c= tx->mode); + + idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_BLOCK); + if (IS_ERR(idxd_desc)) { + dev_dbg(dev, "idxd descriptor allocation failed\n"); + dev_dbg(dev, "iaa compress failed: ret=3D%ld\n", + PTR_ERR(idxd_desc)); + return PTR_ERR(idxd_desc); + } + desc =3D idxd_desc->iax_hw; + + /* Verify (optional) - decompress and check crc, suppress dest write */ + + desc->flags =3D IDXD_OP_FLAG_CRAV | 
IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC; + desc->opcode =3D IAX_OPCODE_DECOMPRESS; + desc->decompr_flags =3D IAA_DECOMP_FLAGS | IAA_DECOMP_SUPPRESS_OUTPUT; + desc->priv =3D 0; + + desc->src1_addr =3D (u64)dst_addr; + desc->src1_size =3D *dlen; + desc->dst_addr =3D (u64)src_addr; + desc->max_dst_size =3D slen; + desc->completion_addr =3D idxd_desc->compl_dma; + + dev_dbg(dev, "(verify) compression mode %s," + " desc->src1_addr %llx, desc->src1_size %d," + " desc->dst_addr %llx, desc->max_dst_size %d," + " desc->src2_addr %llx, desc->src2_size %d\n", + active_compression_mode->name, + desc->src1_addr, desc->src1_size, desc->dst_addr, + desc->max_dst_size, desc->src2_addr, desc->src2_size); + + ret =3D idxd_submit_desc(wq, idxd_desc); + if (ret) { + dev_dbg(dev, "submit_desc (verify) failed ret=3D%d\n", ret); + goto err; + } + + ret =3D check_completion(dev, idxd_desc->iax_completion, false, false); + if (ret) { + dev_dbg(dev, "(verify) check_completion failed ret=3D%d\n", ret); + goto err; + } + + if (*compression_crc !=3D idxd_desc->iax_completion->crc) { + ret =3D -EINVAL; + dev_dbg(dev, "(verify) iaa comp/decomp crc mismatch:" + " comp=3D0x%x, decomp=3D0x%x\n", *compression_crc, + idxd_desc->iax_completion->crc); + print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET, + 8, 1, idxd_desc->iax_completion, 64, 0); + goto err; + } + + idxd_free_desc(wq, idxd_desc); +out: + return ret; +err: + idxd_free_desc(wq, idxd_desc); + dev_dbg(dev, "iaa compress failed: ret=3D%d\n", ret); + + goto out; +} =20 static void iaa_desc_complete(struct idxd_desc *idxd_desc, enum idxd_complete_type comp_type, @@ -1226,133 +1368,6 @@ static int iaa_compress(struct crypto_tfm *tfm, str= uct acomp_req *req, goto out; } =20 -static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq, - struct acomp_req *req, - dma_addr_t *src_addr, dma_addr_t *dst_addr) -{ - int ret =3D 0; - int nr_sgs; - - dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE); - dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); - - nr_sgs =3D dma_map_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE); - if (nr_sgs <=3D 0 || nr_sgs > 1) { - dev_dbg(dev, "verify: couldn't map src sg for iaa device %d," - " wq %d: ret=3D%d\n", iaa_wq->iaa_device->idxd->id, - iaa_wq->wq->id, ret); - ret =3D -EIO; - goto out; - } - *src_addr =3D sg_dma_address(req->src); - dev_dbg(dev, "verify: dma_map_sg, src_addr %llx, nr_sgs %d, req->src %p," - " req->slen %d, sg_dma_len(sg) %d\n", *src_addr, nr_sgs, - req->src, req->slen, sg_dma_len(req->src)); - - nr_sgs =3D dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_TO_DEVICE); - if (nr_sgs <=3D 0 || nr_sgs > 1) { - dev_dbg(dev, "verify: couldn't map dst sg for iaa device %d," - " wq %d: ret=3D%d\n", iaa_wq->iaa_device->idxd->id, - iaa_wq->wq->id, ret); - ret =3D -EIO; - dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE); - goto out; - } - *dst_addr =3D sg_dma_address(req->dst); - dev_dbg(dev, "verify: dma_map_sg, dst_addr %llx, nr_sgs %d, req->dst %p," - " req->dlen %d, sg_dma_len(sg) %d\n", *dst_addr, nr_sgs, - req->dst, req->dlen, sg_dma_len(req->dst)); -out: - return ret; -} - -static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *r= eq, - struct idxd_wq *wq, - dma_addr_t src_addr, unsigned int slen, - dma_addr_t dst_addr, unsigned int *dlen) -{ - struct iaa_device_compression_mode *active_compression_mode; - struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); - u32 *compression_crc =3D acomp_request_ctx(req); - struct iaa_device 
*iaa_device; - struct idxd_desc *idxd_desc; - struct iax_hw_desc *desc; - struct idxd_device *idxd; - struct iaa_wq *iaa_wq; - struct pci_dev *pdev; - struct device *dev; - int ret =3D 0; - - iaa_wq =3D idxd_wq_get_private(wq); - iaa_device =3D iaa_wq->iaa_device; - idxd =3D iaa_device->idxd; - pdev =3D idxd->pdev; - dev =3D &pdev->dev; - - active_compression_mode =3D get_iaa_device_compression_mode(iaa_device, c= tx->mode); - - idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_BLOCK); - if (IS_ERR(idxd_desc)) { - dev_dbg(dev, "idxd descriptor allocation failed\n"); - dev_dbg(dev, "iaa compress failed: ret=3D%ld\n", - PTR_ERR(idxd_desc)); - return PTR_ERR(idxd_desc); - } - desc =3D idxd_desc->iax_hw; - - /* Verify (optional) - decompress and check crc, suppress dest write */ - - desc->flags =3D IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC; - desc->opcode =3D IAX_OPCODE_DECOMPRESS; - desc->decompr_flags =3D IAA_DECOMP_FLAGS | IAA_DECOMP_SUPPRESS_OUTPUT; - desc->priv =3D 0; - - desc->src1_addr =3D (u64)dst_addr; - desc->src1_size =3D *dlen; - desc->dst_addr =3D (u64)src_addr; - desc->max_dst_size =3D slen; - desc->completion_addr =3D idxd_desc->compl_dma; - - dev_dbg(dev, "(verify) compression mode %s," - " desc->src1_addr %llx, desc->src1_size %d," - " desc->dst_addr %llx, desc->max_dst_size %d," - " desc->src2_addr %llx, desc->src2_size %d\n", - active_compression_mode->name, - desc->src1_addr, desc->src1_size, desc->dst_addr, - desc->max_dst_size, desc->src2_addr, desc->src2_size); - - ret =3D idxd_submit_desc(wq, idxd_desc); - if (ret) { - dev_dbg(dev, "submit_desc (verify) failed ret=3D%d\n", ret); - goto err; - } - - ret =3D check_completion(dev, idxd_desc->iax_completion, false, false); - if (ret) { - dev_dbg(dev, "(verify) check_completion failed ret=3D%d\n", ret); - goto err; - } - - if (*compression_crc !=3D idxd_desc->iax_completion->crc) { - ret =3D -EINVAL; - dev_dbg(dev, "(verify) iaa comp/decomp crc mismatch:" - " comp=3D0x%x, decomp=3D0x%x\n", *compression_crc, - idxd_desc->iax_completion->crc); - print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET, - 8, 1, idxd_desc->iax_completion, 64, 0); - goto err; - } - - idxd_free_desc(wq, idxd_desc); -out: - return ret; -err: - idxd_free_desc(wq, idxd_desc); - dev_dbg(dev, "iaa compress failed: ret=3D%d\n", ret); - - goto out; -} - static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req, struct idxd_wq *wq, dma_addr_t src_addr, unsigned int slen, @@ -1662,6 +1677,10 @@ static void compression_ctx_init(struct iaa_compress= ion_ctx *ctx) ctx->use_irq =3D use_irq; } =20 +/********************************************* + * Interfaces to crypto_alg and crypto_acomp. + *********************************************/ + static int iaa_comp_init_fixed(struct crypto_acomp *acomp_tfm) { struct crypto_tfm *tfm =3D crypto_acomp_tfm(acomp_tfm); @@ -1864,6 +1883,10 @@ static struct idxd_device_driver iaa_crypto_driver = =3D { .desc_complete =3D iaa_desc_complete, }; =20 +/******************** + * Module init/exit. 
+ ********************/ + static int __init iaa_crypto_init_module(void) { int ret =3D 0; --=20 2.27.0 From nobody Thu Oct 2 00:50:47 2025 Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.13]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 954BD11712; Fri, 26 Sep 2025 03:35:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.13 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857711; cv=none; b=CcMiGyFkszzyXIMc+UAyZmIcGcbMtS6aLqVLuOeBwdxFPy09dZ+NMs1mQ4GfqRd/aPo1ofT97LyCstI0+Zcnpvw+lqEVcAIj846zR+3Objt+Iyjx/9C+/P9gbCfNfAavNK5pI3akWTBygZqU3+rvEWsPMccc/dblUjCgVPNE/xA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857711; c=relaxed/simple; bh=3XujNYoirJUorvuc2BHSGKLalfO6Ya+rGo10gIQ2mcM=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=fUdZ4jVshZ0DA78dU5weApQeCFbaKP3MZCr+Y3nAroxK0A1l2vAWnvo4udK2MMlmWgRlkr5bZry1ranmPbrHnqDOD+2wSZ1+xWWXEUk9NvMox/HgcLAsZ05dVY6kcv3y2tHpKz3u9vW/JSA2KXMQ0jwEpigMvYHLuVo8I5vqtJ4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=YQQurMZ6; arc=none smtp.client-ip=192.198.163.13 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="YQQurMZ6" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1758857708; x=1790393708; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=3XujNYoirJUorvuc2BHSGKLalfO6Ya+rGo10gIQ2mcM=; b=YQQurMZ6xuRTwkFSXCDXp8AR7R4O6KL1Vp2OqDTDLIOivlqjfb7jxnob lKu0mjkuK9Wpqzb+gEgOyWy1Lrvth6hbtKFisVWbQgWhgrtouXkGoDLeQ G7Xsirnyn4Y2VGeQjlsCIsv2FqkwnypEVYxzPR1ko/4VqGVN/44guCRb9 vz+V2bQnV0/zIK1iy4vVrJkGnlE37DmrTq16O2nSDaBltH0U4tHYzBZXV LPOzTLjlTO4+F/18eTaHAnllCULCy7Hr7ejrl8D7xqH3kfCN93Mm0/qCr yCL49HRNnaqKVXIHmffkh0iQbRRQVpKAdMphVRmf29R78yxGobBXNnH0B g==; X-CSE-ConnectionGUID: hWWMaKQXQFeQCTXZpuvBHA== X-CSE-MsgGUID: +CyHDVBSS8S30/115uZoDw== X-IronPort-AV: E=McAfee;i="6800,10657,11564"; a="63819426" X-IronPort-AV: E=Sophos;i="6.18,294,1751266800"; d="scan'208";a="63819426" Received: from orviesa001.jf.intel.com ([10.64.159.141]) by fmvoesa107.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 25 Sep 2025 20:35:05 -0700 X-CSE-ConnectionGUID: x2unOBg2RKaxJBUhmqjT/Q== X-CSE-MsgGUID: 2e+E1IytTi6NYpk5SU4bzg== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="6.18,294,1751266800"; d="scan'208";a="214636549" Received: from jf5300-b11a338t.jf.intel.com ([10.242.51.115]) by orviesa001.jf.intel.com with ESMTP; 25 Sep 2025 20:35:02 -0700 From: Kanchana P Sridhar To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, hannes@cmpxchg.org, yosry.ahmed@linux.dev, nphamcs@gmail.com, chengming.zhou@linux.dev, usamaarif642@gmail.com, ryan.roberts@arm.com, 21cnbao@gmail.com, ying.huang@linux.alibaba.com, akpm@linux-foundation.org, senozhatsky@chromium.org, sj@kernel.org, kasong@tencent.com, linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au, davem@davemloft.net, clabbe@baylibre.com, ardb@kernel.org, 
ebiggers@google.com, surenb@google.com, kristen.c.accardi@intel.com,
 vinicius.gomes@intel.com
Cc: wajdi.k.feghali@intel.com, vinodh.gopal@intel.com,
 kanchana.p.sridhar@intel.com
Subject: [PATCH v12 02/23] crypto: iaa - New architecture for IAA device WQ
 comp/decomp usage & core mapping.
Date: Thu, 25 Sep 2025 20:34:41 -0700
Message-Id: <20250926033502.7486-3-kanchana.p.sridhar@intel.com>
X-Mailer: git-send-email 2.27.0
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: 
List-Subscribe: 
List-Unsubscribe: 
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

This patch re-architects the iaa_crypto driver in three main aspects, to
make it more robust, stable, generic, functionally versatile and, most
importantly, more performant, in order to support zswap users on
platforms with different numbers of cores/IAAs running workloads with
different swap characteristics.

Summary of latency improvement for large folio compression:
============================================================

When measured in zswap using a simple madvise workload, where 64K
folios are stored using IAA batch compressions, this is how the
per-page compress latency changes just by setting the
"distribute_comps" driver parameter to "1":

  --------------------------------------------------------------
  zswap compressor: deflate-iaa
  64K folios: zswap_store() latency normalized to per-page
  --------------------------------------------------------------
                                        p50 (ns)    p99 (ns)
  --------------------------------------------------------------
  Sequential store                         3,503       3,695
  Batch compress, distribute_comps=0       1,356       1,384
  Batch compress, distribute_comps=1         706         763
  --------------------------------------------------------------

The re-architecting aspects are:

A) Map IAA devices/wqs to cores based on packages instead of NUMA.

B) The WQ rebalancing algorithm that is invoked as WQs are
   discovered/deleted has been made very general and flexible, so that
   the user can control exactly how IAA WQs are used, for optimizing
   performance.

C) Additionally, the "iaa_crypto_enabled" driver global has been
   modified to be an atomic, used for synchronization between
   dynamic/asynchronous WQ discovery/deletion and the fundamental
   routines comp_wq_table_next_wq() and decomp_wq_table_next_wq() that
   are queried by compress/decompress job submissions.

Description/motivation for (A):
===============================

This patch modifies the algorithm for mapping available IAA devices and
WQs to cores to be based on packages instead of NUMA nodes. This leads
to a more realistic mapping of IAA devices as compression/decompression
resources for a package, rather than for a NUMA node. This also
resolves problems that were observed during internal validation on
Intel Granite Rapids platforms with many more NUMA nodes than packages:
for such cases, the earlier NUMA based allocation caused some IAAs to
be over-subscribed and some to not be utilized at all.

As a result of this change from NUMA to packages, some of the core
functions used by the iaa_crypto driver's "probe" and "remove" API have
been re-written.
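As an illustration of (A), the following is a minimal sketch of the
package-based cpu-to-IAA mapping arithmetic, mirroring the cpu_to_iaa()
helper added by this patch. For clarity, the driver globals are shown
as plain integers (the patch makes several of them atomics), and the
bounds/fallback checks of the real helper are omitted:

  /*
   * Illustrative only, not part of the patch: find the closest IAA
   * instance for a given cpu. nr_iaa_per_package, nr_cpus_per_package
   * and cpus_per_iaa are the driver globals computed as wqs are
   * probed/removed and the wq tables are rebalanced.
   */
  static int cpu_to_iaa_sketch(int cpu)
  {
          int package_id = topology_logical_package_id(cpu);
          int base_iaa = package_id * nr_iaa_per_package;

          /* Evenly divide the package's cpus among its IAA devices. */
          return base_iaa + (cpu % nr_cpus_per_package) / cpus_per_iaa;
  }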
The new infrastructure maintains a static mapping of wqs per IAA
device, in the "struct iaa_device" itself. The earlier implementation
would allocate memory per-cpu for this data, which never changes once
the IAA devices/wqs have been initialized.

Two main outcomes from this new iaa_crypto driver infrastructure are:

1) Resolves "task blocked for more than x seconds" errors observed
   during internal validation on Intel systems with the earlier NUMA
   node based mappings, which were root-caused to the non-optimal
   IAA-to-core mappings described earlier.

2) Results in a NUM_THREADS factor reduction in the memory footprint
   cost of initializing IAA devices/wqs, due to eliminating the per-cpu
   copies of each IAA device's wqs. On a 384-core Intel Granite Rapids
   server with 8 IAA devices, this saves 140 MiB.

An auxiliary change included in this patch is that the driver's
"nr_iaa", "nr_iaa_per_package" and "cpus_per_iaa" global variables are
made atomic, because iaa_crypto_probe() and iaa_crypto_remove() change
the values of these variables asynchronously and concurrently as wqs
get added/deleted and rebalance_wq_table() is called. This change
allows the rebalance_wq_table() code to see consistent values of the
number of IAA devices.

Description/motivation for (B):
===============================

This builds upon the package-based driver infrastructure to provide
more flexibility in using particular WQs for compress-only or
decompress-only jobs. It also introduces the notion of using all the
IAA devices on a package as resources that are shared by all cores on
the package: this significantly improves batching latency (batching is
added in subsequent patches) and compress/decompress throughput. sysfs
driver parameters provide configurability of these features.

Two main concepts are introduced as part of the rebalancing changes:

1) An IAA WQ can be used for specific ops, which determines a WQ "type"
   for the iaa_crypto driver to submit compress/decompress jobs:

   - compress only
   - decompress only
   - generic, i.e., for both compresses and decompresses

   The WQ type is decided based on the number of WQs configured for a
   given IAA device, and the new "g_comp_wqs_per_iaa" driver parameter.

2) An IAA WQ can be mapped to cores using either of the following
   balancing techniques:

   a) Shared by all cores on a package. The iaa_crypto driver will
      dispatch compress/decompress jobs to all WQs of the same type,
      across all IAA devices on the package:

      - IAA compress jobs will be distributed to all same-package IAA
        compress-only/generic WQs.
      - IAA decompress jobs will be distributed to all same-package IAA
        decompress-only/generic WQs.

   b) Handles compress/decompress jobs only from "mapped cores", i.e.,
      the cores derived by evenly dividing the number of IAAs among the
      number of cores, per package.

Server setups that are moderately to highly contended can benefit from
(2.a). When the mix of workloads running on a system needs high
compress throughput and has relatively lower decompress activity,
(2.b) might be more optimal.

These approaches can be accomplished with the following new iaa_crypto
driver parameters. These parameters are global settings and will apply
to all IAAs on a package, interpreted in the context of the number of
WQs configured per IAA device.

g_comp_wqs_per_iaa:
===================

Number of compress-only WQs. The default is 1, but it is applicable
only if the device has more than 1 WQ.
If the device has exactly 1 WQ configured, "g_comp_wqs_per_iaa" is a
don't care.

If the IAA device has more than "g_comp_wqs_per_iaa" WQs configured,
the last "g_comp_wqs_per_iaa" WQs will be considered "compress only".
The remaining WQs will be considered "decompress only" (see the sketch
at the end of this section).

If the device has less than or equal to "g_comp_wqs_per_iaa" WQs, all
the device's WQs will be considered "generic", i.e., the driver will
submit compress and decompress jobs to all the WQs configured for the
device.

For example, if an IAA "X" has 2 WQs, this will set up 1 decompress WQ
and 1 compress WQ:

  echo 1 > /sys/bus/dsa/drivers/crypto/g_comp_wqs_per_iaa

  wqX.0: decompress jobs only.
  wqX.1: compress jobs only.

This setting would typically benefit workloads that see a high level of
compress and decompress activity.

If an IAA has 1 WQ, that WQ will be considered "generic": the driver
will submit compress and decompress jobs to the same WQ (this is
independent of the "g_comp_wqs_per_iaa" setting):

  wqX.0: compress and decompress jobs.

This would typically benefit workloads that see significant cold memory
being reclaimed, and consequently, high swapout and low swapin
activity.

distribute_comps:
=================

Distribute compressions to all IAAs on a package (default is Y).

Assuming the WQ type has been established as
compress-only/decompress-only/generic, this setting determines whether
the driver will distribute compress jobs to all IAAs on a package (the
default behavior) or not.

If this is turned off, the driver will dispatch compress jobs to a
given IAA "compression enabled" WQ only from cores that are mapped to
that IAA, using an algorithm that evenly distributes IAAs per package
to cores per package. For example, on a Sapphire Rapids server with 56
physical cores and 4 IAAs per package, with Hyperthreading, 28 logical
cores will be assigned to each IAA. With the "distribute_comps" driver
parameter turned off, the driver will send compress jobs only to its
assigned IAA device.

Enabling "distribute_comps" would typically benefit workloads in terms
of batch compress latency and throughput.

distribute_decomps:
===================

Distribute decompressions to all IAAs on a package (default is N).

Assuming the WQ type has been established as
compress-only/decompress-only/generic, this setting determines whether
the driver will distribute decompress jobs to all IAAs on a package or
not.

We recommend leaving this parameter at its default setting of "N".
Enabling "distribute_decomps = Y" can be evaluated for workloads that
are sensitive to p99 decompress latency and see a high level of
compress and decompress activity (e.g., warm memory reclaim/swapin).

Recommended settings for best compress/decompress latency and
throughput, and hence memory savings, for a moderately contended server
are:

  2 WQs per IAA
  g_comp_wqs_per_iaa = 1 (separate WQ for comps/decomps per IAA)
  distribute_decomps = N
  distribute_comps = Y

For systems that have one IAA device, the distribute_[de]comps settings
will be a no-op. Even for such systems, as long as considerable swapout
and swapin activity is expected, we recommend setting up 2 WQs for the
IAA, one each for compressions/decompressions. If swapouts are
significantly more than swapins, 1 WQ would be a better configuration,
as mentioned earlier.
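To make the WQ-typing rule above concrete, here is a minimal,
illustrative C sketch; it is not driver code. example_wq_type() and the
enum are hypothetical names, "g_comp_wqs_per_iaa" is the actual driver
parameter, and n_wqs stands for the number of WQs configured on a given
device:

  enum wq_type { WQ_GENERIC, WQ_COMP_ONLY, WQ_DECOMP_ONLY };

  static enum wq_type example_wq_type(unsigned int wq_idx,
                                      unsigned int n_wqs)
  {
          /* Few WQs: every WQ takes both compress and decompress jobs. */
          if (n_wqs <= g_comp_wqs_per_iaa)
                  return WQ_GENERIC;

          /* The last g_comp_wqs_per_iaa WQs are compress-only... */
          if (wq_idx >= n_wqs - g_comp_wqs_per_iaa)
                  return WQ_COMP_ONLY;

          /* ...and the remaining WQs are decompress-only. */
          return WQ_DECOMP_ONLY;
  }

With the recommended g_comp_wqs_per_iaa = 1 and 2 WQs per device, this
yields wqX.0 as decompress-only and wqX.1 as compress-only, matching
the examples below.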
Examples:
=========

For a Sapphire Rapids server with 2 packages, 56 cores and 4 IAAs per
package, where each IAA has 2 WQs and these settings are in effect:

  echo 1 > /sys/bus/dsa/drivers/crypto/g_comp_wqs_per_iaa
  echo 1 > /sys/bus/dsa/drivers/crypto/distribute_comps
  echo 0 > /sys/bus/dsa/drivers/crypto/distribute_decomps

  wqX.0: decompress jobs only.
  wqX.1: compress jobs only.

Compress jobs from all cores on package-0 will be distributed in
round-robin manner to [iax1, iax3, iax5, iax7]'s wqX.1, to maximize
compression throughput/latency/memory savings:

  wq1.1
  wq3.1
  wq5.1
  wq7.1

Likewise, compress jobs from all cores on package-1 will be distributed
in round-robin manner to [iax9, iax11, iax13, iax15]'s wqX.1, to
maximize compression throughput/latency/memory savings for workloads
running on package-1:

  wq9.1
  wq11.1
  wq13.1
  wq15.1

Decompress jobs will be submitted from mapped logical cores only, as
follows:

package-0:

  CPU  0-13,112-125    14-27,126-139    28-41,140-153    42-55,154-167
  IAA: iax1            iax3             iax5             iax7
  WQ:  wq1.0           wq3.0            wq5.0            wq7.0

package-1:

  CPU  56-69,168-181   70-83,182-195    84-97,196-209    98-111,210-223
  IAA: iax9            iax11            iax13            iax15
  WQ:  wq9.0           wq11.0           wq13.0           wq15.0

IAA WQs can be configured using higher-level scripts, as described in
Documentation/driver-api/crypto/iaa/iaa-crypto.rst. This documentation
has been updated for the above new parameters.

Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
 .../driver-api/crypto/iaa/iaa-crypto.rst   | 136 +++
 drivers/crypto/intel/iaa/iaa_crypto.h      |  18 +-
 drivers/crypto/intel/iaa/iaa_crypto_main.c | 889 ++++++++++++++----
 3 files changed, 872 insertions(+), 171 deletions(-)

diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
index f815d4fd8372..0ff4ec603b43 100644
--- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
+++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
@@ -290,6 +290,142 @@ The available attributes are:
     'sync' mode. This is to ensure correct iaa_crypto behavior until true
     async polling without interrupts is enabled in iaa_crypto.
 
+  - g_comp_wqs_per_iaa
+
+    Number of compress-only WQs. The default is 1, but it is applicable
+    only if the device has more than 1 WQ. If the device has exactly 1 WQ
+    configured, "g_comp_wqs_per_iaa" is a don't care.
+
+    If the IAA device has more than "g_comp_wqs_per_iaa" WQs configured,
+    the last "g_comp_wqs_per_iaa" WQs will be considered
+    "compress only". The remaining WQs will be considered "decomp only".
+
+    If the device has less than or equal to "g_comp_wqs_per_iaa" WQs, all
+    the device's WQs will be considered "generic", i.e., the driver will
+    submit compress and decompress jobs to all the WQs configured for the
+    device.
+
+    E.g., if an IAA "X" has 2 WQs, this will set up 1 decompress WQ and
+    1 compress WQ::
+
+      echo 1 > /sys/bus/dsa/drivers/crypto/g_comp_wqs_per_iaa
+
+      wqX.0: decompress jobs only.
+      wqX.1: compress jobs only.
+
+    This setting would typically benefit workloads that see a high
+    level of compress and decompress activity.
+
+    If an IAA has 1 WQ, that WQ will be considered "generic": the driver
+    will submit compress and decompress jobs to the same WQ (this is
+    independent of the "g_comp_wqs_per_iaa" setting):
+
+      wqX.0: compress and decompress jobs.
+
+    This would typically benefit workloads that see significant cold
+    memory being reclaimed, and consequently, high swapout and low swapin
+    activity.
+
+  - distribute_comps
+
+    Distribute compressions to all IAAs on package (default is Y).
+
+    Assuming the WQ type has been established as
+    compress-only/decompress-only/generic, this setting will determine if
+    the driver will distribute compress jobs to all IAAs on a package
+    (default behavior) or not.
+
+    If this is turned off, the driver will dispatch compress jobs to a
+    given IAA "compression enabled" WQ only from cores that are mapped to
+    that IAA, using an algorithm that evenly distributes IAAs per package
+    to cores per package. E.g., on a Sapphire Rapids server with 56
+    physical cores and 4 IAAs per package, with Hyperthreading, 28
+    logical cores will be assigned to each IAA. With the
+    "distribute_comps" driver parameter turned off, the driver will send
+    compress jobs only to its assigned IAA device.
+
+    Enabling "distribute_comps" would typically benefit workloads in
+    terms of batch compress latency and throughput.
+
+  - distribute_decomps
+
+    Distribute decompressions to all IAAs on package (default is N).
+
+    Assuming the WQ type has been established as
+    compress-only/decompress-only/generic, this setting will determine if
+    the driver will distribute decompress jobs to all IAAs on a package
+    or not.
+
+    Enabling "distribute_decomps" would typically benefit workloads that
+    see a high level of compress and decompress activity, especially
+    p99 decompress latency.
+
+    Recommended settings for best compress/decompress latency and
+    throughput, and hence memory savings, for a moderately contended
+    server that has more than 1 IAA device enabled on a given package:
+
+      2 WQs per IAA
+      g_comp_wqs_per_iaa = 1 (separate WQ for comps/decomps per IAA)
+      distribute_decomps = Y
+      distribute_comps = Y
+
+    For a system that has only 1 IAA device enabled on a given package,
+    the recommended settings are:
+
+      1 WQ per IAA
+      g_comp_wqs_per_iaa = 0 (same WQ for comps/decomps)
+      distribute_decomps = N
+      distribute_comps = N
+
+    Examples:
+
+    For a Sapphire Rapids server with 2 packages, 56 cores and 4 IAAs per
+    package, each IAA has 2 WQs, and these settings are in effect::
+
+      echo 1 > /sys/bus/dsa/drivers/crypto/g_comp_wqs_per_iaa
+      echo 1 > /sys/bus/dsa/drivers/crypto/distribute_comps
+      echo 0 > /sys/bus/dsa/drivers/crypto/distribute_decomps
+
+    This enables the following behavior:
+
+      wqX.0: decompress jobs only.
+      wqX.1: compress jobs only.
+
+    Compress jobs from all cores on package-0 will be distributed in
+    round-robin manner to [iax1, iax3, iax5, iax7]'s wqX.1, to maximize
+    compression throughput/latency/memory savings:
+
+      wq1.1
+      wq3.1
+      wq5.1
+      wq7.1
+
+    Likewise, compress jobs from all cores on package-1 will be
+    distributed in round-robin manner to [iax9, iax11, iax13, iax15]'s
+    wqX.1, to maximize compression throughput/latency/memory savings for
+    workloads running on package-1:
+
+      wq9.1
+      wq11.1
+      wq13.1
+      wq15.1
+
+    Decompress jobs will be submitted from mapped logical cores only, as
+    follows:
+
+    package-0:
+
+      CPU  0-13,112-125   14-27,126-139   28-41,140-153   42-55,154-167
+      IAA: iax1           iax3            iax5            iax7
+      WQ:  wq1.0          wq3.0           wq5.0           wq7.0
+
+    package-1:
+
+      CPU  56-69,168-181  70-83,182-195   84-97,196-209   98-111,210-223
+      IAA: iax9           iax11           iax13           iax15
+      WQ:  wq9.0          wq11.0          wq13.0          wq15.0
+
+
 ..
_iaa_default_config: =20 IAA Default Configuration diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/i= aa/iaa_crypto.h index 56985e395263..549ac98a9366 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto.h +++ b/drivers/crypto/intel/iaa/iaa_crypto.h @@ -46,6 +46,7 @@ struct iaa_wq { struct idxd_wq *wq; int ref; bool remove; + bool mapped; =20 struct iaa_device *iaa_device; =20 @@ -63,6 +64,13 @@ struct iaa_device_compression_mode { dma_addr_t aecs_comp_table_dma_addr; }; =20 +struct wq_table_entry { + struct idxd_wq **wqs; + unsigned int max_wqs; + unsigned int n_wqs; + unsigned int cur_wq; +}; + /* Representation of IAA device with wqs, populated by probe */ struct iaa_device { struct list_head list; @@ -73,19 +81,15 @@ struct iaa_device { int n_wq; struct list_head wqs; =20 + struct wq_table_entry *generic_wq_table; + struct wq_table_entry *comp_wq_table; + atomic64_t comp_calls; atomic64_t comp_bytes; atomic64_t decomp_calls; atomic64_t decomp_bytes; }; =20 -struct wq_table_entry { - struct idxd_wq **wqs; - int max_wqs; - int n_wqs; - int cur_wq; -}; - #define IAA_AECS_ALIGN 32 =20 /* diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index 760997eee8fe..c6db721eaa79 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -23,32 +23,86 @@ #define pr_fmt(fmt) "idxd: " IDXD_SUBDRIVER_NAME ": " fmt =20 #define IAA_ALG_PRIORITY 300 +#define MAX_PKG_IAA 8 +#define MAX_IAA_WQ 8 =20 /************************************** * Driver internal global variables. **************************************/ =20 /* number of iaa instances probed */ -static unsigned int nr_iaa; +static atomic_t nr_iaa =3D ATOMIC_INIT(0); static unsigned int nr_cpus; -static unsigned int nr_nodes; -static unsigned int nr_cpus_per_node; +static unsigned int nr_packages; +static unsigned int nr_cpus_per_package; +static atomic_t nr_iaa_per_package =3D ATOMIC_INIT(0); =20 /* Number of physical cpus sharing each iaa instance */ -static unsigned int cpus_per_iaa; +static atomic_t cpus_per_iaa =3D ATOMIC_INIT(0); =20 -/* Per-cpu lookup table for balanced wqs */ -static struct wq_table_entry __percpu *wq_table; +/* Per-cpu lookup table for decomp wqs. */ +static struct wq_table_entry __percpu *cpu_decomp_wqs; + +/* Per-cpu lookup table for comp wqs. */ +static struct wq_table_entry __percpu *cpu_comp_wqs; + +/* All decomp wqs from IAAs on a package. */ +static struct wq_table_entry **pkg_global_decomp_wqs; +/* All comp wqs from IAAs on a package. */ +static struct wq_table_entry **pkg_global_comp_wqs; =20 LIST_HEAD(iaa_devices); DEFINE_MUTEX(iaa_devices_lock); =20 -/* If enabled, IAA hw crypto algos are registered, unavailable otherwise */ -static bool iaa_crypto_enabled; +/* + * If enabled, IAA hw crypto algos are registered, unavailable otherwise: + * + * We use the atomic @iaa_crypto_enabled to know if the per-CPU + * compress/decompress wq tables have been setup successfully. + * Since @iaa_crypto_enabled is atomic, the core functions that + * return a wq for compression/decompression, namely, + * comp_wq_table_next_wq() and decomp_wq_table_next_wq() will + * test this atomic before proceeding to query the per-cpu wq tables. + * + * These events will set @iaa_crypto_enabled to 1: + * - Successful rebalance_wq_table() after individual wq addition/removal. + * + * These events will set @iaa_crypto_enabled to 0: + * - Error during rebalance_wq_table() after individual wq addition/remova= l. 
+ * - check_completion() timeouts. + * - @nr_iaa is 0. + * - module cleanup. + */ +static atomic_t iaa_crypto_enabled =3D ATOMIC_INIT(0); + +/* + * First wq probed, to use until @iaa_crypto_enabled is 1: + * + * The first wq probed will be entered in the per-CPU comp/decomp wq tables + * until the IAA compression modes are registered. This is done to facilit= ate + * the compress/decompress calls from the crypto testmgr resulting from + * calling crypto_register_acomp(). + * + * With the new dynamic package-level rebalancing of WQs being + * discovered asynchronously and concurrently with tests + * triggered from device registration, this is needed to + * determine when it is safe for the rebalancing of decomp/comp + * WQs to de-allocate the per-package WQs and re-allocate them + * based on the latest number of IAA devices and WQs. + */ +static struct idxd_wq *first_wq_found; +DEFINE_MUTEX(first_wq_found_lock); + static bool iaa_crypto_registered; =20 static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_M= AX]; =20 +/* Distribute decompressions across all IAAs on the package. */ +static bool iaa_distribute_decomps; + +/* Distribute compressions across all IAAs on the package. */ +static bool iaa_distribute_comps =3D true; + /* Verify results of IAA compress or not */ static bool iaa_verify_compress =3D true; =20 @@ -86,6 +140,9 @@ static bool async_mode; /* Use interrupts */ static bool use_irq; =20 +/* Number of compress-only wqs per iaa*/ +static unsigned int g_comp_wqs_per_iaa =3D 1; + /************************************************** * Driver attributes along with get/set functions. **************************************************/ @@ -102,7 +159,7 @@ static ssize_t verify_compress_store(struct device_driv= er *driver, =20 mutex_lock(&iaa_devices_lock); =20 - if (iaa_crypto_enabled) + if (atomic_read(&iaa_crypto_enabled)) goto out; =20 ret =3D kstrtobool(buf, &iaa_verify_compress); @@ -166,7 +223,7 @@ static ssize_t sync_mode_store(struct device_driver *dr= iver, =20 mutex_lock(&iaa_devices_lock); =20 - if (iaa_crypto_enabled) + if (atomic_read(&iaa_crypto_enabled)) goto out; =20 ret =3D set_iaa_sync_mode(buf); @@ -179,6 +236,87 @@ static ssize_t sync_mode_store(struct device_driver *d= river, } static DRIVER_ATTR_RW(sync_mode); =20 +static ssize_t g_comp_wqs_per_iaa_show(struct device_driver *driver, char = *buf) +{ + return sprintf(buf, "%u\n", g_comp_wqs_per_iaa); +} + +static ssize_t g_comp_wqs_per_iaa_store(struct device_driver *driver, + const char *buf, size_t count) +{ + int ret =3D -EBUSY; + + mutex_lock(&iaa_devices_lock); + + if (atomic_read(&iaa_crypto_enabled)) + goto out; + + ret =3D kstrtouint(buf, 10, &g_comp_wqs_per_iaa); + if (ret) + goto out; + + ret =3D count; +out: + mutex_unlock(&iaa_devices_lock); + + return ret; +} +static DRIVER_ATTR_RW(g_comp_wqs_per_iaa); + +static ssize_t distribute_decomps_show(struct device_driver *driver, char = *buf) +{ + return sprintf(buf, "%d\n", iaa_distribute_decomps); +} + +static ssize_t distribute_decomps_store(struct device_driver *driver, + const char *buf, size_t count) +{ + int ret =3D -EBUSY; + + mutex_lock(&iaa_devices_lock); + + if (atomic_read(&iaa_crypto_enabled)) + goto out; + + ret =3D kstrtobool(buf, &iaa_distribute_decomps); + if (ret) + goto out; + + ret =3D count; +out: + mutex_unlock(&iaa_devices_lock); + + return ret; +} +static DRIVER_ATTR_RW(distribute_decomps); + +static ssize_t distribute_comps_show(struct device_driver *driver, char *b= uf) +{ + return sprintf(buf, "%d\n", 
iaa_distribute_comps); +} + +static ssize_t distribute_comps_store(struct device_driver *driver, + const char *buf, size_t count) +{ + int ret =3D -EBUSY; + + mutex_lock(&iaa_devices_lock); + + if (atomic_read(&iaa_crypto_enabled)) + goto out; + + ret =3D kstrtobool(buf, &iaa_distribute_comps); + if (ret) + goto out; + + ret =3D count; +out: + mutex_unlock(&iaa_devices_lock); + + return ret; +} +static DRIVER_ATTR_RW(distribute_comps); + /**************************** * Driver compression modes. ****************************/ @@ -464,32 +602,81 @@ static void remove_device_compression_modes(struct ia= a_device *iaa_device) * allocate/init/query/deallocate devices/wqs. ***********************************************************/ =20 -static struct iaa_device *iaa_device_alloc(void) +static struct iaa_device *iaa_device_alloc(struct idxd_device *idxd) { struct iaa_device *iaa_device; + struct wq_table_entry *wqt; =20 iaa_device =3D kzalloc(sizeof(*iaa_device), GFP_KERNEL); if (!iaa_device) - return NULL; + goto err; + + iaa_device->idxd =3D idxd; + + /* IAA device's generic/decomp wqs. */ + iaa_device->generic_wq_table =3D kzalloc(sizeof(struct wq_table_entry), G= FP_KERNEL); + if (!iaa_device->generic_wq_table) + goto err; + + wqt =3D iaa_device->generic_wq_table; + + wqt->wqs =3D kcalloc(iaa_device->idxd->max_wqs, sizeof(struct idxd_wq *),= GFP_KERNEL); + if (!wqt->wqs) + goto err; + + wqt->max_wqs =3D iaa_device->idxd->max_wqs; + wqt->n_wqs =3D 0; + + /* + * IAA device's comp wqs (optional). If the device has more than + * "g_comp_wqs_per_iaa" WQs configured, the last "g_comp_wqs_per_iaa" + * number of WQs will be considered as "comp only". The remaining + * WQs will be considered as "decomp only". + * If the device has <=3D "g_comp_wqs_per_iaa" WQs, all the + * device's WQs will be considered "generic", i.e., cores can submit + * comp and decomp jobs to all the WQs configured for the device. 
+ */ + iaa_device->comp_wq_table =3D kzalloc(sizeof(struct wq_table_entry), GFP_= KERNEL); + if (!iaa_device->comp_wq_table) + goto err; + + wqt =3D iaa_device->comp_wq_table; + + wqt->wqs =3D kcalloc(iaa_device->idxd->max_wqs, sizeof(struct idxd_wq *),= GFP_KERNEL); + if (!wqt->wqs) + goto err; + + wqt->max_wqs =3D iaa_device->idxd->max_wqs; + wqt->n_wqs =3D 0; =20 INIT_LIST_HEAD(&iaa_device->wqs); =20 return iaa_device; + +err: + if (iaa_device) { + if (iaa_device->generic_wq_table) { + kfree(iaa_device->generic_wq_table->wqs); + kfree(iaa_device->generic_wq_table); + } + kfree(iaa_device->comp_wq_table); + kfree(iaa_device); + } + + return NULL; } =20 static struct iaa_device *add_iaa_device(struct idxd_device *idxd) { struct iaa_device *iaa_device; =20 - iaa_device =3D iaa_device_alloc(); + iaa_device =3D iaa_device_alloc(idxd); if (!iaa_device) return NULL; =20 - iaa_device->idxd =3D idxd; - list_add_tail(&iaa_device->list, &iaa_devices); =20 - nr_iaa++; + atomic_inc(&nr_iaa); =20 return iaa_device; } @@ -509,7 +696,7 @@ static void del_iaa_device(struct iaa_device *iaa_devic= e) { list_del(&iaa_device->list); =20 - nr_iaa--; + atomic_dec(&nr_iaa); } =20 static void free_iaa_device(struct iaa_device *iaa_device) @@ -518,6 +705,17 @@ static void free_iaa_device(struct iaa_device *iaa_dev= ice) return; =20 remove_device_compression_modes(iaa_device); + + if (iaa_device->generic_wq_table) { + kfree(iaa_device->generic_wq_table->wqs); + kfree(iaa_device->generic_wq_table); + } + + if (iaa_device->comp_wq_table) { + kfree(iaa_device->comp_wq_table->wqs); + kfree(iaa_device->comp_wq_table); + } + kfree(iaa_device); } =20 @@ -576,7 +774,7 @@ static void del_iaa_wq(struct iaa_device *iaa_device, s= truct idxd_wq *wq) =20 dev_dbg(dev, "removed wq %d from iaa_device %d, n_wq %d, nr_iaa %d\n", wq->id, iaa_device->idxd->id, - iaa_device->n_wq, nr_iaa); + iaa_device->n_wq, atomic_read(&nr_iaa)); =20 if (iaa_device->n_wq =3D=3D 0) del_iaa_device(iaa_device); @@ -588,6 +786,7 @@ static void del_iaa_wq(struct iaa_device *iaa_device, s= truct idxd_wq *wq) static void remove_iaa_wq(struct idxd_wq *wq) { struct iaa_device *iaa_device; + unsigned int num_pkg_iaa =3D 0; =20 list_for_each_entry(iaa_device, &iaa_devices, list) { if (iaa_has_wq(iaa_device, wq)) { @@ -596,12 +795,20 @@ static void remove_iaa_wq(struct idxd_wq *wq) } } =20 - if (nr_iaa) { - cpus_per_iaa =3D (nr_nodes * nr_cpus_per_node) / nr_iaa; - if (!cpus_per_iaa) - cpus_per_iaa =3D 1; - } else - cpus_per_iaa =3D 1; + if (atomic_read(&nr_iaa)) { + atomic_set(&cpus_per_iaa, (nr_packages * nr_cpus_per_package) / atomic_r= ead(&nr_iaa)); + if (!atomic_read(&cpus_per_iaa)) + atomic_set(&cpus_per_iaa, 1); + + num_pkg_iaa =3D atomic_read(&nr_iaa) / nr_packages; + if (!num_pkg_iaa) + num_pkg_iaa =3D 1; + } else { + atomic_set(&cpus_per_iaa, 1); + num_pkg_iaa =3D 1; + } + + atomic_set(&nr_iaa_per_package, num_pkg_iaa); } =20 static void __free_iaa_wq(struct iaa_wq *iaa_wq) @@ -635,6 +842,7 @@ static int save_iaa_wq(struct idxd_wq *wq) struct pci_dev *pdev; struct device *dev; int ret =3D 0; + unsigned int num_pkg_iaa =3D 0; =20 list_for_each_entry(iaa_device, &iaa_devices, list) { if (iaa_device->idxd =3D=3D wq->idxd) { @@ -687,12 +895,19 @@ static int save_iaa_wq(struct idxd_wq *wq) } } =20 - if (WARN_ON(nr_iaa =3D=3D 0)) + if (WARN_ON(atomic_read(&nr_iaa) =3D=3D 0)) return -EINVAL; =20 - cpus_per_iaa =3D (nr_nodes * nr_cpus_per_node) / nr_iaa; - if (!cpus_per_iaa) - cpus_per_iaa =3D 1; + atomic_set(&cpus_per_iaa, (nr_packages * 
nr_cpus_per_package) / atomic_re= ad(&nr_iaa)); + if (!atomic_read(&cpus_per_iaa)) + atomic_set(&cpus_per_iaa, 1); + + num_pkg_iaa =3D atomic_read(&nr_iaa) / nr_packages; + if (!num_pkg_iaa) + num_pkg_iaa =3D 1; + + atomic_set(&nr_iaa_per_package, num_pkg_iaa); + out: return 0; } @@ -748,105 +963,284 @@ static int iaa_wq_put(struct idxd_wq *wq) * Mapping IAA devices and wqs to cores with per-cpu wq_tables. ***************************************************************/ =20 -static void wq_table_free_entry(int cpu) +/* + * Given a cpu, find the closest IAA instance. + */ +static inline int cpu_to_iaa(int cpu) { - struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); + int package_id, base_iaa, iaa =3D 0; + + if (!nr_packages || !atomic_read(&nr_iaa_per_package) || !atomic_read(&nr= _iaa)) + return -1; + + package_id =3D topology_logical_package_id(cpu); + base_iaa =3D package_id * atomic_read(&nr_iaa_per_package); + iaa =3D base_iaa + ((cpu % nr_cpus_per_package) / atomic_read(&cpus_per_i= aa)); =20 - kfree(entry->wqs); - memset(entry, 0, sizeof(*entry)); + pr_debug("cpu =3D %d, package_id =3D %d, base_iaa =3D %d, iaa =3D %d", + cpu, package_id, base_iaa, iaa); + + if (iaa >=3D 0 && iaa < atomic_read(&nr_iaa)) + return iaa; + + return (atomic_read(&nr_iaa) - 1); } =20 -static void wq_table_clear_entry(int cpu) +static void free_wq_tables(void) { - struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); + if (cpu_decomp_wqs) { + free_percpu(cpu_decomp_wqs); + cpu_decomp_wqs =3D NULL; + } =20 - entry->n_wqs =3D 0; - entry->cur_wq =3D 0; - memset(entry->wqs, 0, entry->max_wqs * sizeof(struct idxd_wq *)); + if (cpu_comp_wqs) { + free_percpu(cpu_comp_wqs); + cpu_comp_wqs =3D NULL; + } + + pr_debug("freed comp/decomp wq tables\n"); } =20 -static void clear_wq_table(void) +static void pkg_global_wqs_dealloc(void) { - int cpu; + int i; =20 - for (cpu =3D 0; cpu < nr_cpus; cpu++) - wq_table_clear_entry(cpu); + if (pkg_global_decomp_wqs) { + for (i =3D 0; i < nr_packages; ++i) { + kfree(pkg_global_decomp_wqs[i]->wqs); + kfree(pkg_global_decomp_wqs[i]); + } + kfree(pkg_global_decomp_wqs); + pkg_global_decomp_wqs =3D NULL; + } =20 - pr_debug("cleared wq table\n"); + if (pkg_global_comp_wqs) { + for (i =3D 0; i < nr_packages; ++i) { + kfree(pkg_global_comp_wqs[i]->wqs); + kfree(pkg_global_comp_wqs[i]); + } + kfree(pkg_global_comp_wqs); + pkg_global_comp_wqs =3D NULL; + } } =20 -static void free_wq_table(void) +static bool pkg_global_wqs_alloc(void) { - int cpu; + int i; + + pkg_global_decomp_wqs =3D kcalloc(nr_packages, sizeof(*pkg_global_decomp_= wqs), GFP_KERNEL); + if (!pkg_global_decomp_wqs) + return false; + + for (i =3D 0; i < nr_packages; ++i) { + pkg_global_decomp_wqs[i] =3D kzalloc(sizeof(struct wq_table_entry), GFP_= KERNEL); + if (!pkg_global_decomp_wqs[i]) + goto err; + + pkg_global_decomp_wqs[i]->wqs =3D kcalloc(MAX_PKG_IAA * MAX_IAA_WQ, size= of(struct idxd_wq *), GFP_KERNEL); + if (!pkg_global_decomp_wqs[i]->wqs) + goto err; + + pkg_global_decomp_wqs[i]->max_wqs =3D MAX_PKG_IAA * MAX_IAA_WQ; + } + + pkg_global_comp_wqs =3D kcalloc(nr_packages, sizeof(*pkg_global_comp_wqs)= , GFP_KERNEL); + if (!pkg_global_comp_wqs) + goto err; + + for (i =3D 0; i < nr_packages; ++i) { + pkg_global_comp_wqs[i] =3D kzalloc(sizeof(struct wq_table_entry), GFP_KE= RNEL); + if (!pkg_global_comp_wqs[i]) + goto err; =20 - for (cpu =3D 0; cpu < nr_cpus; cpu++) - wq_table_free_entry(cpu); + pkg_global_comp_wqs[i]->wqs =3D kcalloc(MAX_PKG_IAA * MAX_IAA_WQ, sizeof= (struct idxd_wq *), GFP_KERNEL); + if 
(!pkg_global_comp_wqs[i]->wqs) + goto err; + + pkg_global_comp_wqs[i]->max_wqs =3D MAX_PKG_IAA * MAX_IAA_WQ; + } =20 - free_percpu(wq_table); + return true; =20 - pr_debug("freed wq table\n"); +err: + pkg_global_wqs_dealloc(); + return false; } =20 static int alloc_wq_table(int max_wqs) { - struct wq_table_entry *entry; - int cpu; - - wq_table =3D alloc_percpu(struct wq_table_entry); - if (!wq_table) + cpu_decomp_wqs =3D alloc_percpu_gfp(struct wq_table_entry, GFP_KERNEL | _= _GFP_ZERO); + if (!cpu_decomp_wqs) return -ENOMEM; =20 - for (cpu =3D 0; cpu < nr_cpus; cpu++) { - entry =3D per_cpu_ptr(wq_table, cpu); - entry->wqs =3D kcalloc(max_wqs, sizeof(*entry->wqs), GFP_KERNEL); - if (!entry->wqs) { - free_wq_table(); - return -ENOMEM; - } + cpu_comp_wqs =3D alloc_percpu_gfp(struct wq_table_entry, GFP_KERNEL | __G= FP_ZERO); + if (!cpu_comp_wqs) + goto err; =20 - entry->max_wqs =3D max_wqs; - } + if (!pkg_global_wqs_alloc()) + goto err; =20 pr_debug("initialized wq table\n"); =20 return 0; + +err: + free_wq_tables(); + return -ENOMEM; +} + +/* + * The caller should have established that device_iaa_wqs is not empty, + * i.e., every IAA device in "iaa_devices" has at least one WQ. + */ +static void add_device_wqs_to_wq_table(struct wq_table_entry *dst_wq_table, + struct wq_table_entry *device_wq_table) +{ + int i; + + for (i =3D 0; i < device_wq_table->n_wqs; ++i) + dst_wq_table->wqs[dst_wq_table->n_wqs++] =3D device_wq_table->wqs[i]; +} + +static bool reinit_pkg_global_wqs(bool comp) +{ + int cur_iaa =3D 0, pkg =3D 0; + struct iaa_device *iaa_device; + struct wq_table_entry **pkg_wqs =3D comp ? pkg_global_comp_wqs : pkg_glob= al_decomp_wqs; + + for (pkg =3D 0; pkg < nr_packages; ++pkg) + pkg_wqs[pkg]->n_wqs =3D 0; + + pkg =3D 0; + +one_iaa_special_case: + /* Re-initialize per-package wqs. */ + list_for_each_entry(iaa_device, &iaa_devices, list) { + struct wq_table_entry *device_wq_table =3D comp ? + ((iaa_device->comp_wq_table->n_wqs > 0) ? + iaa_device->comp_wq_table : iaa_device->generic_wq_table) : + iaa_device->generic_wq_table; + + if (pkg_wqs[pkg]->n_wqs + device_wq_table->n_wqs > pkg_wqs[pkg]->max_wqs= ) { + pkg_wqs[pkg]->wqs =3D krealloc(pkg_wqs[pkg]->wqs, + ksize(pkg_wqs[pkg]->wqs) + + max((MAX_PKG_IAA * MAX_IAA_WQ), iaa_device->n_wq) * sizeof(stru= ct idxd_wq *), + GFP_KERNEL | __GFP_ZERO); + if (!pkg_wqs[pkg]->wqs) + return false; + + pkg_wqs[pkg]->max_wqs =3D ksize(pkg_wqs[pkg]->wqs)/sizeof(struct idxd_w= q *); + } + + add_device_wqs_to_wq_table(pkg_wqs[pkg], device_wq_table); + + pr_debug("pkg_global_%s_wqs[%d] has %u n_wqs %u max_wqs", + (comp ? "comp" : "decomp"), pkg, pkg_wqs[pkg]->n_wqs, pkg_wqs[pkg]->ma= x_wqs); + + if (++cur_iaa =3D=3D atomic_read(&nr_iaa_per_package)) { + if (++pkg =3D=3D nr_packages) + break; + cur_iaa =3D 0; + if (atomic_read(&nr_iaa) =3D=3D 1) + goto one_iaa_special_case; + } + } + + return true; } =20 -static void wq_table_add(int cpu, struct idxd_wq *wq) +static void create_cpu_wq_table(int cpu, struct wq_table_entry *wq_table, = bool comp) { - struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); + struct wq_table_entry *entry =3D comp ? 
+ per_cpu_ptr(cpu_comp_wqs, cpu) : + per_cpu_ptr(cpu_decomp_wqs, cpu); + + if (!atomic_read(&iaa_crypto_enabled)) { + mutex_lock(&first_wq_found_lock); + + BUG_ON(!first_wq_found && !wq_table->n_wqs); + + if (!first_wq_found) + first_wq_found =3D wq_table->wqs[0]; + + mutex_unlock(&first_wq_found_lock); =20 - if (WARN_ON(entry->n_wqs =3D=3D entry->max_wqs)) + entry->wqs =3D &first_wq_found; + entry->max_wqs =3D 1; + entry->n_wqs =3D 1; + entry->cur_wq =3D 0; + pr_debug("%s: cpu %d: added %u first_wq_found for %s wqs up to wq %d.%d\= n", __func__, + cpu, entry->n_wqs, comp ? "comp":"decomp", + entry->wqs[entry->n_wqs - 1]->idxd->id, + entry->wqs[entry->n_wqs - 1]->id); return; + } + + entry->wqs =3D wq_table->wqs; + entry->max_wqs =3D wq_table->max_wqs; + entry->n_wqs =3D wq_table->n_wqs; + entry->cur_wq =3D 0; + + if (entry->n_wqs) + pr_debug("%s: cpu %d: added %u iaa %s wqs up to wq %d.%d: entry->max_wqs= =3D %u\n", __func__, + cpu, entry->n_wqs, comp ? "comp":"decomp", + entry->wqs[entry->n_wqs - 1]->idxd->id, entry->wqs[entry->n_wqs - 1]->= id, + entry->max_wqs); +} + +static void set_cpu_wq_table_start_wq(int cpu, bool comp) +{ + struct wq_table_entry *entry =3D comp ? + per_cpu_ptr(cpu_comp_wqs, cpu) : + per_cpu_ptr(cpu_decomp_wqs, cpu); + unsigned int num_pkg_iaa =3D atomic_read(&nr_iaa_per_package); + + int start_wq =3D (entry->n_wqs / num_pkg_iaa) * (cpu_to_iaa(cpu) % num_pk= g_iaa); + + if ((start_wq >=3D 0) && (start_wq < entry->n_wqs)) + entry->cur_wq =3D start_wq; +} =20 - entry->wqs[entry->n_wqs++] =3D wq; +static void create_cpu_wq_table_from_pkg_wqs(bool comp) +{ + int cpu; =20 - pr_debug("%s: added iaa wq %d.%d to idx %d of cpu %d\n", __func__, - entry->wqs[entry->n_wqs - 1]->idxd->id, - entry->wqs[entry->n_wqs - 1]->id, entry->n_wqs - 1, cpu); + /* + * All CPU on the same package share the same "package global" + * [de]comp_wqs. + */ + for (cpu =3D 0; cpu < nr_cpus; cpu +=3D nr_cpus_per_package) { + int package_id =3D topology_logical_package_id(cpu); + struct wq_table_entry *pkg_wq_table =3D comp ? + ((pkg_global_comp_wqs[package_id]->n_wqs > 0) ? + pkg_global_comp_wqs[package_id] : pkg_global_decomp_wqs[package_id]) + : pkg_global_decomp_wqs[package_id]; + int pkg_cpu; + + for (pkg_cpu =3D cpu; pkg_cpu < cpu + nr_cpus_per_package; ++pkg_cpu) { + /* Initialize decomp/comp wq_table for CPU. */ + create_cpu_wq_table(pkg_cpu, pkg_wq_table, comp); + /* Stagger the starting WQ in the package WQ table, for each CPU. 
*/ + set_cpu_wq_table_start_wq(pkg_cpu, comp); + } + } } =20 -static int wq_table_add_wqs(int iaa, int cpu) +static int add_mapped_device_wq_table_for_cpu(int iaa, int cpu, bool comp) { struct iaa_device *iaa_device, *found_device =3D NULL; - int ret =3D 0, cur_iaa =3D 0, n_wqs_added =3D 0; - struct idxd_device *idxd; - struct iaa_wq *iaa_wq; - struct pci_dev *pdev; - struct device *dev; + struct wq_table_entry *device_wq_table; + int ret =3D 0, cur_iaa =3D 0; =20 list_for_each_entry(iaa_device, &iaa_devices, list) { - idxd =3D iaa_device->idxd; - pdev =3D idxd->pdev; - dev =3D &pdev->dev; - if (cur_iaa !=3D iaa) { cur_iaa++; continue; } =20 found_device =3D iaa_device; - dev_dbg(dev, "getting wq from iaa_device %d, cur_iaa %d\n", + dev_dbg(&found_device->idxd->pdev->dev, + "getting wq from iaa_device %d, cur_iaa %d\n", found_device->idxd->id, cur_iaa); break; } @@ -861,93 +1255,219 @@ static int wq_table_add_wqs(int iaa, int cpu) } cur_iaa =3D 0; =20 - idxd =3D found_device->idxd; - pdev =3D idxd->pdev; - dev =3D &pdev->dev; - dev_dbg(dev, "getting wq from only iaa_device %d, cur_iaa %d\n", + dev_dbg(&found_device->idxd->pdev->dev, + "getting wq from only iaa_device %d, cur_iaa %d\n", found_device->idxd->id, cur_iaa); } =20 - list_for_each_entry(iaa_wq, &found_device->wqs, list) { - wq_table_add(cpu, iaa_wq->wq); - pr_debug("rebalance: added wq for cpu=3D%d: iaa wq %d.%d\n", - cpu, iaa_wq->wq->idxd->id, iaa_wq->wq->id); - n_wqs_added++; + device_wq_table =3D comp ? + ((found_device->comp_wq_table->n_wqs > 0) ? + found_device->comp_wq_table : found_device->generic_wq_table) : + found_device->generic_wq_table; + + create_cpu_wq_table(cpu, device_wq_table, comp); + +out: + return ret; +} + +static void create_cpu_wq_table_from_mapped_device(bool comp) +{ + int cpu, iaa; + + for (cpu =3D 0; cpu < nr_cpus; cpu++) { + iaa =3D cpu_to_iaa(cpu); + pr_debug("rebalance: cpu=3D%d iaa=3D%d\n", cpu, iaa); + + if (WARN_ON(iaa =3D=3D -1)) { + pr_debug("rebalance (cpu_to_iaa(%d)) failed!\n", cpu); + return; + } + + if (WARN_ON(add_mapped_device_wq_table_for_cpu(iaa, cpu, comp))) { + pr_debug("could not add any wqs of iaa %d to cpu %d!\n", iaa, cpu); + return; + } + } +} + +static int map_iaa_device_wqs(struct iaa_device *iaa_device) +{ + struct wq_table_entry *generic, *for_comps; + int ret =3D 0, n_wqs_added =3D 0; + struct iaa_wq *iaa_wq; + + generic =3D iaa_device->generic_wq_table; + for_comps =3D iaa_device->comp_wq_table; + + list_for_each_entry(iaa_wq, &iaa_device->wqs, list) { + if (iaa_wq->mapped && ++n_wqs_added) + continue; + + pr_debug("iaa_device %p: processing wq %d.%d\n", iaa_device, iaa_device-= >idxd->id, iaa_wq->wq->id); + + if ((!n_wqs_added || ((n_wqs_added + g_comp_wqs_per_iaa) < iaa_device->n= _wq)) && + (generic->n_wqs < generic->max_wqs)) { + + generic->wqs[generic->n_wqs++] =3D iaa_wq->wq; + pr_debug("iaa_device %p: added decomp wq %d.%d\n", iaa_device, iaa_devi= ce->idxd->id, iaa_wq->wq->id); + } else { + if (WARN_ON(for_comps->n_wqs =3D=3D for_comps->max_wqs)) + break; + + for_comps->wqs[for_comps->n_wqs++] =3D iaa_wq->wq; + pr_debug("iaa_device %p: added comp wq %d.%d\n", iaa_device, iaa_device= ->idxd->id, iaa_wq->wq->id); + } + + iaa_wq->mapped =3D true; + ++n_wqs_added; } =20 - if (!n_wqs_added) { - pr_debug("couldn't find any iaa wqs!\n"); + if (!n_wqs_added && !iaa_device->n_wq) { + pr_debug("iaa_device %d: couldn't find any iaa wqs!\n", iaa_device->idxd= ->id); ret =3D -EINVAL; - goto out; } -out: + return ret; } =20 +static void map_iaa_devices(void) +{ + struct 
iaa_device *iaa_device; + + list_for_each_entry(iaa_device, &iaa_devices, list) { + BUG_ON(map_iaa_device_wqs(iaa_device)); + } +} + /* - * Rebalance the wq table so that given a cpu, it's easy to find the - * closest IAA instance. The idea is to try to choose the most - * appropriate IAA instance for a caller and spread available - * workqueues around to clients. + * Rebalance the per-cpu wq table based on available IAA devices/WQs. + * Three driver parameters control how this algorithm works: + * + * - g_comp_wqs_per_iaa: + * + * If multiple WQs are configured for a given device, this setting deter= mines + * the number of WQs to be used as "compress only" WQs. The remaining WQ= s will + * be used as "decompress only WQs". + * Note that the comp WQ can be the same as the decomp WQ, for e.g., if + * g_comp_wqs_per_iaa is 0 (regardless of the # of available WQs per dev= ice), or, + * if there is only 1 WQ configured for a device (regardless of + * g_comp_wqs_per_iaa). + * + * - distribute_decomps, distribute_comps: + * + * If this is enabled, all [de]comp WQs found from the IAA devices on a + * package, will be aggregated into pkg_global_[de]comp_wqs, then assign= ed to + * each CPU on the package. + * + * Note: + * ----- + * rebalance_wq_table() will return true if it was able to successfully + * configure comp/decomp wqs for all CPUs, without changing the + * @iaa_crypto_enabled atomic. The caller can re-enable the use of the wq + * tables after rebalance_wq_table() returns true, by setting the + * @iaa_crypto_enabled atomic to 1. + * In case of any errors, the @iaa_crypto_enabled atomic will be set to 0, + * and rebalance_wq_table() will return false. */ -static void rebalance_wq_table(void) +static bool rebalance_wq_table(void) { - const struct cpumask *node_cpus; - int node_cpu, node, cpu, iaa =3D 0; + int cpu; =20 - if (nr_iaa =3D=3D 0) - return; + if (atomic_read(&nr_iaa) =3D=3D 0) + goto err; =20 - pr_debug("rebalance: nr_nodes=3D%d, nr_cpus %d, nr_iaa %d, cpus_per_iaa %= d\n", - nr_nodes, nr_cpus, nr_iaa, cpus_per_iaa); + map_iaa_devices(); =20 - clear_wq_table(); + pr_info("rebalance: nr_packages=3D%d, nr_cpus %d, nr_iaa %d, nr_iaa_per_p= ackage %d, cpus_per_iaa %d\n", + nr_packages, nr_cpus, atomic_read(&nr_iaa), + atomic_read(&nr_iaa_per_package), atomic_read(&cpus_per_iaa)); =20 - if (nr_iaa =3D=3D 1) { - for_each_possible_cpu(cpu) { - if (WARN_ON(wq_table_add_wqs(0, cpu))) - goto err; - } + if (iaa_distribute_decomps) { + /* Each CPU uses all IAA devices on package for decomps. */ + if (!reinit_pkg_global_wqs(false)) + goto err; + create_cpu_wq_table_from_pkg_wqs(false); + } else { + /* + * Each CPU uses the decomp WQ on the mapped IAA device using + * a balanced mapping of cores to IAA. + */ + create_cpu_wq_table_from_mapped_device(false); + } =20 - return; + if (iaa_distribute_comps) { + /* Each CPU uses all IAA devices on package for comps. */ + if (!reinit_pkg_global_wqs(true)) + goto err; + create_cpu_wq_table_from_pkg_wqs(true); + } else { + /* + * Each CPU uses the comp WQ on the mapped IAA device using + * a balanced mapping of cores to IAA. 
+ */ + create_cpu_wq_table_from_mapped_device(true); } =20 - for_each_node_with_cpus(node) { - cpu =3D 0; - node_cpus =3D cpumask_of_node(node); + /* Verify that each cpu has comp and decomp wqs.*/ + for (cpu =3D 0; cpu < nr_cpus; cpu++) { + struct wq_table_entry *entry =3D per_cpu_ptr(cpu_decomp_wqs, cpu); =20 - for_each_cpu(node_cpu, node_cpus) { - iaa =3D cpu / cpus_per_iaa; - if (WARN_ON(wq_table_add_wqs(iaa, node_cpu))) - goto err; - cpu++; + if (!entry->wqs || !entry->n_wqs) { + pr_err("%s: cpu %d does not have decomp_wqs", __func__, cpu); + goto err; + } + + entry =3D per_cpu_ptr(cpu_comp_wqs, cpu); + if (!entry->wqs || !entry->n_wqs) { + pr_err("%s: cpu %d does not have comp_wqs", __func__, cpu); + goto err; } } =20 - return; + pr_debug("Finished rebalance decomp/comp wqs."); + return true; + err: - pr_debug("could not add any wqs for iaa %d to cpu %d!\n", iaa, cpu); + atomic_set(&iaa_crypto_enabled, 0); + pr_debug("Error during rebalance decomp/comp wqs."); + return false; } =20 /*************************************************************** * Assign work-queues for driver ops using per-cpu wq_tables. ***************************************************************/ =20 -static struct idxd_wq *wq_table_next_wq(int cpu) +static struct idxd_wq *decomp_wq_table_next_wq(int cpu) { - struct wq_table_entry *entry =3D per_cpu_ptr(wq_table, cpu); + struct wq_table_entry *entry =3D per_cpu_ptr(cpu_decomp_wqs, cpu); + struct idxd_wq *wq; + + if (!atomic_read(&iaa_crypto_enabled)) + return NULL; + + wq =3D entry->wqs[entry->cur_wq]; =20 - if (++entry->cur_wq >=3D entry->n_wqs) + if (++entry->cur_wq =3D=3D entry->n_wqs) entry->cur_wq =3D 0; =20 - if (!entry->wqs[entry->cur_wq]) + return wq; +} + +static struct idxd_wq *comp_wq_table_next_wq(int cpu) +{ + struct wq_table_entry *entry =3D per_cpu_ptr(cpu_comp_wqs, cpu); + struct idxd_wq *wq; + + if (!atomic_read(&iaa_crypto_enabled)) return NULL; =20 - pr_debug("%s: returning wq at idx %d (iaa wq %d.%d) from cpu %d\n", __fun= c__, - entry->cur_wq, entry->wqs[entry->cur_wq]->idxd->id, - entry->wqs[entry->cur_wq]->id, cpu); + wq =3D entry->wqs[entry->cur_wq]; =20 - return entry->wqs[entry->cur_wq]; + if (++entry->cur_wq =3D=3D entry->n_wqs) + entry->cur_wq =3D 0; + + return wq; } =20 /************************************************* @@ -985,7 +1505,7 @@ static inline int check_completion(struct device *dev, dev_err(dev, "%s completion timed out - " "assuming broken hw, iaa_crypto now DISABLED\n", op_str); - iaa_crypto_enabled =3D false; + atomic_set(&iaa_crypto_enabled, 0); ret =3D -ETIMEDOUT; goto out; } @@ -1501,18 +2021,13 @@ static int iaa_comp_acompress(struct acomp_req *req) =20 compression_ctx =3D crypto_tfm_ctx(tfm); =20 - if (!iaa_crypto_enabled) { - pr_debug("iaa_crypto disabled, not compressing\n"); - return -ENODEV; - } - if (!req->src || !req->slen) { pr_debug("invalid src, not compressing\n"); return -EINVAL; } =20 cpu =3D get_cpu(); - wq =3D wq_table_next_wq(cpu); + wq =3D comp_wq_table_next_wq(cpu); put_cpu(); if (!wq) { pr_debug("no wq configured for cpu=3D%d\n", cpu); @@ -1599,18 +2114,13 @@ static int iaa_comp_adecompress(struct acomp_req *r= eq) struct device *dev; struct idxd_wq *wq; =20 - if (!iaa_crypto_enabled) { - pr_debug("iaa_crypto disabled, not decompressing\n"); - return -ENODEV; - } - if (!req->src || !req->slen) { pr_debug("invalid src, not decompressing\n"); return -EINVAL; } =20 cpu =3D get_cpu(); - wq =3D wq_table_next_wq(cpu); + wq =3D decomp_wq_table_next_wq(cpu); put_cpu(); if (!wq) { pr_debug("no wq 
configured for cpu=3D%d\n", cpu); @@ -1725,6 +2235,8 @@ static int iaa_register_compression_device(void) =20 static int iaa_unregister_compression_device(void) { + atomic_set(&iaa_crypto_enabled, 0); + if (iaa_crypto_registered) crypto_unregister_acomp(&iaa_acomp_fixed_deflate); =20 @@ -1746,10 +2258,13 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_d= ev) if (data->type !=3D IDXD_TYPE_IAX) return -ENODEV; =20 + mutex_lock(&iaa_devices_lock); + mutex_lock(&wq->wq_lock); =20 if (idxd_wq_get_private(wq)) { mutex_unlock(&wq->wq_lock); + mutex_unlock(&iaa_devices_lock); return -EBUSY; } =20 @@ -1771,8 +2286,6 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_dev) goto err; } =20 - mutex_lock(&iaa_devices_lock); - if (list_empty(&iaa_devices)) { ret =3D alloc_wq_table(wq->idxd->max_wqs); if (ret) @@ -1784,24 +2297,33 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_d= ev) if (ret) goto err_save; =20 - rebalance_wq_table(); + if (!rebalance_wq_table()) { + dev_dbg(dev, "%s: IAA rebalancing device wq tables failed\n", __func__); + goto err_register; + } + atomic_set(&iaa_crypto_enabled, 1); =20 if (first_wq) { - iaa_crypto_enabled =3D true; ret =3D iaa_register_compression_device(); if (ret !=3D 0) { - iaa_crypto_enabled =3D false; dev_dbg(dev, "IAA compression device registration failed\n"); goto err_register; } + + if (!rebalance_wq_table()) { + dev_dbg(dev, "%s: Rerun after registration: IAA rebalancing device wq t= ables failed\n", __func__); + goto err_register; + } + atomic_set(&iaa_crypto_enabled, 1); + try_module_get(THIS_MODULE); =20 pr_info("iaa_crypto now ENABLED\n"); } =20 - mutex_unlock(&iaa_devices_lock); out: mutex_unlock(&wq->wq_lock); + mutex_unlock(&iaa_devices_lock); =20 return ret; =20 @@ -1810,9 +2332,8 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_dev) free_iaa_wq(idxd_wq_get_private(wq)); err_save: if (first_wq) - free_wq_table(); + free_wq_tables(); err_alloc: - mutex_unlock(&iaa_devices_lock); idxd_drv_disable_wq(wq); err: wq->type =3D IDXD_WQT_NONE; @@ -1827,13 +2348,17 @@ static void iaa_crypto_remove(struct idxd_dev *idxd= _dev) struct iaa_wq *iaa_wq; bool free =3D false; =20 + atomic_set(&iaa_crypto_enabled, 0); idxd_wq_quiesce(wq); =20 - mutex_lock(&wq->wq_lock); mutex_lock(&iaa_devices_lock); + mutex_lock(&wq->wq_lock); =20 remove_iaa_wq(wq); =20 + if (!rebalance_wq_table()) + pr_debug("%s: IAA rebalancing device wq tables failed\n", __func__); + spin_lock(&idxd->dev_lock); iaa_wq =3D idxd_wq_get_private(wq); if (!iaa_wq) { @@ -1856,18 +2381,22 @@ static void iaa_crypto_remove(struct idxd_dev *idxd= _dev) } =20 idxd_drv_disable_wq(wq); - rebalance_wq_table(); =20 - if (nr_iaa =3D=3D 0) { - iaa_crypto_enabled =3D false; - free_wq_table(); + if (atomic_read(&nr_iaa) =3D=3D 0) { + atomic_set(&iaa_crypto_enabled, 0); + pkg_global_wqs_dealloc(); + free_wq_tables(); + BUG_ON(!list_empty(&iaa_devices)); + INIT_LIST_HEAD(&iaa_devices); module_put(THIS_MODULE); =20 pr_info("iaa_crypto now DISABLED\n"); + } else { + atomic_set(&iaa_crypto_enabled, 1); } out: - mutex_unlock(&iaa_devices_lock); mutex_unlock(&wq->wq_lock); + mutex_unlock(&iaa_devices_lock); } =20 static enum idxd_dev_type dev_types[] =3D { @@ -1890,16 +2419,12 @@ static struct idxd_device_driver iaa_crypto_driver = =3D { static int __init iaa_crypto_init_module(void) { int ret =3D 0; - int node; + + INIT_LIST_HEAD(&iaa_devices); =20 nr_cpus =3D num_possible_cpus(); - for_each_node_with_cpus(node) - nr_nodes++; - if (!nr_nodes) { - pr_err("IAA couldn't find any nodes with cpus\n"); - return 
-ENODEV;
-	}
-	nr_cpus_per_node = nr_cpus / nr_nodes;
+	nr_cpus_per_package = topology_num_cores_per_package();
+	nr_packages = topology_max_packages();
 
 	ret = iaa_aecs_init_fixed();
 	if (ret < 0) {
@@ -1913,6 +2438,27 @@ static int __init iaa_crypto_init_module(void)
 		goto err_driver_reg;
 	}
 
+	ret = driver_create_file(&iaa_crypto_driver.drv,
+				 &driver_attr_g_comp_wqs_per_iaa);
+	if (ret) {
+		pr_debug("IAA g_comp_wqs_per_iaa attr creation failed\n");
+		goto err_g_comp_wqs_per_iaa_attr_create;
+	}
+
+	ret = driver_create_file(&iaa_crypto_driver.drv,
+				 &driver_attr_distribute_decomps);
+	if (ret) {
+		pr_debug("IAA distribute_decomps attr creation failed\n");
+		goto err_distribute_decomps_attr_create;
+	}
+
+	ret = driver_create_file(&iaa_crypto_driver.drv,
+				 &driver_attr_distribute_comps);
+	if (ret) {
+		pr_debug("IAA distribute_comps attr creation failed\n");
+		goto err_distribute_comps_attr_create;
+	}
+
 	ret = driver_create_file(&iaa_crypto_driver.drv,
 				 &driver_attr_verify_compress);
 	if (ret) {
@@ -1938,6 +2484,15 @@
 	driver_remove_file(&iaa_crypto_driver.drv,
 			   &driver_attr_verify_compress);
 err_verify_attr_create:
+	driver_remove_file(&iaa_crypto_driver.drv,
+			   &driver_attr_distribute_comps);
+err_distribute_comps_attr_create:
+	driver_remove_file(&iaa_crypto_driver.drv,
+			   &driver_attr_distribute_decomps);
+err_distribute_decomps_attr_create:
+	driver_remove_file(&iaa_crypto_driver.drv,
+			   &driver_attr_g_comp_wqs_per_iaa);
+err_g_comp_wqs_per_iaa_attr_create:
 	idxd_driver_unregister(&iaa_crypto_driver);
 err_driver_reg:
 	iaa_aecs_cleanup_fixed();
@@ -1956,6 +2511,12 @@ static void __exit iaa_crypto_cleanup_module(void)
 			   &driver_attr_sync_mode);
 	driver_remove_file(&iaa_crypto_driver.drv,
 			   &driver_attr_verify_compress);
+	driver_remove_file(&iaa_crypto_driver.drv,
+			   &driver_attr_distribute_comps);
+	driver_remove_file(&iaa_crypto_driver.drv,
+			   &driver_attr_distribute_decomps);
+	driver_remove_file(&iaa_crypto_driver.drv,
+			   &driver_attr_g_comp_wqs_per_iaa);
 	idxd_driver_unregister(&iaa_crypto_driver);
 	iaa_aecs_cleanup_fixed();
 
-- 
2.27.0
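Taken together, the rebalancing code in the patch above reduces each per-CPU device choice to integer arithmetic: the CPU's package selects a base IAA, and the CPU's offset within the package selects a device relative to that base. The stand-alone sketch below models that cpu_to_iaa() mapping; the topology constants are made-up assumptions for illustration, and the plain division is a user-space stand-in for topology_logical_package_id():

/*
 * Model of the driver's cpu_to_iaa() mapping, under an assumed topology:
 * 2 packages, 56 CPUs per package, 2 IAA devices per package.
 * Build with: cc -std=c99 -o cpu2iaa cpu2iaa.c
 */
#include <stdio.h>

static const int nr_packages = 2;
static const int nr_cpus_per_package = 56;
static const int nr_iaa_per_package = 2;
static const int cpus_per_iaa = (2 * 56) / (2 * 2);	/* total CPUs / total IAAs */

static int cpu_to_iaa(int cpu)
{
	int package_id = cpu / nr_cpus_per_package;	/* stand-in for topology_logical_package_id() */
	int base_iaa = package_id * nr_iaa_per_package;

	return base_iaa + (cpu % nr_cpus_per_package) / cpus_per_iaa;
}

int main(void)
{
	for (int cpu = 0; cpu < nr_packages * nr_cpus_per_package; cpu += 14)
		printf("cpu %3d -> iaa %d\n", cpu, cpu_to_iaa(cpu));
	return 0;
}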
From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar
Subject: [PATCH v12 03/23] crypto: iaa - Simplify, consistency of function parameters, minor stats bug fix.
Date: Thu, 25 Sep 2025 20:34:42 -0700
Message-Id: <20250926033502.7486-4-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

This patch further simplifies the code in some places and makes it more consistent and readable:

1) Change the iaa_compress_verify() @dlen parameter to be passed by value instead of by pointer, because @dlen is only read, never modified, by this procedure.

2) Simplify the success/error return paths in iaa_compress(), iaa_decompress() and iaa_compress_verify().

3) Delete dev_dbg() statements to make the code more readable.

4) Change the return value on descriptor allocation failure to -ENODEV, for better maintainability.

5) Fix a minor statistics bug in iaa_decompress(): decomp_bytes was getting updated even in case of errors.
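Changes 2) and 4) above amount to giving each op a single descriptor lifecycle with one exit path that frees the descriptor exactly once. The sketch below models that shape in compileable form; the types and helper names are hypothetical stand-ins, not the driver's idxd functions:

/* Compileable model of the single-exit error handling in 2). Build: cc -o errpath errpath.c */
#include <errno.h>
#include <stdlib.h>

struct desc { int hw_ok; };

static struct desc *alloc_desc(void)        { return malloc(sizeof(struct desc)); }
static void free_desc(struct desc *d)       { free(d); }
static int submit_desc(struct desc *d)      { d->hw_ok = 1; return 0; }
static int check_completion(struct desc *d) { return d->hw_ok ? 0 : -EIO; }

static int compress_op(void)
{
	struct desc *d = alloc_desc();
	int ret;

	if (!d)
		return -ENODEV;	/* change 4): one uniform errno for allocation failure */

	ret = submit_desc(d);
	if (ret)
		goto err;

	ret = check_completion(d);
err:
	free_desc(d);		/* single exit: the descriptor is freed exactly once */
	return ret;
}

int main(void)
{
	return compress_op() ? 1 : 0;
}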
Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/iaa_crypto_main.c | 107 +++++---------------- 1 file changed, 22 insertions(+), 85 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index c6db721eaa79..ed3325bb3291 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -1590,7 +1590,7 @@ static int iaa_remap_for_verify(struct device *dev, s= truct iaa_wq *iaa_wq, static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *r= eq, struct idxd_wq *wq, dma_addr_t src_addr, unsigned int slen, - dma_addr_t dst_addr, unsigned int *dlen) + dma_addr_t dst_addr, unsigned int dlen) { struct iaa_device_compression_mode *active_compression_mode; struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); @@ -1614,10 +1614,8 @@ static int iaa_compress_verify(struct crypto_tfm *tf= m, struct acomp_req *req, =20 idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_BLOCK); if (IS_ERR(idxd_desc)) { - dev_dbg(dev, "idxd descriptor allocation failed\n"); - dev_dbg(dev, "iaa compress failed: ret=3D%ld\n", - PTR_ERR(idxd_desc)); - return PTR_ERR(idxd_desc); + dev_dbg(dev, "iaa compress_verify failed: idxd descriptor allocation fai= lure: ret=3D%ld\n", PTR_ERR(idxd_desc)); + return -ENODEV; } desc =3D idxd_desc->iax_hw; =20 @@ -1629,19 +1627,11 @@ static int iaa_compress_verify(struct crypto_tfm *t= fm, struct acomp_req *req, desc->priv =3D 0; =20 desc->src1_addr =3D (u64)dst_addr; - desc->src1_size =3D *dlen; + desc->src1_size =3D dlen; desc->dst_addr =3D (u64)src_addr; desc->max_dst_size =3D slen; desc->completion_addr =3D idxd_desc->compl_dma; =20 - dev_dbg(dev, "(verify) compression mode %s," - " desc->src1_addr %llx, desc->src1_size %d," - " desc->dst_addr %llx, desc->max_dst_size %d," - " desc->src2_addr %llx, desc->src2_size %d\n", - active_compression_mode->name, - desc->src1_addr, desc->src1_size, desc->dst_addr, - desc->max_dst_size, desc->src2_addr, desc->src2_size); - ret =3D idxd_submit_desc(wq, idxd_desc); if (ret) { dev_dbg(dev, "submit_desc (verify) failed ret=3D%d\n", ret); @@ -1664,14 +1654,10 @@ static int iaa_compress_verify(struct crypto_tfm *t= fm, struct acomp_req *req, goto err; } =20 - idxd_free_desc(wq, idxd_desc); -out: - return ret; err: idxd_free_desc(wq, idxd_desc); - dev_dbg(dev, "iaa compress failed: ret=3D%d\n", ret); =20 - goto out; + return ret; } =20 static void iaa_desc_complete(struct idxd_desc *idxd_desc, @@ -1751,7 +1737,7 @@ static void iaa_desc_complete(struct idxd_desc *idxd_= desc, } =20 ret =3D iaa_compress_verify(ctx->tfm, ctx->req, iaa_wq->wq, src_addr, - ctx->req->slen, dst_addr, &ctx->req->dlen); + ctx->req->slen, dst_addr, ctx->req->dlen); if (ret) { dev_dbg(dev, "%s: compress verify failed ret=3D%d\n", __func__, ret); err =3D -EIO; @@ -1777,7 +1763,7 @@ static void iaa_desc_complete(struct idxd_desc *idxd_= desc, iaa_wq_put(idxd_desc->wq); } =20 -static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req, +static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req, struct idxd_wq *wq, dma_addr_t src_addr, unsigned int slen, dma_addr_t dst_addr, unsigned int *dlen) @@ -1804,9 +1790,9 @@ static int iaa_compress(struct crypto_tfm *tfm, struc= t acomp_req *req, =20 idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_BLOCK); if (IS_ERR(idxd_desc)) { - dev_dbg(dev, "idxd descriptor allocation failed\n"); - dev_dbg(dev, "iaa compress failed: ret=3D%ld\n", PTR_ERR(idxd_desc)); - return PTR_ERR(idxd_desc); + dev_dbg(dev, "iaa 
compress failed: idxd descriptor allocation failure: r= et=3D%ld\n", + PTR_ERR(idxd_desc)); + return -ENODEV; } desc =3D idxd_desc->iax_hw; =20 @@ -1832,21 +1818,8 @@ static int iaa_compress(struct crypto_tfm *tfm, stru= ct acomp_req *req, idxd_desc->crypto.src_addr =3D src_addr; idxd_desc->crypto.dst_addr =3D dst_addr; idxd_desc->crypto.compress =3D true; - - dev_dbg(dev, "%s use_async_irq: compression mode %s," - " src_addr %llx, dst_addr %llx\n", __func__, - active_compression_mode->name, - src_addr, dst_addr); } =20 - dev_dbg(dev, "%s: compression mode %s," - " desc->src1_addr %llx, desc->src1_size %d," - " desc->dst_addr %llx, desc->max_dst_size %d," - " desc->src2_addr %llx, desc->src2_size %d\n", __func__, - active_compression_mode->name, - desc->src1_addr, desc->src1_size, desc->dst_addr, - desc->max_dst_size, desc->src2_addr, desc->src2_size); - ret =3D idxd_submit_desc(wq, idxd_desc); if (ret) { dev_dbg(dev, "submit_desc failed ret=3D%d\n", ret); @@ -1859,7 +1832,6 @@ static int iaa_compress(struct crypto_tfm *tfm, struc= t acomp_req *req, =20 if (ctx->async_mode) { ret =3D -EINPROGRESS; - dev_dbg(dev, "%s: returning -EINPROGRESS\n", __func__); goto out; } =20 @@ -1877,15 +1849,10 @@ static int iaa_compress(struct crypto_tfm *tfm, str= uct acomp_req *req, =20 *compression_crc =3D idxd_desc->iax_completion->crc; =20 - if (!ctx->async_mode) - idxd_free_desc(wq, idxd_desc); -out: - return ret; err: idxd_free_desc(wq, idxd_desc); - dev_dbg(dev, "iaa compress failed: ret=3D%d\n", ret); - - goto out; +out: + return ret; } =20 static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req, @@ -1914,10 +1881,10 @@ static int iaa_decompress(struct crypto_tfm *tfm, s= truct acomp_req *req, =20 idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_BLOCK); if (IS_ERR(idxd_desc)) { - dev_dbg(dev, "idxd descriptor allocation failed\n"); - dev_dbg(dev, "iaa decompress failed: ret=3D%ld\n", + ret =3D -ENODEV; + dev_dbg(dev, "%s: idxd descriptor allocation failed: ret=3D%ld\n", __fun= c__, PTR_ERR(idxd_desc)); - return PTR_ERR(idxd_desc); + return ret; } desc =3D idxd_desc->iax_hw; =20 @@ -1941,21 +1908,8 @@ static int iaa_decompress(struct crypto_tfm *tfm, st= ruct acomp_req *req, idxd_desc->crypto.src_addr =3D src_addr; idxd_desc->crypto.dst_addr =3D dst_addr; idxd_desc->crypto.compress =3D false; - - dev_dbg(dev, "%s: use_async_irq compression mode %s," - " src_addr %llx, dst_addr %llx\n", __func__, - active_compression_mode->name, - src_addr, dst_addr); } =20 - dev_dbg(dev, "%s: decompression mode %s," - " desc->src1_addr %llx, desc->src1_size %d," - " desc->dst_addr %llx, desc->max_dst_size %d," - " desc->src2_addr %llx, desc->src2_size %d\n", __func__, - active_compression_mode->name, - desc->src1_addr, desc->src1_size, desc->dst_addr, - desc->max_dst_size, desc->src2_addr, desc->src2_size); - ret =3D idxd_submit_desc(wq, idxd_desc); if (ret) { dev_dbg(dev, "submit_desc failed ret=3D%d\n", ret); @@ -1968,7 +1922,6 @@ static int iaa_decompress(struct crypto_tfm *tfm, str= uct acomp_req *req, =20 if (ctx->async_mode) { ret =3D -EINPROGRESS; - dev_dbg(dev, "%s: returning -EINPROGRESS\n", __func__); goto out; } =20 @@ -1990,23 +1943,19 @@ static int iaa_decompress(struct crypto_tfm *tfm, s= truct acomp_req *req, } } else { req->dlen =3D idxd_desc->iax_completion->output_size; + + /* Update stats */ + update_total_decomp_bytes_in(slen); + update_wq_decomp_bytes(wq, slen); } =20 *dlen =3D req->dlen; =20 - if (!ctx->async_mode) +err: + if (idxd_desc) idxd_free_desc(wq, idxd_desc); - - /* Update stats 
*/
-	update_total_decomp_bytes_in(slen);
-	update_wq_decomp_bytes(wq, slen);
 out:
 	return ret;
-err:
-	idxd_free_desc(wq, idxd_desc);
-	dev_dbg(dev, "iaa decompress failed: ret=%d\n", ret);
-
-	goto out;
 }
 
 static int iaa_comp_acompress(struct acomp_req *req)
@@ -2053,9 +2002,6 @@ static int iaa_comp_acompress(struct acomp_req *req)
 		goto out;
 	}
 	src_addr = sg_dma_address(req->src);
-	dev_dbg(dev, "dma_map_sg, src_addr %llx, nr_sgs %d, req->src %p,"
-		" req->slen %d, sg_dma_len(sg) %d\n", src_addr, nr_sgs,
-		req->src, req->slen, sg_dma_len(req->src));
 
 	nr_sgs = dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
 	if (nr_sgs <= 0 || nr_sgs > 1) {
@@ -2066,9 +2012,6 @@ static int iaa_comp_acompress(struct acomp_req *req)
 		goto err_map_dst;
 	}
 	dst_addr = sg_dma_address(req->dst);
-	dev_dbg(dev, "dma_map_sg, dst_addr %llx, nr_sgs %d, req->dst %p,"
-		" req->dlen %d, sg_dma_len(sg) %d\n", dst_addr, nr_sgs,
-		req->dst, req->dlen, sg_dma_len(req->dst));
 
 	ret = iaa_compress(tfm, req, wq, src_addr, req->slen, dst_addr,
 			   &req->dlen);
@@ -2083,7 +2026,7 @@ static int iaa_comp_acompress(struct acomp_req *req)
 	}
 
 	ret = iaa_compress_verify(tfm, req, wq, src_addr, req->slen,
-				  dst_addr, &req->dlen);
+				  dst_addr, req->dlen);
 	if (ret)
 		dev_dbg(dev, "asynchronous compress verification failed ret=%d\n", ret);
 
@@ -2146,9 +2089,6 @@ static int iaa_comp_adecompress(struct acomp_req *req)
 		goto out;
 	}
 	src_addr = sg_dma_address(req->src);
-	dev_dbg(dev, "dma_map_sg, src_addr %llx, nr_sgs %d, req->src %p,"
-		" req->slen %d, sg_dma_len(sg) %d\n", src_addr, nr_sgs,
-		req->src, req->slen, sg_dma_len(req->src));
 
 	nr_sgs = dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
 	if (nr_sgs <= 0 || nr_sgs > 1) {
@@ -2159,9 +2099,6 @@ static int iaa_comp_adecompress(struct acomp_req *req)
 		goto err_map_dst;
 	}
 	dst_addr = sg_dma_address(req->dst);
-	dev_dbg(dev, "dma_map_sg, dst_addr %llx, nr_sgs %d, req->dst %p,"
-		" req->dlen %d, sg_dma_len(sg) %d\n", dst_addr, nr_sgs,
-		req->dst, req->dlen, sg_dma_len(req->dst));
 
 	ret = iaa_decompress(tfm, req, wq, src_addr, req->slen,
 			     dst_addr, &req->dlen);
-- 
2.27.0

From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar
Subject: [PATCH v12 04/23] crypto: iaa - Descriptor allocation timeouts with mitigations.
Date: Thu, 25 Sep 2025 20:34:43 -0700
Message-Id: <20250926033502.7486-5-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

This patch modifies the descriptor allocation from blocking to non-blocking with bounded retries or "timeouts".

This is necessary to prevent task-blocked errors in high-contention scenarios, for instance, when the platform has only one IAA device enabled. With one IAA device enabled per package on a dual-package Sapphire Rapids with 56 cores per package, there are 112 logical cores mapped to this single IAA device. In this scenario, task-blocked errors can occur because idxd_alloc_desc() is called with IDXD_OP_BLOCK. With batching, multiple descriptors need to be allocated per batch; any process that is able to do so can cause contention in descriptor allocation for all other processes that share the same sbitmap_queue. Under IDXD_OP_BLOCK, this causes compress/decompress jobs to stall in stress-test scenarios (e.g., zswap_store() of 2M folios).

To make the iaa_crypto driver more fail-safe, this commit implements the following:

1) Change compress/decompress descriptor allocations to be non-blocking with retries ("timeouts").

2) Return a compress error to zswap if descriptor allocation with timeouts fails during compress ops. zswap_store() will return an error and the folio gets stored in the backing swap device.

3) Fall back to software decompress if descriptor allocation with timeouts fails during decompress ops.

With these fixes, no task-blocked errors are seen under stress-testing conditions, and no performance degradation is observed.
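The mitigation is a bounded retry loop around a non-blocking allocation. Below is a compileable model of that loop; the allocator, the simulated contention, and the retry bound are illustrative assumptions, whereas the driver itself retries idxd_alloc_desc(wq, IDXD_OP_NONBLOCK) up to a per-context timeout, with cpu_relax() between attempts:

/* Model of bounded, non-blocking descriptor allocation. Build: cc -o retry retry.c */
#include <stdio.h>
#include <stdlib.h>

#define ALLOC_DESC_TIMEOUT 1000	/* a retry count, not wall-clock time */

/* Model a contended allocator: fail a few times, then succeed. */
static void *try_alloc_nonblock(void)
{
	static int contention = 3;

	if (contention-- > 0)
		return NULL;	/* the driver sees ERR_PTR(-EAGAIN) here */
	return malloc(64);
}

static void *alloc_desc_bounded(void)
{
	void *desc = NULL;
	unsigned int retries = 0;

	while (!desc && retries++ < ALLOC_DESC_TIMEOUT) {
		desc = try_alloc_nonblock();
		/* the driver calls cpu_relax() between attempts */
	}

	return desc;	/* NULL: compress returns -ENODEV; decompress falls back to software */
}

int main(void)
{
	void *desc = alloc_desc_bounded();

	printf("descriptor %s\n", desc ? "allocated" : "unavailable");
	free(desc);
	return 0;
}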
Signed-off-by: Kanchana P Sridhar
---
 drivers/crypto/intel/iaa/iaa_crypto.h      |  5 ++
 drivers/crypto/intel/iaa/iaa_crypto_main.c | 58 +++++++++++++-------
 2 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/iaa/iaa_crypto.h
index 549ac98a9366..cc76a047b54a 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto.h
@@ -21,6 +21,9 @@
 
 #define IAA_COMPLETION_TIMEOUT 1000000
 
+#define IAA_ALLOC_DESC_COMP_TIMEOUT 1000
+#define IAA_ALLOC_DESC_DECOMP_TIMEOUT 500
+
 #define IAA_ANALYTICS_ERROR 0x0a
 #define IAA_ERROR_DECOMP_BUF_OVERFLOW 0x0b
 #define IAA_ERROR_COMP_BUF_OVERFLOW 0x19
@@ -141,6 +144,8 @@ enum iaa_mode {
 
 struct iaa_compression_ctx {
 	enum iaa_mode mode;
+	u16 alloc_comp_desc_timeout;
+	u16 alloc_decomp_desc_timeout;
 	bool verify_compress;
 	bool async_mode;
 	bool use_irq;
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index ed3325bb3291..1169cd44c8e7 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -1596,7 +1596,8 @@ static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
 	struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
 	u32 *compression_crc = acomp_request_ctx(req);
 	struct iaa_device *iaa_device;
-	struct idxd_desc *idxd_desc;
+	struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
+	u16 alloc_desc_retries = 0;
 	struct iax_hw_desc *desc;
 	struct idxd_device *idxd;
 	struct iaa_wq *iaa_wq;
@@ -1612,7 +1613,11 @@ static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
 
 	active_compression_mode = get_iaa_device_compression_mode(iaa_device, ctx->mode);
 
-	idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
+	while ((idxd_desc == ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx->alloc_decomp_desc_timeout)) {
+		idxd_desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK);
+		cpu_relax();
+	}
+
 	if (IS_ERR(idxd_desc)) {
 		dev_dbg(dev, "iaa compress_verify failed: idxd descriptor allocation failure: ret=%ld\n", PTR_ERR(idxd_desc));
 		return -ENODEV;
@@ -1772,7 +1777,8 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
 	struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
 	u32 *compression_crc = acomp_request_ctx(req);
 	struct iaa_device *iaa_device;
-	struct idxd_desc *idxd_desc;
+	struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
+	u16 alloc_desc_retries = 0;
 	struct iax_hw_desc *desc;
 	struct idxd_device *idxd;
 	struct iaa_wq *iaa_wq;
@@ -1788,7 +1794,11 @@ static int iaa_compress(struct crypto_tfm *tfm,
stru= ct acomp_req *req, =20 active_compression_mode =3D get_iaa_device_compression_mode(iaa_device, c= tx->mode); =20 - idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_BLOCK); + while ((idxd_desc =3D=3D ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx= ->alloc_comp_desc_timeout)) { + idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_NONBLOCK); + cpu_relax(); + } + if (IS_ERR(idxd_desc)) { dev_dbg(dev, "iaa compress failed: idxd descriptor allocation failure: r= et=3D%ld\n", PTR_ERR(idxd_desc)); @@ -1863,7 +1873,8 @@ static int iaa_decompress(struct crypto_tfm *tfm, str= uct acomp_req *req, struct iaa_device_compression_mode *active_compression_mode; struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); struct iaa_device *iaa_device; - struct idxd_desc *idxd_desc; + struct idxd_desc *idxd_desc =3D ERR_PTR(-EAGAIN); + u16 alloc_desc_retries =3D 0; struct iax_hw_desc *desc; struct idxd_device *idxd; struct iaa_wq *iaa_wq; @@ -1879,12 +1890,17 @@ static int iaa_decompress(struct crypto_tfm *tfm, s= truct acomp_req *req, =20 active_compression_mode =3D get_iaa_device_compression_mode(iaa_device, c= tx->mode); =20 - idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_BLOCK); + while ((idxd_desc =3D=3D ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx= ->alloc_decomp_desc_timeout)) { + idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_NONBLOCK); + cpu_relax(); + } + if (IS_ERR(idxd_desc)) { ret =3D -ENODEV; dev_dbg(dev, "%s: idxd descriptor allocation failed: ret=3D%ld\n", __fun= c__, PTR_ERR(idxd_desc)); - return ret; + idxd_desc =3D NULL; + goto fallback_software_decomp; } desc =3D idxd_desc->iax_hw; =20 @@ -1913,7 +1929,7 @@ static int iaa_decompress(struct crypto_tfm *tfm, str= uct acomp_req *req, ret =3D idxd_submit_desc(wq, idxd_desc); if (ret) { dev_dbg(dev, "submit_desc failed ret=3D%d\n", ret); - goto err; + goto fallback_software_decomp; } =20 /* Update stats */ @@ -1926,19 +1942,21 @@ static int iaa_decompress(struct crypto_tfm *tfm, s= truct acomp_req *req, } =20 ret =3D check_completion(dev, idxd_desc->iax_completion, false, false); + +fallback_software_decomp: if (ret) { - dev_dbg(dev, "%s: check_completion failed ret=3D%d\n", __func__, ret); - if (idxd_desc->iax_completion->status =3D=3D IAA_ANALYTICS_ERROR) { + dev_dbg(dev, "%s: desc allocation/submission/check_completion failed ret= =3D%d\n", __func__, ret); + if (idxd_desc && idxd_desc->iax_completion->status =3D=3D IAA_ANALYTICS_= ERROR) { pr_warn("%s: falling back to deflate-generic decompress, " "analytics error code %x\n", __func__, idxd_desc->iax_completion->error_code); - ret =3D deflate_generic_decompress(req); - if (ret) { - dev_dbg(dev, "%s: deflate-generic failed ret=3D%d\n", - __func__, ret); - goto err; - } - } else { + } + + ret =3D deflate_generic_decompress(req); + + if (ret) { + pr_err("%s: iaa decompress failed: deflate-generic fallback error ret= =3D%d\n", + __func__, ret); goto err; } } else { @@ -2119,6 +2137,8 @@ static int iaa_comp_adecompress(struct acomp_req *req) =20 static void compression_ctx_init(struct iaa_compression_ctx *ctx) { + ctx->alloc_comp_desc_timeout =3D IAA_ALLOC_DESC_COMP_TIMEOUT; + ctx->alloc_decomp_desc_timeout =3D IAA_ALLOC_DESC_DECOMP_TIMEOUT; ctx->verify_compress =3D iaa_verify_compress; ctx->async_mode =3D async_mode; ctx->use_irq =3D use_irq; @@ -2133,10 +2153,10 @@ static int iaa_comp_init_fixed(struct crypto_acomp = *acomp_tfm) struct crypto_tfm *tfm =3D crypto_acomp_tfm(acomp_tfm); struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); =20 - compression_ctx_init(ctx); - ctx->mode =3D IAA_MODE_FIXED; =20 + 
compression_ctx_init(ctx);
+
 	return 0;
 }
 
-- 
2.27.0

From nobody Thu Oct 2 00:50:47 2025
kristen.c.accardi@intel.com, vinicius.gomes@intel.com Cc: wajdi.k.feghali@intel.com, vinodh.gopal@intel.com, kanchana.p.sridhar@intel.com Subject: [PATCH v12 05/23] crypto: iaa - iaa_wq uses percpu_refs for get/put reference counting. Date: Thu, 25 Sep 2025 20:34:44 -0700 Message-Id: <20250926033502.7486-6-kanchana.p.sridhar@intel.com> X-Mailer: git-send-email 2.27.0 In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" This patch modifies the reference counting on "struct iaa_wq" to be a percpu_ref in atomic mode, instead of an "int refcount" combined with the "idxd->dev_lock" spin_lock currently used as a synchronization mechanism to achieve get/put semantics. This enables a more light-weight, cleaner and effective refcount implementation for the iaa_wq, significantly reducing latency per compress/decompress job submitted to the IAA accelerator: p50: -136 ns p99: -880 ns Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/iaa_crypto.h | 4 +- drivers/crypto/intel/iaa/iaa_crypto_main.c | 119 +++++++-------------- 2 files changed, 41 insertions(+), 82 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/i= aa/iaa_crypto.h index cc76a047b54a..9611f2518f42 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto.h +++ b/drivers/crypto/intel/iaa/iaa_crypto.h @@ -47,8 +47,8 @@ struct iaa_wq { struct list_head list; =20 struct idxd_wq *wq; - int ref; - bool remove; + struct percpu_ref ref; + bool free; bool mapped; =20 struct iaa_device *iaa_device; diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index 1169cd44c8e7..5cb7c930158e 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -701,7 +701,7 @@ static void del_iaa_device(struct iaa_device *iaa_devic= e) =20 static void free_iaa_device(struct iaa_device *iaa_device) { - if (!iaa_device) + if (!iaa_device || iaa_device->n_wq) return; =20 remove_device_compression_modes(iaa_device); @@ -731,6 +731,13 @@ static bool iaa_has_wq(struct iaa_device *iaa_device, = struct idxd_wq *wq) return false; } =20 +static void __iaa_wq_release(struct percpu_ref *ref) +{ + struct iaa_wq *iaa_wq =3D container_of(ref, typeof(*iaa_wq), ref); + + iaa_wq->free =3D true; +} + static int add_iaa_wq(struct iaa_device *iaa_device, struct idxd_wq *wq, struct iaa_wq **new_wq) { @@ -738,11 +745,20 @@ static int add_iaa_wq(struct iaa_device *iaa_device, = struct idxd_wq *wq, struct pci_dev *pdev =3D idxd->pdev; struct device *dev =3D &pdev->dev; struct iaa_wq *iaa_wq; + int ret; =20 iaa_wq =3D kzalloc(sizeof(*iaa_wq), GFP_KERNEL); if (!iaa_wq) return -ENOMEM; =20 + ret =3D percpu_ref_init(&iaa_wq->ref, __iaa_wq_release, + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); + + if (ret) { + kfree(iaa_wq); + return -ENOMEM; + } + iaa_wq->wq =3D wq; iaa_wq->iaa_device =3D iaa_device; idxd_wq_set_private(wq, iaa_wq); @@ -818,6 +834,9 @@ static void __free_iaa_wq(struct iaa_wq *iaa_wq) if (!iaa_wq) return; =20 + WARN_ON(!percpu_ref_is_zero(&iaa_wq->ref)); + percpu_ref_exit(&iaa_wq->ref); + iaa_device =3D iaa_wq->iaa_device; if (iaa_device->n_wq =3D=3D 0) free_iaa_device(iaa_wq->iaa_device); @@ -912,53 +931,6 @@ static int save_iaa_wq(struct idxd_wq *wq) return 0; } =20 
-static int iaa_wq_get(struct idxd_wq *wq) -{ - struct idxd_device *idxd =3D wq->idxd; - struct iaa_wq *iaa_wq; - int ret =3D 0; - - spin_lock(&idxd->dev_lock); - iaa_wq =3D idxd_wq_get_private(wq); - if (iaa_wq && !iaa_wq->remove) { - iaa_wq->ref++; - idxd_wq_get(wq); - } else { - ret =3D -ENODEV; - } - spin_unlock(&idxd->dev_lock); - - return ret; -} - -static int iaa_wq_put(struct idxd_wq *wq) -{ - struct idxd_device *idxd =3D wq->idxd; - struct iaa_wq *iaa_wq; - bool free =3D false; - int ret =3D 0; - - spin_lock(&idxd->dev_lock); - iaa_wq =3D idxd_wq_get_private(wq); - if (iaa_wq) { - iaa_wq->ref--; - if (iaa_wq->ref =3D=3D 0 && iaa_wq->remove) { - idxd_wq_set_private(wq, NULL); - free =3D true; - } - idxd_wq_put(wq); - } else { - ret =3D -ENODEV; - } - spin_unlock(&idxd->dev_lock); - if (free) { - __free_iaa_wq(iaa_wq); - kfree(iaa_wq); - } - - return ret; -} - /*************************************************************** * Mapping IAA devices and wqs to cores with per-cpu wq_tables. ***************************************************************/ @@ -1765,7 +1737,7 @@ static void iaa_desc_complete(struct idxd_desc *idxd_= desc, =20 if (free_desc) idxd_free_desc(idxd_desc->wq, idxd_desc); - iaa_wq_put(idxd_desc->wq); + percpu_ref_put(&iaa_wq->ref); } =20 static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req, @@ -1996,19 +1968,13 @@ static int iaa_comp_acompress(struct acomp_req *req) cpu =3D get_cpu(); wq =3D comp_wq_table_next_wq(cpu); put_cpu(); - if (!wq) { - pr_debug("no wq configured for cpu=3D%d\n", cpu); - return -ENODEV; - } =20 - ret =3D iaa_wq_get(wq); - if (ret) { + iaa_wq =3D wq ? idxd_wq_get_private(wq) : NULL; + if (unlikely(!iaa_wq || !percpu_ref_tryget(&iaa_wq->ref))) { pr_debug("no wq available for cpu=3D%d\n", cpu); return -ENODEV; } =20 - iaa_wq =3D idxd_wq_get_private(wq); - dev =3D &wq->idxd->pdev->dev; =20 nr_sgs =3D dma_map_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); @@ -2061,7 +2027,7 @@ static int iaa_comp_acompress(struct acomp_req *req) err_map_dst: dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); out: - iaa_wq_put(wq); + percpu_ref_put(&iaa_wq->ref); =20 return ret; } @@ -2083,19 +2049,13 @@ static int iaa_comp_adecompress(struct acomp_req *r= eq) cpu =3D get_cpu(); wq =3D decomp_wq_table_next_wq(cpu); put_cpu(); - if (!wq) { - pr_debug("no wq configured for cpu=3D%d\n", cpu); - return -ENODEV; - } =20 - ret =3D iaa_wq_get(wq); - if (ret) { + iaa_wq =3D wq ? 
idxd_wq_get_private(wq) : NULL; + if (unlikely(!iaa_wq || !percpu_ref_tryget(&iaa_wq->ref))) { pr_debug("no wq available for cpu=3D%d\n", cpu); - return -ENODEV; + return deflate_generic_decompress(req); } =20 - iaa_wq =3D idxd_wq_get_private(wq); - dev =3D &wq->idxd->pdev->dev; =20 nr_sgs =3D dma_map_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); @@ -2130,7 +2090,7 @@ static int iaa_comp_adecompress(struct acomp_req *req) err_map_dst: dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); out: - iaa_wq_put(wq); + percpu_ref_put(&iaa_wq->ref); =20 return ret; } @@ -2303,7 +2263,6 @@ static void iaa_crypto_remove(struct idxd_dev *idxd_d= ev) struct idxd_wq *wq =3D idxd_dev_to_wq(idxd_dev); struct idxd_device *idxd =3D wq->idxd; struct iaa_wq *iaa_wq; - bool free =3D false; =20 atomic_set(&iaa_crypto_enabled, 0); idxd_wq_quiesce(wq); @@ -2324,18 +2283,18 @@ static void iaa_crypto_remove(struct idxd_dev *idxd= _dev) goto out; } =20 - if (iaa_wq->ref) { - iaa_wq->remove =3D true; - } else { - wq =3D iaa_wq->wq; - idxd_wq_set_private(wq, NULL); - free =3D true; - } + /* Drop the initial reference. */ + percpu_ref_kill(&iaa_wq->ref); + + while (!iaa_wq->free) + cpu_relax(); + + __free_iaa_wq(iaa_wq); + + idxd_wq_set_private(wq, NULL); spin_unlock(&idxd->dev_lock); - if (free) { - __free_iaa_wq(iaa_wq); - kfree(iaa_wq); - } + + kfree(iaa_wq); =20 idxd_drv_disable_wq(wq); =20 --=20 2.27.0 From nobody Thu Oct 2 00:50:47 2025 Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.13]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AFB632641CA; Fri, 26 Sep 2025 03:35:11 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.13 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857713; cv=none; b=VwrHa299UblwhwdeqoBf9FistcvA7HWxJZQUV2N5taLesT7OdKTlgam9iZU6lLFW5rRwRD0wVVswduWFK8Hl9mDOIjRKwTjYrG83XB4wcMgybzfDzAsQiHHvjlqmLmWxG66MDyq7TP4NDqTA3u9PLQymXZubFnDu6jdfnoD82Wo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857713; c=relaxed/simple; bh=ABrWTOP2f/rtttjKCvKr6tV1H34crqihnx6D1RT2A1I=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=gickpiMAxPK97SzI7ppPI/T11OXpSGTTC4eSAd9sRKlIstuPyBEATxZ1spx271f3x6iY0C4XLyCFSyjKSmQqq+ylEjVPPzeOzUuwHp65iOiBhhaI6C1uuNmNGfxSCNvNqmkPuHPMNDsxtljz1el3h6zpVwVvrFXkZ/tYf2eiTiY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=TqxbjgWP; arc=none smtp.client-ip=192.198.163.13 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="TqxbjgWP" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1758857712; x=1790393712; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=ABrWTOP2f/rtttjKCvKr6tV1H34crqihnx6D1RT2A1I=; b=TqxbjgWPCOtCjafkMzhuym9sqTIrAWd2EI+OIpVL0BoRid6Jo44USJCn rO9lNQR8XK1fF6RUWnNe+XbHa4Vyf4nZNifZLwSZmNenKuJ+SHqSkTxmh viPxaFpe6DgWf6DVInmBYpDSHPL4N/tGBhqxPu02ROstId+XejCtVYVBH 
EGiOw2As4i5Yn8x48S3zoMEiB0+st3Ch7L8mN6T375uHBhHTAccWSxRKw 7nfzwPLWxmLaMKjsoM0qu2yuUTyYsbrxHn2CzNmtSw9Etp0Ot2PEQqhxI 8wNhYnkYRPAW09pREuqjUKWOyy0twjM08pXblznxa4KjY4qW7KgJYagBK g==; From: Kanchana P Sridhar Subject: [PATCH v12 06/23] crypto: iaa - Simplify the code flow in iaa_compress() and iaa_decompress(). Date: Thu, 25 Sep 2025 20:34:45 -0700 Message-Id: <20250926033502.7486-7-kanchana.p.sridhar@intel.com> In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> This commit simplifies and streamlines the logic in the core iaa_compress() and iaa_decompress() routines and eliminates redundant branches. This makes it easier to add improvements such as polling for job completions, which is essential for batching with hardware parallelism.
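The essence of the restructuring is that the common non-irq case becomes one straight-line likely() branch, with the irq-only descriptor setup confined to the unlikely arm. A minimal control-flow sketch, using placeholder helpers (submit(), poll_completion(), save_completion_context()) rather than the actual driver symbols:

	if (likely(!ctx->use_irq)) {
		ret = submit(wq, desc);			/* post to the wq portal */
		if (ret)
			goto out;
		if (ctx->async_mode)
			return -EINPROGRESS;		/* caller polls for completion */
		ret = poll_completion(desc);		/* synchronous wait */
	} else {
		desc->flags |= IDXD_OP_FLAG_RCI;	/* request a completion irq */
		save_completion_context(desc, req);
		ret = submit(wq, desc);
		if (!ret)
			return -EINPROGRESS;		/* finished in iaa_desc_complete() */
	}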
Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/iaa_crypto_main.c | 114 ++++++++++++--------- 1 file changed, 67 insertions(+), 47 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index 5cb7c930158e..38b4be0c10b0 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -1792,7 +1792,34 @@ static int iaa_compress(struct crypto_tfm *tfm, stru= ct acomp_req *req, desc->src2_size =3D sizeof(struct aecs_comp_table_record); desc->completion_addr =3D idxd_desc->compl_dma; =20 - if (ctx->use_irq) { + if (likely(!ctx->use_irq)) { + ret =3D idxd_submit_desc(wq, idxd_desc); + if (ret) { + dev_dbg(dev, "submit_desc failed ret=3D%d\n", ret); + goto out; + } + + /* Update stats */ + update_total_comp_calls(); + update_wq_comp_calls(wq); + + if (ctx->async_mode) + return -EINPROGRESS; + + ret =3D check_completion(dev, idxd_desc->iax_completion, true, false); + if (ret) { + dev_dbg(dev, "check_completion failed ret=3D%d\n", ret); + goto out; + } + + *dlen =3D idxd_desc->iax_completion->output_size; + + /* Update stats */ + update_total_comp_bytes_out(*dlen); + update_wq_comp_bytes(wq, *dlen); + + *compression_crc =3D idxd_desc->iax_completion->crc; + } else { desc->flags |=3D IDXD_OP_FLAG_RCI; =20 idxd_desc->crypto.req =3D req; @@ -1800,40 +1827,23 @@ static int iaa_compress(struct crypto_tfm *tfm, str= uct acomp_req *req, idxd_desc->crypto.src_addr =3D src_addr; idxd_desc->crypto.dst_addr =3D dst_addr; idxd_desc->crypto.compress =3D true; - } - - ret =3D idxd_submit_desc(wq, idxd_desc); - if (ret) { - dev_dbg(dev, "submit_desc failed ret=3D%d\n", ret); - goto err; - } =20 - /* Update stats */ - update_total_comp_calls(); - update_wq_comp_calls(wq); + ret =3D idxd_submit_desc(wq, idxd_desc); + if (ret) { + dev_dbg(dev, "submit_desc failed ret=3D%d\n", ret); + goto out; + } =20 - if (ctx->async_mode) { - ret =3D -EINPROGRESS; - goto out; - } + /* Update stats */ + update_total_comp_calls(); + update_wq_comp_calls(wq); =20 - ret =3D check_completion(dev, idxd_desc->iax_completion, true, false); - if (ret) { - dev_dbg(dev, "check_completion failed ret=3D%d\n", ret); - goto err; + return -EINPROGRESS; } =20 - *dlen =3D idxd_desc->iax_completion->output_size; - - /* Update stats */ - update_total_comp_bytes_out(*dlen); - update_wq_comp_bytes(wq, *dlen); - - *compression_crc =3D idxd_desc->iax_completion->crc; - -err: - idxd_free_desc(wq, idxd_desc); out: + idxd_free_desc(wq, idxd_desc); + return ret; } =20 @@ -1888,7 +1898,22 @@ static int iaa_decompress(struct crypto_tfm *tfm, st= ruct acomp_req *req, desc->src1_size =3D slen; desc->completion_addr =3D idxd_desc->compl_dma; =20 - if (ctx->use_irq) { + if (likely(!ctx->use_irq)) { + ret =3D idxd_submit_desc(wq, idxd_desc); + if (ret) { + dev_dbg(dev, "submit_desc failed ret=3D%d\n", ret); + goto fallback_software_decomp; + } + + /* Update stats */ + update_total_decomp_calls(); + update_wq_decomp_calls(wq); + + if (ctx->async_mode) + return -EINPROGRESS; + + ret =3D check_completion(dev, idxd_desc->iax_completion, false, false); + } else { desc->flags |=3D IDXD_OP_FLAG_RCI; =20 idxd_desc->crypto.req =3D req; @@ -1896,25 +1921,20 @@ static int iaa_decompress(struct crypto_tfm *tfm, s= truct acomp_req *req, idxd_desc->crypto.src_addr =3D src_addr; idxd_desc->crypto.dst_addr =3D dst_addr; idxd_desc->crypto.compress =3D false; - } =20 - ret =3D idxd_submit_desc(wq, idxd_desc); - if (ret) { - dev_dbg(dev, "submit_desc failed 
ret=3D%d\n", ret); - goto fallback_software_decomp; - } + ret =3D idxd_submit_desc(wq, idxd_desc); + if (ret) { + dev_dbg(dev, "submit_desc failed ret=3D%d\n", ret); + goto fallback_software_decomp; + } =20 - /* Update stats */ - update_total_decomp_calls(); - update_wq_decomp_calls(wq); + /* Update stats */ + update_total_decomp_calls(); + update_wq_decomp_calls(wq); =20 - if (ctx->async_mode) { - ret =3D -EINPROGRESS; - goto out; + return -EINPROGRESS; } =20 - ret =3D check_completion(dev, idxd_desc->iax_completion, false, false); - fallback_software_decomp: if (ret) { dev_dbg(dev, "%s: desc allocation/submission/check_completion failed ret= =3D%d\n", __func__, ret); @@ -1929,7 +1949,7 @@ static int iaa_decompress(struct crypto_tfm *tfm, str= uct acomp_req *req, if (ret) { pr_err("%s: iaa decompress failed: deflate-generic fallback error ret= =3D%d\n", __func__, ret); - goto err; + goto out; } } else { req->dlen =3D idxd_desc->iax_completion->output_size; @@ -1941,10 +1961,10 @@ static int iaa_decompress(struct crypto_tfm *tfm, s= truct acomp_req *req, =20 *dlen =3D req->dlen; =20 -err: +out: if (idxd_desc) idxd_free_desc(wq, idxd_desc); -out: + return ret; } =20 --=20 2.27.0 From nobody Thu Oct 2 00:50:47 2025 Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.13]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 056012690EC; Fri, 26 Sep 2025 03:35:11 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.13 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857713; cv=none; b=qc3LcZn+S6GSxQ5ykYQTkLmu/gV1ej33NZeRDv/gKWoJ+6DoYogQhk4JEcfNDnOxSMIvlmzQQelUwcu/8Kc1NL24mz4XZOTwoAX1drl3vwLsPQn/o/401dC/GHweGiNTOSz+0mxYE97vDowMne5Yj3t1yPkaiPWDvS7ZHHL9CLE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857713; c=relaxed/simple; bh=FALZN5fZ5Gqh9fvIgLNDNXJiGlP/yWDeY85+GzyStiw=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=O3iZ3Az++pRGH4kNMUEXK8hfwK+ARvXErLdjgN0tO9MNf07CvPJ2VV56F4yiUHbma7H9gPkeYnKyuRWjIlnrwtIaHTXWSs0SUPLlMl6NEC3omWJQH8Npfhu1JicqBq9ApL3AZYMyWCoGChk14TmXnta44JN0Qnd4MlYSA2hC6ho= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=HT5Z+YqQ; arc=none smtp.client-ip=192.198.163.13 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="HT5Z+YqQ" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1758857712; x=1790393712; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=FALZN5fZ5Gqh9fvIgLNDNXJiGlP/yWDeY85+GzyStiw=; b=HT5Z+YqQAWFGKKcMHoqizx4CicQlax4Mcfp25waeAmTg6m+JlpJBrskC OFsfjXTBfUIAAroAkuL7+rk4jnAVQWcreoLAAmoivD0ZMEzVzG+1oVamx eLswa+fjc34utcyK3ub5Ro2PqKv1H3e4kSEQdmYaQmSOA8uLKUpOwGXLr qOYBrd5CIm0BiWd50lPlTki484Igx0l7bkpRXA9Wq/vXwq9KjFB7UPvMA 3Mog7j6/96i0wGDVYccvfOzwnUt/h6fFitT+JWAmNRqdOvlFO6FsleKSJ Y/b1axVRj72ogT4mOe2kuRDF1yl1nMTqsaOzI0WozYI85IqjU/frFMRvT w==; X-CSE-ConnectionGUID: en5kAJHsSG2C7bCy4PpFWg== 
From: Kanchana P Sridhar Subject: [PATCH v12 07/23] crypto: iaa - Refactor hardware descriptor setup into separate procedures. Date: Thu, 25 Sep 2025 20:34:46 -0700 Message-Id: <20250926033502.7486-8-kanchana.p.sridhar@intel.com> In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> This patch refactors the code that sets up the "struct iax_hw_desc" for compress/decompress ops into distinct procedures, to make the code more readable. Also, get_iaa_device_compression_mode() is deleted and the compression mode is accessed directly from the iaa_device in the calling procedures.
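With the setup helpers factored out, each submission path reduces to a single call before submit; a condensed view of the resulting call sites (taken from the hunks below, trimmed for illustration):

	/* Compress: the AECS table comes from the device's mode entry. */
	desc = iaa_setup_compress_hw_desc(idxd_desc, src_addr, slen,
					  dst_addr, *dlen, ctx->mode,
					  iaa_device->compression_modes[ctx->mode]);

	/* Decompress: no mode-specific state is needed. */
	desc = iaa_setup_decompress_hw_desc(idxd_desc, src_addr, slen,
					    dst_addr, *dlen);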
Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/iaa_crypto_main.c | 99 ++++++++++++---------- 1 file changed, 56 insertions(+), 43 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index 38b4be0c10b0..c94e7abd3909 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -483,12 +483,6 @@ int add_iaa_compression_mode(const char *name, } EXPORT_SYMBOL_GPL(add_iaa_compression_mode); =20 -static struct iaa_device_compression_mode * -get_iaa_device_compression_mode(struct iaa_device *iaa_device, int idx) -{ - return iaa_device->compression_modes[idx]; -} - static void free_device_compression_mode(struct iaa_device *iaa_device, struct iaa_device_compression_mode *device_mode) { @@ -1564,7 +1558,6 @@ static int iaa_compress_verify(struct crypto_tfm *tfm= , struct acomp_req *req, dma_addr_t src_addr, unsigned int slen, dma_addr_t dst_addr, unsigned int dlen) { - struct iaa_device_compression_mode *active_compression_mode; struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); u32 *compression_crc =3D acomp_request_ctx(req); struct iaa_device *iaa_device; @@ -1583,8 +1576,6 @@ static int iaa_compress_verify(struct crypto_tfm *tfm= , struct acomp_req *req, pdev =3D idxd->pdev; dev =3D &pdev->dev; =20 - active_compression_mode =3D get_iaa_device_compression_mode(iaa_device, c= tx->mode); - while ((idxd_desc =3D=3D ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx= ->alloc_decomp_desc_timeout)) { idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_NONBLOCK); cpu_relax(); @@ -1660,8 +1651,7 @@ static void iaa_desc_complete(struct idxd_desc *idxd_= desc, pdev =3D idxd->pdev; dev =3D &pdev->dev; =20 - active_compression_mode =3D get_iaa_device_compression_mode(iaa_device, - compression_ctx->mode); + active_compression_mode =3D iaa_device->compression_modes[compression_ctx= ->mode]; dev_dbg(dev, "%s: compression mode %s," " ctx->src_addr %llx, ctx->dst_addr %llx\n", __func__, active_compression_mode->name, @@ -1740,12 +1730,63 @@ static void iaa_desc_complete(struct idxd_desc *idx= d_desc, percpu_ref_put(&iaa_wq->ref); } =20 +static __always_inline struct iax_hw_desc * +iaa_setup_compress_hw_desc(struct idxd_desc *idxd_desc, + dma_addr_t src_addr, + unsigned int slen, + dma_addr_t dst_addr, + unsigned int dlen, + enum iaa_mode mode, + struct iaa_device_compression_mode *active_compression_mode) +{ + struct iax_hw_desc *desc =3D idxd_desc->iax_hw; + + desc->flags =3D IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC; + desc->opcode =3D IAX_OPCODE_COMPRESS; + desc->compr_flags =3D IAA_COMP_FLAGS; + desc->priv =3D 0; + + desc->src1_addr =3D (u64)src_addr; + desc->src1_size =3D slen; + desc->dst_addr =3D (u64)dst_addr; + desc->max_dst_size =3D dlen; + desc->flags |=3D IDXD_OP_FLAG_RD_SRC2_AECS; + desc->src2_addr =3D active_compression_mode->aecs_comp_table_dma_addr; + desc->src2_size =3D sizeof(struct aecs_comp_table_record); + desc->completion_addr =3D idxd_desc->compl_dma; + + return desc; +} + +static __always_inline struct iax_hw_desc * +iaa_setup_decompress_hw_desc(struct idxd_desc *idxd_desc, + dma_addr_t src_addr, + unsigned int slen, + dma_addr_t dst_addr, + unsigned int dlen) +{ + struct iax_hw_desc *desc =3D idxd_desc->iax_hw; + + desc->flags =3D IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC; + desc->opcode =3D IAX_OPCODE_DECOMPRESS; + desc->max_dst_size =3D PAGE_SIZE; + desc->decompr_flags =3D IAA_DECOMP_FLAGS; + desc->priv =3D 0; + + desc->src1_addr =3D 
(u64)src_addr; + desc->dst_addr =3D (u64)dst_addr; + desc->max_dst_size =3D dlen; + desc->src1_size =3D slen; + desc->completion_addr =3D idxd_desc->compl_dma; + + return desc; +} + static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req, struct idxd_wq *wq, dma_addr_t src_addr, unsigned int slen, dma_addr_t dst_addr, unsigned int *dlen) { - struct iaa_device_compression_mode *active_compression_mode; struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); u32 *compression_crc =3D acomp_request_ctx(req); struct iaa_device *iaa_device; @@ -1764,8 +1805,6 @@ static int iaa_compress(struct crypto_tfm *tfm, struc= t acomp_req *req, pdev =3D idxd->pdev; dev =3D &pdev->dev; =20 - active_compression_mode =3D get_iaa_device_compression_mode(iaa_device, c= tx->mode); - while ((idxd_desc =3D=3D ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx= ->alloc_comp_desc_timeout)) { idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_NONBLOCK); cpu_relax(); @@ -1776,21 +1815,9 @@ static int iaa_compress(struct crypto_tfm *tfm, stru= ct acomp_req *req, PTR_ERR(idxd_desc)); return -ENODEV; } - desc =3D idxd_desc->iax_hw; =20 - desc->flags =3D IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | - IDXD_OP_FLAG_RD_SRC2_AECS | IDXD_OP_FLAG_CC; - desc->opcode =3D IAX_OPCODE_COMPRESS; - desc->compr_flags =3D IAA_COMP_FLAGS; - desc->priv =3D 0; - - desc->src1_addr =3D (u64)src_addr; - desc->src1_size =3D slen; - desc->dst_addr =3D (u64)dst_addr; - desc->max_dst_size =3D *dlen; - desc->src2_addr =3D active_compression_mode->aecs_comp_table_dma_addr; - desc->src2_size =3D sizeof(struct aecs_comp_table_record); - desc->completion_addr =3D idxd_desc->compl_dma; + desc =3D iaa_setup_compress_hw_desc(idxd_desc, src_addr, slen, dst_addr, = *dlen, + ctx->mode, iaa_device->compression_modes[ctx->mode]); =20 if (likely(!ctx->use_irq)) { ret =3D idxd_submit_desc(wq, idxd_desc); @@ -1852,7 +1879,6 @@ static int iaa_decompress(struct crypto_tfm *tfm, str= uct acomp_req *req, dma_addr_t src_addr, unsigned int slen, dma_addr_t dst_addr, unsigned int *dlen) { - struct iaa_device_compression_mode *active_compression_mode; struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); struct iaa_device *iaa_device; struct idxd_desc *idxd_desc =3D ERR_PTR(-EAGAIN); @@ -1870,8 +1896,6 @@ static int iaa_decompress(struct crypto_tfm *tfm, str= uct acomp_req *req, pdev =3D idxd->pdev; dev =3D &pdev->dev; =20 - active_compression_mode =3D get_iaa_device_compression_mode(iaa_device, c= tx->mode); - while ((idxd_desc =3D=3D ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx= ->alloc_decomp_desc_timeout)) { idxd_desc =3D idxd_alloc_desc(wq, IDXD_OP_NONBLOCK); cpu_relax(); @@ -1884,19 +1908,8 @@ static int iaa_decompress(struct crypto_tfm *tfm, st= ruct acomp_req *req, idxd_desc =3D NULL; goto fallback_software_decomp; } - desc =3D idxd_desc->iax_hw; =20 - desc->flags =3D IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC; - desc->opcode =3D IAX_OPCODE_DECOMPRESS; - desc->max_dst_size =3D PAGE_SIZE; - desc->decompr_flags =3D IAA_DECOMP_FLAGS; - desc->priv =3D 0; - - desc->src1_addr =3D (u64)src_addr; - desc->dst_addr =3D (u64)dst_addr; - desc->max_dst_size =3D *dlen; - desc->src1_size =3D slen; - desc->completion_addr =3D idxd_desc->compl_dma; + desc =3D iaa_setup_decompress_hw_desc(idxd_desc, src_addr, slen, dst_addr= , *dlen); =20 if (likely(!ctx->use_irq)) { ret =3D idxd_submit_desc(wq, idxd_desc); --=20 2.27.0 From nobody Thu Oct 2 00:50:47 2025 Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.13]) (using TLSv1.2 with cipher 
ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id ABF8F26F2BD; Fri, 26 Sep 2025 03:35:13 +0000 (UTC) From: Kanchana P Sridhar Subject: [PATCH v12 08/23] crypto: iaa - Simplified,
efficient job submissions for non-irq mode. Date: Thu, 25 Sep 2025 20:34:47 -0700 Message-Id: <20250926033502.7486-9-kanchana.p.sridhar@intel.com> In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> This patch adds a new procedure, iaa_submit_desc_movdir64b(), that directly calls movdir64b. The core iaa_crypto routines that submit compress and decompress jobs now invoke iaa_submit_desc_movdir64b() in non-irq driver modes, instead of idxd_submit_desc(). idxd_submit_desc() is called only in irq mode. This improves latency for the most common iaa_crypto usage (i.e., async non-irq) in zswap/zram by eliminating redundant computation that would otherwise be incurred in idxd_submit_desc(): p50: -32 ns p99: -1,048 ns Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/iaa_crypto_main.c | 30 ++++++++++++++-------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index c94e7abd3909..cac39b418cf0 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -1782,6 +1782,24 @@ iaa_setup_decompress_hw_desc(struct idxd_= desc *idxd_desc, return desc; } =20 +/* + * Call this for non-irq, non-enqcmds job submissions. + */ +static __always_inline void iaa_submit_desc_movdir64b(struct idxd_wq *wq, + struct idxd_desc *desc) +{ + void __iomem *portal =3D idxd_wq_portal_addr(wq); + + /* + * The wmb() flushes writes to coherent DMA data before + * possibly triggering a DMA read. The wmb() is necessary + * even on UP because the recipient is a device.
+ */ + wmb(); + + iosubmit_cmds512(portal, desc->hw, 1); +} + static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req, struct idxd_wq *wq, dma_addr_t src_addr, unsigned int slen, @@ -1820,11 +1838,7 @@ static int iaa_compress(struct crypto_tfm *tfm, stru= ct acomp_req *req, ctx->mode, iaa_device->compression_modes[ctx->mode]); =20 if (likely(!ctx->use_irq)) { - ret =3D idxd_submit_desc(wq, idxd_desc); - if (ret) { - dev_dbg(dev, "submit_desc failed ret=3D%d\n", ret); - goto out; - } + iaa_submit_desc_movdir64b(wq, idxd_desc); =20 /* Update stats */ update_total_comp_calls(); @@ -1912,11 +1926,7 @@ static int iaa_decompress(struct crypto_tfm *tfm, st= ruct acomp_req *req, desc =3D iaa_setup_decompress_hw_desc(idxd_desc, src_addr, slen, dst_addr= , *dlen); =20 if (likely(!ctx->use_irq)) { - ret =3D idxd_submit_desc(wq, idxd_desc); - if (ret) { - dev_dbg(dev, "submit_desc failed ret=3D%d\n", ret); - goto fallback_software_decomp; - } + iaa_submit_desc_movdir64b(wq, idxd_desc); =20 /* Update stats */ update_total_decomp_calls(); --=20 2.27.0 From nobody Thu Oct 2 00:50:47 2025 Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.13]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B746B26F44D; Fri, 26 Sep 2025 03:35:13 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.13 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857715; cv=none; b=Ku8Gn5yJlxr1HAvkYf+aPyV+x7i00Uy1FHJSPXdeP3uA7sJgqAQxTzX/vUc5BfAzcfLrBAYXnyJAOZZtQ3uHvHD6TtOghjeu91BwYz9snJdirZBi8573b3vitV83czRxoDphLFEUp+vEWqFedaeqUIWcH7kMAAGY/BPjAjtYVJY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857715; c=relaxed/simple; bh=W86GZRwDZO7beezGCFVV/NiDdvS+RJYh/KFbuBLr8uM=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=Tp3Sb9bhDntMzXF7Iyo+TzBKFiq5y9rOqYp78rX7dh1yXxgcxizSTCLzfRaS9sjzoifdPL/tct1TxSOvkgJGxv1R+DIrYzIT3i/6iSIJf8AHBpMjm7sAqppVA+SaL8GIgzFsIMtf5wb+3OGSrNXRr3AX1KIOdZKfdzW3d2lv/LY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BPusNP0j; arc=none smtp.client-ip=192.198.163.13 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BPusNP0j" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1758857714; x=1790393714; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=W86GZRwDZO7beezGCFVV/NiDdvS+RJYh/KFbuBLr8uM=; b=BPusNP0jHuwNoJ+mJCvsOK6nWb+x3Vqg/znBxjcjjamxZmBSNz45Pipn 6Ed0oDmGST7+atoX1PYkE7KsYR7UfOLBsDFpsCfyYtHwv4UHZqyXYzYqP E6ESPCiyorFHYqRF1GqP7jw8rOVv1tdBXcnuQ5JyoWlgy5Po9hlafbsuw 1TBRtyT/kH84jssyJDCKyG1ld336Cpbx21Rzm0OakQYc0RdN6Y+p4fh62 R++8gGW+wpTCpSarsz0JLkGxEHuxSm8MlC23p82lhoE8ad8emh4ou2oH0 t7kAznNnG6G9WOQW2WYq4WAu0MMln+4jDbCsGyfgDv9pM72bxHGTZ1GeP g==; X-CSE-ConnectionGUID: bJcZ0Zm8T7eig6TGfsWxOQ== X-CSE-MsgGUID: Hzc8zmw6R+SkWVkoxvu23A== X-IronPort-AV: E=McAfee;i="6800,10657,11564"; a="63819522" X-IronPort-AV: 
E=Sophos;i="6.18,294,1751266800"; d="scan'208";a="63819522" From: Kanchana P Sridhar Subject: [PATCH v12 09/23] crypto: iaa - Deprecate exporting add/remove IAA compression modes. Date: Thu, 25 Sep 2025 20:34:48 -0700 Message-Id: <20250926033502.7486-10-kanchana.p.sridhar@intel.com> In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> There is no use case right now for kernel users to dynamically add/remove IAA compression modes; hence this commit deletes the symbol exports of add_iaa_compression_mode() and remove_iaa_compression_mode(). The only supported usage model of IAA compression modes is for the code to be statically linked during the iaa_crypto module build, e.g. iaa_crypto_comp_fixed.c, and for available modes to be registered when the first IAA device wq is probed. Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/iaa_crypto_main.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index cac39b418cf0..dd7c4831e092 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -367,10 +367,6 @@ static void free_iaa_compr= ession_mode(struct iaa_compression_mode *mode) * These tables are typically generated and captured using statistics * collected from running actual compress/decompress workloads. * - * A module or other kernel code can add and remove compression modes - * with a given name using the exported @add_iaa_compression_mode() - * and @remove_iaa_compression_mode functions. - * * When a new compression mode is added, the tables are saved in a * global compression mode list.
When IAA devices are added, a * per-IAA device dma mapping is created for each IAA device, for each @@ -404,7 +400,6 @@ void remove_iaa_compression_mode(const char *name) out: mutex_unlock(&iaa_devices_lock); } -EXPORT_SYMBOL_GPL(remove_iaa_compression_mode); =20 /** * add_iaa_compression_mode - Add an IAA compression mode @@ -481,7 +476,6 @@ int add_iaa_compression_mode(const char *name, free_iaa_compression_mode(mode); goto out; } -EXPORT_SYMBOL_GPL(add_iaa_compression_mode); =20 static void free_device_compression_mode(struct iaa_device *iaa_device, struct iaa_device_compression_mode *device_mode) --=20 2.27.0 From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar Subject: [PATCH v12 10/23] crypto: iaa - Expect a single scatterlist for a [de]compress request's src/dst. Date: Thu, 25 Sep 2025 20:34:49 -0700 Message-Id: <20250926033502.7486-11-kanchana.p.sridhar@intel.com> In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> The calls to dma_map_sg() were passing sg_nents() for the @nents parameter, then erroring out if the returned @nr_sgs was greater than 1. Furthermore, there are no use cases for iaa_crypto that allow multiple SG lists to be mapped for DMA at once. Moreover, as per Herbert's direction in [1] for the batching API from higher mm layers to interface with crypto using SG lists, batching within iaa_crypto will rely on there being exactly one SG list per "unit" of [de]compression in a batch, where the component SG lists are obtained by breaking down the @req->src and @req->dst. Given all of the above, this patch simplifies the design by expecting only 1 @nents in req->src and req->dst, which aligns with the current use cases as well as the batching use cases developed in subsequent patches. This avoids the latency penalty of calling sg_nents() per [de]compress op submitted to the hardware. Some unlikely() annotations are added to conditionals in the core [de]compress routines to further improve latency per op.
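The per-request mapping contract after this patch is a single-entry map/unmap pair on each of src and dst; a minimal sketch of the pattern (simplified from the hunks below, with the dev_dbg() reporting elided):

	/* Exactly one SG entry is expected; anything else is an error. */
	nr_sgs = dma_map_sg(dev, req->src, 1, DMA_TO_DEVICE);
	if (unlikely(nr_sgs != 1))
		return -EIO;
	src_addr = sg_dma_address(req->src);
	...
	dma_unmap_sg(dev, req->src, 1, DMA_TO_DEVICE);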
[1]: https://lore.kernel.org/all/aJ7Fk6RpNc815Ivd@gondor.apana.org.au/T/#m9= 9aea2ce3d284e6c5a3253061d97b08c4752a798 Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/iaa_crypto_main.c | 54 +++++++++++----------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index dd7c4831e092..16b071058f2b 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -1514,11 +1514,11 @@ static int iaa_remap_for_verify(struct device *dev,= struct iaa_wq *iaa_wq, int ret =3D 0; int nr_sgs; =20 - dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE); - dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); + dma_unmap_sg(dev, req->dst, 1, DMA_FROM_DEVICE); + dma_unmap_sg(dev, req->src, 1, DMA_TO_DEVICE); =20 - nr_sgs =3D dma_map_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE); - if (nr_sgs <=3D 0 || nr_sgs > 1) { + nr_sgs =3D dma_map_sg(dev, req->src, 1, DMA_FROM_DEVICE); + if (unlikely(nr_sgs <=3D 0 || nr_sgs > 1)) { dev_dbg(dev, "verify: couldn't map src sg for iaa device %d," " wq %d: ret=3D%d\n", iaa_wq->iaa_device->idxd->id, iaa_wq->wq->id, ret); @@ -1530,13 +1530,13 @@ static int iaa_remap_for_verify(struct device *dev,= struct iaa_wq *iaa_wq, " req->slen %d, sg_dma_len(sg) %d\n", *src_addr, nr_sgs, req->src, req->slen, sg_dma_len(req->src)); =20 - nr_sgs =3D dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_TO_DEVICE); - if (nr_sgs <=3D 0 || nr_sgs > 1) { + nr_sgs =3D dma_map_sg(dev, req->dst, 1, DMA_TO_DEVICE); + if (unlikely(nr_sgs <=3D 0 || nr_sgs > 1)) { dev_dbg(dev, "verify: couldn't map dst sg for iaa device %d," " wq %d: ret=3D%d\n", iaa_wq->iaa_device->idxd->id, iaa_wq->wq->id, ret); ret =3D -EIO; - dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE); + dma_unmap_sg(dev, req->src, 1, DMA_FROM_DEVICE); goto out; } *dst_addr =3D sg_dma_address(req->dst); @@ -1704,14 +1704,14 @@ static void iaa_desc_complete(struct idxd_desc *idx= d_desc, err =3D -EIO; } =20 - dma_unmap_sg(dev, ctx->req->dst, sg_nents(ctx->req->dst), DMA_TO_DEVICE); - dma_unmap_sg(dev, ctx->req->src, sg_nents(ctx->req->src), DMA_FROM_DEVIC= E); + dma_unmap_sg(dev, ctx->req->dst, 1, DMA_TO_DEVICE); + dma_unmap_sg(dev, ctx->req->src, 1, DMA_FROM_DEVICE); =20 goto out; } err: - dma_unmap_sg(dev, ctx->req->dst, sg_nents(ctx->req->dst), DMA_FROM_DEVICE= ); - dma_unmap_sg(dev, ctx->req->src, sg_nents(ctx->req->src), DMA_TO_DEVICE); + dma_unmap_sg(dev, ctx->req->dst, 1, DMA_FROM_DEVICE); + dma_unmap_sg(dev, ctx->req->src, 1, DMA_TO_DEVICE); out: if (ret !=3D 0) dev_dbg(dev, "asynchronous compress failed ret=3D%d\n", ret); @@ -2014,8 +2014,8 @@ static int iaa_comp_acompress(struct acomp_req *req) =20 dev =3D &wq->idxd->pdev->dev; =20 - nr_sgs =3D dma_map_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); - if (nr_sgs <=3D 0 || nr_sgs > 1) { + nr_sgs =3D dma_map_sg(dev, req->src, 1, DMA_TO_DEVICE); + if (unlikely(nr_sgs <=3D 0 || nr_sgs > 1)) { dev_dbg(dev, "couldn't map src sg for iaa device %d," " wq %d: ret=3D%d\n", iaa_wq->iaa_device->idxd->id, iaa_wq->wq->id, ret); @@ -2024,8 +2024,8 @@ static int iaa_comp_acompress(struct acomp_req *req) } src_addr =3D sg_dma_address(req->src); =20 - nr_sgs =3D dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE); - if (nr_sgs <=3D 0 || nr_sgs > 1) { + nr_sgs =3D dma_map_sg(dev, req->dst, 1, DMA_FROM_DEVICE); + if (unlikely(nr_sgs <=3D 0 || nr_sgs > 1)) { dev_dbg(dev, "couldn't map 
dst sg for iaa device %d," " wq %d: ret=3D%d\n", iaa_wq->iaa_device->idxd->id, iaa_wq->wq->id, ret); @@ -2051,18 +2051,18 @@ static int iaa_comp_acompress(struct acomp_req *req) if (ret) dev_dbg(dev, "asynchronous compress verification failed ret=3D%d\n", re= t); =20 - dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_TO_DEVICE); - dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE); + dma_unmap_sg(dev, req->dst, 1, DMA_TO_DEVICE); + dma_unmap_sg(dev, req->src, 1, DMA_FROM_DEVICE); =20 goto out; } =20 - if (ret) + if (unlikely(ret)) dev_dbg(dev, "asynchronous compress failed ret=3D%d\n", ret); =20 - dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE); + dma_unmap_sg(dev, req->dst, 1, DMA_FROM_DEVICE); err_map_dst: - dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); + dma_unmap_sg(dev, req->src, 1, DMA_TO_DEVICE); out: percpu_ref_put(&iaa_wq->ref); =20 @@ -2095,8 +2095,8 @@ static int iaa_comp_adecompress(struct acomp_req *req) =20 dev =3D &wq->idxd->pdev->dev; =20 - nr_sgs =3D dma_map_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); - if (nr_sgs <=3D 0 || nr_sgs > 1) { + nr_sgs =3D dma_map_sg(dev, req->src, 1, DMA_TO_DEVICE); + if (unlikely(nr_sgs <=3D 0 || nr_sgs > 1)) { dev_dbg(dev, "couldn't map src sg for iaa device %d," " wq %d: ret=3D%d\n", iaa_wq->iaa_device->idxd->id, iaa_wq->wq->id, ret); @@ -2105,8 +2105,8 @@ static int iaa_comp_adecompress(struct acomp_req *req) } src_addr =3D sg_dma_address(req->src); =20 - nr_sgs =3D dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE); - if (nr_sgs <=3D 0 || nr_sgs > 1) { + nr_sgs =3D dma_map_sg(dev, req->dst, 1, DMA_FROM_DEVICE); + if (unlikely(nr_sgs <=3D 0 || nr_sgs > 1)) { dev_dbg(dev, "couldn't map dst sg for iaa device %d," " wq %d: ret=3D%d\n", iaa_wq->iaa_device->idxd->id, iaa_wq->wq->id, ret); @@ -2120,12 +2120,12 @@ static int iaa_comp_adecompress(struct acomp_req *r= eq) if (ret =3D=3D -EINPROGRESS) return ret; =20 - if (ret !=3D 0) + if (unlikely(ret !=3D 0)) dev_dbg(dev, "asynchronous decompress failed ret=3D%d\n", ret); =20 - dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE); + dma_unmap_sg(dev, req->dst, 1, DMA_FROM_DEVICE); err_map_dst: - dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE); + dma_unmap_sg(dev, req->src, 1, DMA_TO_DEVICE); out: percpu_ref_put(&iaa_wq->ref); =20 --=20 2.27.0 From nobody Thu Oct 2 00:50:47 2025 Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.13]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B76572765CD; Fri, 26 Sep 2025 03:35:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.13 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857719; cv=none; b=iCC5GQuij5fjPfabpC5QgHHLSiLfzouuNf9Lzjs4QzgBaj4Y1oTq03ztsCwCkm8yWGUFGqbJ+pZj8sytfSDkAelJAaPklvkll3N8tpeRoGBbK1ZMO2OiXVfg5Nlx1qqrREy5ICj/HpTOV/H27YoWdQ1lHx0R/L3XmUk5ifxVbF8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1758857719; c=relaxed/simple; bh=4EOscegYPP+9xMNxcAjMzEg+gx+YSF1MAsraAkJlVlI=; h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: MIME-Version; b=devslK4at3u9dNmuJFRNhooS4tzhzz59u7ZAJjg4SCNXwP8sGYyjz0MOdNcwPhTc+Udmla171Jc/dSxcrNGszp+pCo29k9x5vSDKGC4Ci5LYCNMiBG2hZI9KooyaftAoJqnh9jWLhw7Stg52Id8CaXeFAYUyeU4GhGnSEsRhx1I= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) 
header.from=intel.com From: Kanchana P Sridhar Subject: [PATCH v12 11/23] crypto: iaa - Rearchitect the iaa_crypto driver to be usable by zswap and zram. Date: Thu, 25 Sep 2025 20:34:50 -0700 Message-Id: <20250926033502.7486-12-kanchana.p.sridhar@intel.com> In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> This patch rearchitects the iaa_crypto driver to be usable by non-crypto_acomp kernel users such as zram. The crypto_acomp interface is also preserved for use by zswap. The core driver code is moved under a crypto_acomp-agnostic layer that relies only on idxd, DMA and scatterlist. Additionally, this patch resolves a race condition triggered when IAA wqs and devices are continuously disabled/enabled while workloads are using IAA for compression/decompression.
This commit, in combination with patches 0002 ("crypto: iaa - New architecture for IAA device WQ comp/decomp usage & core mapping.") and 0005 ("crypto: iaa - iaa_wq uses percpu_refs for get/put reference counting.") in this series, fixes the race condition. This has been verified by bisection. The newly added include/linux/iaa_comp.h provides the data structures and API for use by non-crypto_acomp kernel code such as zram. This allows kernel users, i.e., zswap and zram, to use IAA's hardware acceleration for compression/decompression with or without crypto_acomp. Towards this goal, most of the driver code has been made independent of crypto_acomp, by introducing a new "struct iaa_req" data structure and lightweight internal translation routines to/from crypto_acomp, namely acomp_to_iaa() and iaa_to_acomp(). The exception is that the driver defines a "static struct crypto_acomp *deflate_crypto_acomp" for the software decompress fallback path. This shouldn't be an issue for zram because it is encapsulated within the iaa_crypto driver. The acomp_alg .compress() and .decompress() interfaces call into iaa_comp_acompress_main() and iaa_comp_adecompress_main(), which are wrappers around the core crypto-independent driver functions. A zram/zcomp backend for iaa_crypto will be submitted as a separate patch series, using these interfaces from iaa_comp.h (see the usage sketch below): int iaa_comp_compress(enum iaa_mode mode, struct iaa_req *req); int iaa_comp_decompress(enum iaa_mode mode, struct iaa_req *req); These iaa_crypto interfaces will continue to be available through crypto_acomp for use in zswap: int crypto_acomp_compress(struct acomp_req *req); int crypto_acomp_decompress(struct acomp_req *req); Some other changes introduced by this commit are: 1) The iaa_crypto symbol namespace is changed from "IDXD" to "CRYPTO_DEV_IAA_CRYPTO". 2) Some constants and data structures are moved to include/linux/iaa_comp.h so as to be usable in developing the zram iaa_crypto backend.
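To make the new non-crypto_acomp entry points concrete, a hypothetical caller in the zram/zcomp style might look as follows (a sketch only: src_sgl, dst_sgl, the lengths and store_compressed() are illustrative placeholders, not taken from an actual backend):

	struct iaa_req req = {};
	int ret;

	/* One single-entry scatterlist each for src and dst (see patch 10). */
	req.src = src_sgl;
	req.dst = dst_sgl;
	req.slen = PAGE_SIZE;	/* bytes to compress */
	req.dlen = PAGE_SIZE;	/* dst capacity in, compressed size out */

	ret = iaa_comp_compress(IAA_MODE_FIXED, &req);
	if (!ret)
		store_compressed(req.dst, req.dlen);	/* req.dlen: compressed length */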
Fixes: ea7a5cbb4369 ("crypto: iaa - Add Intel IAA Compression Accelerator c= rypto driver core") Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/Makefile | 2 +- drivers/crypto/intel/iaa/iaa_crypto.h | 7 +- drivers/crypto/intel/iaa/iaa_crypto_main.c | 373 ++++++++++++++++++--- include/linux/iaa_comp.h | 86 +++++ 4 files changed, 406 insertions(+), 62 deletions(-) create mode 100644 include/linux/iaa_comp.h diff --git a/drivers/crypto/intel/iaa/Makefile b/drivers/crypto/intel/iaa/M= akefile index 55bda7770fac..ebfa1a425f80 100644 --- a/drivers/crypto/intel/iaa/Makefile +++ b/drivers/crypto/intel/iaa/Makefile @@ -3,7 +3,7 @@ # Makefile for IAA crypto device drivers # =20 -ccflags-y +=3D -I $(srctree)/drivers/dma/idxd -DDEFAULT_SYMBOL_NAMESPACE= =3D'"IDXD"' +ccflags-y +=3D -I $(srctree)/drivers/dma/idxd -DDEFAULT_SYMBOL_NAMESPACE= =3D'"CRYPTO_DEV_IAA_CRYPTO"' =20 obj-$(CONFIG_CRYPTO_DEV_IAA_CRYPTO) :=3D iaa_crypto.o =20 diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/i= aa/iaa_crypto.h index 9611f2518f42..190157967e3b 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto.h +++ b/drivers/crypto/intel/iaa/iaa_crypto.h @@ -6,6 +6,7 @@ =20 #include #include +#include #include =20 #define IDXD_SUBDRIVER_NAME "crypto" @@ -29,8 +30,6 @@ #define IAA_ERROR_COMP_BUF_OVERFLOW 0x19 #define IAA_ERROR_WATCHDOG_EXPIRED 0x24 =20 -#define IAA_COMP_MODES_MAX 2 - #define FIXED_HDR 0x2 #define FIXED_HDR_SIZE 3 =20 @@ -138,10 +137,6 @@ int add_iaa_compression_mode(const char *name, =20 void remove_iaa_compression_mode(const char *name); =20 -enum iaa_mode { - IAA_MODE_FIXED, -}; - struct iaa_compression_ctx { enum iaa_mode mode; u16 alloc_comp_desc_timeout; diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index 16b071058f2b..f5abad950371 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -11,6 +11,7 @@ #include #include #include +#include =20 #include "idxd.h" #include "iaa_crypto.h" @@ -51,6 +52,9 @@ static struct wq_table_entry **pkg_global_decomp_wqs; /* All comp wqs from IAAs on a package. */ static struct wq_table_entry **pkg_global_comp_wqs; =20 +/* For software deflate fallback compress/decompress. */ +static struct crypto_acomp *deflate_crypto_acomp; + LIST_HEAD(iaa_devices); DEFINE_MUTEX(iaa_devices_lock); =20 @@ -93,9 +97,18 @@ static atomic_t iaa_crypto_enabled =3D ATOMIC_INIT(0); static struct idxd_wq *first_wq_found; DEFINE_MUTEX(first_wq_found_lock); =20 -static bool iaa_crypto_registered; +const char *iaa_compression_mode_names[IAA_COMP_MODES_MAX] =3D { + "fixed", +}; + +const char *iaa_compression_alg_names[IAA_COMP_MODES_MAX] =3D { + "deflate-iaa", +}; =20 static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_M= AX]; +static struct iaa_compression_ctx *iaa_ctx[IAA_COMP_MODES_MAX]; +static bool iaa_mode_registered[IAA_COMP_MODES_MAX]; +static u8 num_iaa_modes_registered; =20 /* Distribute decompressions across all IAAs on the package. 
*/ static bool iaa_distribute_decomps; @@ -353,6 +366,20 @@ static struct iaa_compression_mode *find_iaa_compressi= on_mode(const char *name, return NULL; } =20 +static bool iaa_alg_is_registered(const char *name, int *idx) +{ + int i; + + for (i =3D 0; i < IAA_COMP_MODES_MAX; ++i) { + if (!strcmp(name, iaa_compression_alg_names[i]) && iaa_mode_registered[i= ]) { + *idx =3D i; + return true; + } + } + + return false; +} + static void free_iaa_compression_mode(struct iaa_compression_mode *mode) { kfree(mode->name); @@ -466,6 +493,7 @@ int add_iaa_compression_mode(const char *name, mode->name, idx); =20 iaa_compression_modes[idx] =3D mode; + ++num_iaa_modes_registered; =20 ret =3D 0; out: @@ -1434,11 +1462,15 @@ static struct idxd_wq *comp_wq_table_next_wq(int cp= u) * Core iaa_crypto compress/decompress functions. *************************************************/ =20 -static int deflate_generic_decompress(struct acomp_req *req) +static int deflate_generic_decompress(struct iaa_req *req) { - ACOMP_FBREQ_ON_STACK(fbreq, req); + ACOMP_REQUEST_ON_STACK(fbreq, deflate_crypto_acomp); int ret; =20 + acomp_request_set_callback(fbreq, 0, NULL, NULL); + acomp_request_set_params(fbreq, req->src, req->dst, req->slen, + PAGE_SIZE); + ret =3D crypto_acomp_decompress(fbreq); req->dlen =3D fbreq->dlen; =20 @@ -1447,6 +1479,25 @@ static int deflate_generic_decompress(struct acomp_r= eq *req) return ret; } =20 +static __always_inline void acomp_to_iaa(struct acomp_req *areq, + struct iaa_req *req, + struct iaa_compression_ctx *ctx) +{ + req->src =3D areq->src; + req->dst =3D areq->dst; + req->slen =3D areq->slen; + req->dlen =3D areq->dlen; + req->flags =3D areq->base.flags; + if (unlikely(ctx->use_irq)) + req->drv_data =3D areq; +} + +static __always_inline void iaa_to_acomp(int dlen, struct acomp_req *areq) +{ + areq->dst->length =3D dlen; + areq->dlen =3D dlen; +} + static inline int check_completion(struct device *dev, struct iax_completion_record *comp, bool compress, @@ -1508,7 +1559,7 @@ static inline int check_completion(struct device *dev, } =20 static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq, - struct acomp_req *req, + struct iaa_req *req, dma_addr_t *src_addr, dma_addr_t *dst_addr) { int ret =3D 0; @@ -1547,13 +1598,11 @@ static int iaa_remap_for_verify(struct device *dev,= struct iaa_wq *iaa_wq, return ret; } =20 -static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *r= eq, +static int iaa_compress_verify(struct iaa_compression_ctx *ctx, struct iaa= _req *req, struct idxd_wq *wq, dma_addr_t src_addr, unsigned int slen, dma_addr_t dst_addr, unsigned int dlen) { - struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); - u32 *compression_crc =3D acomp_request_ctx(req); struct iaa_device *iaa_device; struct idxd_desc *idxd_desc =3D ERR_PTR(-EAGAIN); u16 alloc_desc_retries =3D 0; @@ -1606,10 +1655,10 @@ static int iaa_compress_verify(struct crypto_tfm *t= fm, struct acomp_req *req, goto err; } =20 - if (*compression_crc !=3D idxd_desc->iax_completion->crc) { + if (req->compression_crc !=3D idxd_desc->iax_completion->crc) { ret =3D -EINVAL; dev_dbg(dev, "(verify) iaa comp/decomp crc mismatch:" - " comp=3D0x%x, decomp=3D0x%x\n", *compression_crc, + " comp=3D0x%x, decomp=3D0x%x\n", req->compression_crc, idxd_desc->iax_completion->crc); print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET, 8, 1, idxd_desc->iax_completion, 64, 0); @@ -1635,6 +1684,7 @@ static void iaa_desc_complete(struct idxd_desc *idxd_= desc, struct iaa_wq *iaa_wq; struct pci_dev 
*pdev; struct device *dev; + struct iaa_req req; int ret, err =3D 0; =20 compression_ctx =3D crypto_tfm_ctx(ctx->tfm); @@ -1660,12 +1710,18 @@ static void iaa_desc_complete(struct idxd_desc *idx= d_desc, pr_warn("%s: falling back to deflate-generic decompress, " "analytics error code %x\n", __func__, idxd_desc->iax_completion->error_code); - ret =3D deflate_generic_decompress(ctx->req); + + acomp_to_iaa(ctx->req, &req, compression_ctx); + ret =3D deflate_generic_decompress(&req); + iaa_to_acomp(req.dlen, ctx->req); + if (ret) { dev_dbg(dev, "%s: deflate-generic failed ret=3D%d\n", __func__, ret); err =3D -EIO; goto err; + } else { + goto verify; } } else { err =3D -EIO; @@ -1684,21 +1740,26 @@ static void iaa_desc_complete(struct idxd_desc *idx= d_desc, update_wq_decomp_bytes(iaa_wq->wq, ctx->req->slen); } =20 +verify: if (ctx->compress && compression_ctx->verify_compress) { - u32 *compression_crc =3D acomp_request_ctx(ctx->req); dma_addr_t src_addr, dst_addr; =20 - *compression_crc =3D idxd_desc->iax_completion->crc; + acomp_to_iaa(ctx->req, &req, compression_ctx); + req.compression_crc =3D idxd_desc->iax_completion->crc; + + ret =3D iaa_remap_for_verify(dev, iaa_wq, &req, &src_addr, &dst_addr); + iaa_to_acomp(req.dlen, ctx->req); =20 - ret =3D iaa_remap_for_verify(dev, iaa_wq, ctx->req, &src_addr, &dst_addr= ); if (ret) { dev_dbg(dev, "%s: compress verify remap failed ret=3D%d\n", __func__, r= et); err =3D -EIO; goto out; } =20 - ret =3D iaa_compress_verify(ctx->tfm, ctx->req, iaa_wq->wq, src_addr, + ret =3D iaa_compress_verify(compression_ctx, &req, iaa_wq->wq, src_addr, ctx->req->slen, dst_addr, ctx->req->dlen); + iaa_to_acomp(req.dlen, ctx->req); + if (ret) { dev_dbg(dev, "%s: compress verify failed ret=3D%d\n", __func__, ret); err =3D -EIO; @@ -1724,7 +1785,7 @@ static void iaa_desc_complete(struct idxd_desc *idxd_= desc, percpu_ref_put(&iaa_wq->ref); } =20 -static __always_inline struct iax_hw_desc * +static struct iax_hw_desc * iaa_setup_compress_hw_desc(struct idxd_desc *idxd_desc, dma_addr_t src_addr, unsigned int slen, @@ -1752,7 +1813,7 @@ iaa_setup_compress_hw_desc(struct idxd_desc *idxd_des= c, return desc; } =20 -static __always_inline struct iax_hw_desc * +static struct iax_hw_desc * iaa_setup_decompress_hw_desc(struct idxd_desc *idxd_desc, dma_addr_t src_addr, unsigned int slen, @@ -1794,13 +1855,11 @@ static __always_inline void iaa_submit_desc_movdir6= 4b(struct idxd_wq *wq, iosubmit_cmds512(portal, desc->hw, 1); } =20 -static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req, +static int iaa_compress(struct iaa_compression_ctx *ctx, struct iaa_req *r= eq, struct idxd_wq *wq, dma_addr_t src_addr, unsigned int slen, dma_addr_t dst_addr, unsigned int *dlen) { - struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); - u32 *compression_crc =3D acomp_request_ctx(req); struct iaa_device *iaa_device; struct idxd_desc *idxd_desc =3D ERR_PTR(-EAGAIN); u16 alloc_desc_retries =3D 0; @@ -1848,17 +1907,18 @@ static int iaa_compress(struct crypto_tfm *tfm, str= uct acomp_req *req, } =20 *dlen =3D idxd_desc->iax_completion->output_size; + req->compression_crc =3D idxd_desc->iax_completion->crc; =20 /* Update stats */ update_total_comp_bytes_out(*dlen); update_wq_comp_bytes(wq, *dlen); - - *compression_crc =3D idxd_desc->iax_completion->crc; } else { + struct acomp_req *areq =3D req->drv_data; + desc->flags |=3D IDXD_OP_FLAG_RCI; =20 - idxd_desc->crypto.req =3D req; - idxd_desc->crypto.tfm =3D tfm; + idxd_desc->crypto.req =3D areq; + idxd_desc->crypto.tfm =3D 
areq->base.tfm; idxd_desc->crypto.src_addr =3D src_addr; idxd_desc->crypto.dst_addr =3D dst_addr; idxd_desc->crypto.compress =3D true; @@ -1882,12 +1942,11 @@ static int iaa_compress(struct crypto_tfm *tfm, str= uct acomp_req *req, return ret; } =20 -static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req, +static int iaa_decompress(struct iaa_compression_ctx *ctx, struct iaa_req = *req, struct idxd_wq *wq, dma_addr_t src_addr, unsigned int slen, dma_addr_t dst_addr, unsigned int *dlen) { - struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); struct iaa_device *iaa_device; struct idxd_desc *idxd_desc =3D ERR_PTR(-EAGAIN); u16 alloc_desc_retries =3D 0; @@ -1931,10 +1990,12 @@ static int iaa_decompress(struct crypto_tfm *tfm, s= truct acomp_req *req, =20 ret =3D check_completion(dev, idxd_desc->iax_completion, false, false); } else { + struct acomp_req *areq =3D req->drv_data; + desc->flags |=3D IDXD_OP_FLAG_RCI; =20 - idxd_desc->crypto.req =3D req; - idxd_desc->crypto.tfm =3D tfm; + idxd_desc->crypto.req =3D areq; + idxd_desc->crypto.tfm =3D areq->base.tfm; idxd_desc->crypto.src_addr =3D src_addr; idxd_desc->crypto.dst_addr =3D dst_addr; idxd_desc->crypto.compress =3D false; @@ -1985,20 +2046,16 @@ static int iaa_decompress(struct crypto_tfm *tfm, s= truct acomp_req *req, return ret; } =20 -static int iaa_comp_acompress(struct acomp_req *req) +static int iaa_comp_acompress(struct iaa_compression_ctx *ctx, struct iaa_= req *req) { - struct iaa_compression_ctx *compression_ctx; - struct crypto_tfm *tfm =3D req->base.tfm; dma_addr_t src_addr, dst_addr; int nr_sgs, cpu, ret =3D 0; struct iaa_wq *iaa_wq; struct idxd_wq *wq; struct device *dev; =20 - compression_ctx =3D crypto_tfm_ctx(tfm); - - if (!req->src || !req->slen) { - pr_debug("invalid src, not compressing\n"); + if (!req->src || !req->slen || !req->dst) { + pr_debug("invalid src/dst, not compressing\n"); return -EINVAL; } =20 @@ -2034,19 +2091,19 @@ static int iaa_comp_acompress(struct acomp_req *req) } dst_addr =3D sg_dma_address(req->dst); =20 - ret =3D iaa_compress(tfm, req, wq, src_addr, req->slen, dst_addr, + ret =3D iaa_compress(ctx, req, wq, src_addr, req->slen, dst_addr, &req->dlen); if (ret =3D=3D -EINPROGRESS) return ret; =20 - if (!ret && compression_ctx->verify_compress) { + if (!ret && ctx->verify_compress) { ret =3D iaa_remap_for_verify(dev, iaa_wq, req, &src_addr, &dst_addr); if (ret) { dev_dbg(dev, "%s: compress verify remap failed ret=3D%d\n", __func__, r= et); goto out; } =20 - ret =3D iaa_compress_verify(tfm, req, wq, src_addr, req->slen, + ret =3D iaa_compress_verify(ctx, req, wq, src_addr, req->slen, dst_addr, req->dlen); if (ret) dev_dbg(dev, "asynchronous compress verification failed ret=3D%d\n", re= t); @@ -2069,9 +2126,8 @@ static int iaa_comp_acompress(struct acomp_req *req) return ret; } =20 -static int iaa_comp_adecompress(struct acomp_req *req) +static int iaa_comp_adecompress(struct iaa_compression_ctx *ctx, struct ia= a_req *req) { - struct crypto_tfm *tfm =3D req->base.tfm; dma_addr_t src_addr, dst_addr; int nr_sgs, cpu, ret =3D 0; struct iaa_wq *iaa_wq; @@ -2115,7 +2171,7 @@ static int iaa_comp_adecompress(struct acomp_req *req) } dst_addr =3D sg_dma_address(req->dst); =20 - ret =3D iaa_decompress(tfm, req, wq, src_addr, req->slen, + ret =3D iaa_decompress(ctx, req, wq, src_addr, req->slen, dst_addr, &req->dlen); if (ret =3D=3D -EINPROGRESS) return ret; @@ -2132,8 +2188,9 @@ static int iaa_comp_adecompress(struct acomp_req *req) return ret; } =20 -static void 
compression_ctx_init(struct iaa_compression_ctx *ctx) +static void compression_ctx_init(struct iaa_compression_ctx *ctx, enum iaa= _mode mode) { + ctx->mode =3D mode; ctx->alloc_comp_desc_timeout =3D IAA_ALLOC_DESC_COMP_TIMEOUT; ctx->alloc_decomp_desc_timeout =3D IAA_ALLOC_DESC_DECOMP_TIMEOUT; ctx->verify_compress =3D iaa_verify_compress; @@ -2141,26 +2198,164 @@ static void compression_ctx_init(struct iaa_compre= ssion_ctx *ctx) ctx->use_irq =3D use_irq; } =20 +static __always_inline bool iaa_compressor_enabled(void) +{ + return (atomic_read(&iaa_crypto_enabled) && num_iaa_modes_registered); +} + +static __always_inline enum iaa_mode iaa_compressor_is_registered(const ch= ar *compressor_name) +{ + u8 i; + + if (!atomic_read(&iaa_crypto_enabled) || !num_iaa_modes_registered) + return IAA_MODE_NONE; + + for (i =3D 0; i < IAA_COMP_MODES_MAX; ++i) { + if (iaa_mode_registered[i] && + !strcmp(iaa_compression_alg_names[i], compressor_name)) + return (enum iaa_mode)i; + } + + return IAA_MODE_NONE; +} + +/*********************************************************** + * Interfaces for non-crypto_acomp kernel users, e.g. zram. + ***********************************************************/ + +__always_inline bool iaa_comp_enabled(void) +{ + return iaa_compressor_enabled(); +} +EXPORT_SYMBOL_GPL(iaa_comp_enabled); + +__always_inline enum iaa_mode iaa_comp_get_compressor_mode(const char *com= pressor_name) +{ + return iaa_compressor_is_registered(compressor_name); +} +EXPORT_SYMBOL_GPL(iaa_comp_get_compressor_mode); + +__always_inline bool iaa_comp_mode_is_registered(enum iaa_mode mode) +{ + return iaa_mode_registered[mode]; +} +EXPORT_SYMBOL_GPL(iaa_comp_mode_is_registered); + +void iaa_comp_put_modes(char **iaa_mode_names, enum iaa_mode *iaa_modes, u= 8 nr_modes) +{ + u8 i; + + if (iaa_mode_names) { + for (i =3D 0; i < nr_modes; ++i) + kfree(iaa_mode_names[i]); + kfree(iaa_mode_names); + } + + kfree(iaa_modes); +} +EXPORT_SYMBOL_GPL(iaa_comp_put_modes); + +u8 iaa_comp_get_modes(char **iaa_mode_names, enum iaa_mode *iaa_modes) +{ + u8 i, nr_modes =3D 0; + + if (!atomic_read(&iaa_crypto_enabled) || !num_iaa_modes_registered) + return 0; + + iaa_mode_names =3D kcalloc(num_iaa_modes_registered, sizeof(char *), GFP_= KERNEL); + if (!iaa_mode_names) + goto err; + + iaa_modes =3D kcalloc(num_iaa_modes_registered, sizeof(enum iaa_mode), GF= P_KERNEL); + if (!iaa_modes) + goto err; + + for (i =3D 0; i < IAA_COMP_MODES_MAX; ++i) { + if (iaa_mode_registered[i]) { + iaa_mode_names[nr_modes] =3D kzalloc(sizeof(char) * 30, GFP_KERNEL); + if (!iaa_mode_names[nr_modes]) + goto err; + strscpy(iaa_mode_names[nr_modes], iaa_compression_alg_names[i], + sizeof(iaa_mode_names[nr_modes])); + iaa_modes[nr_modes] =3D (enum iaa_mode)nr_modes; + ++nr_modes; + } + } + + return nr_modes; + +err: + iaa_comp_put_modes(iaa_mode_names, iaa_modes, num_iaa_modes_registered); + return 0; +} +EXPORT_SYMBOL_GPL(iaa_comp_get_modes); + +__always_inline int iaa_comp_compress(enum iaa_mode mode, struct iaa_req *= req) +{ + return iaa_comp_acompress(iaa_ctx[mode], req); +} +EXPORT_SYMBOL_GPL(iaa_comp_compress); + +__always_inline int iaa_comp_decompress(enum iaa_mode mode, struct iaa_req= *req) +{ + return iaa_comp_adecompress(iaa_ctx[mode], req); +} +EXPORT_SYMBOL_GPL(iaa_comp_decompress); + /********************************************* * Interfaces to crypto_alg and crypto_acomp. 
*********************************************/ =20 +static int iaa_comp_acompress_main(struct acomp_req *areq) +{ + struct crypto_tfm *tfm =3D areq->base.tfm; + struct iaa_compression_ctx *ctx; + struct iaa_req req; + int ret =3D -ENODEV, idx; + + if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) { + ctx =3D iaa_ctx[idx]; + + acomp_to_iaa(areq, &req, ctx); + ret =3D iaa_comp_acompress(ctx, &req); + iaa_to_acomp(unlikely(ret) ? ret : req.dlen, areq); + } + + return ret; +} + +static int iaa_comp_adecompress_main(struct acomp_req *areq) +{ + struct crypto_tfm *tfm =3D areq->base.tfm; + struct iaa_compression_ctx *ctx; + struct iaa_req req; + int ret =3D -ENODEV, idx; + + if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) { + ctx =3D iaa_ctx[idx]; + + acomp_to_iaa(areq, &req, ctx); + ret =3D iaa_comp_adecompress(ctx, &req); + iaa_to_acomp(unlikely(ret) ? ret : req.dlen, areq); + } + + return ret; +} + static int iaa_comp_init_fixed(struct crypto_acomp *acomp_tfm) { struct crypto_tfm *tfm =3D crypto_acomp_tfm(acomp_tfm); struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); =20 - ctx->mode =3D IAA_MODE_FIXED; - - compression_ctx_init(ctx); + ctx =3D iaa_ctx[IAA_MODE_FIXED]; =20 return 0; } =20 static struct acomp_alg iaa_acomp_fixed_deflate =3D { .init =3D iaa_comp_init_fixed, - .compress =3D iaa_comp_acompress, - .decompress =3D iaa_comp_adecompress, + .compress =3D iaa_comp_acompress_main, + .decompress =3D iaa_comp_adecompress_main, .base =3D { .cra_name =3D "deflate", .cra_driver_name =3D "deflate-iaa", @@ -2172,29 +2367,89 @@ static struct acomp_alg iaa_acomp_fixed_deflate =3D= { } }; =20 +/******************************************* + * Implement idxd_device_driver interfaces. + *******************************************/ + +static void iaa_unregister_compression_device(void) +{ + unsigned int i; + + atomic_set(&iaa_crypto_enabled, 0); + + for (i =3D 0; i < IAA_COMP_MODES_MAX; ++i) { + iaa_mode_registered[i] =3D false; + kfree(iaa_ctx[i]); + iaa_ctx[i] =3D NULL; + } + + num_iaa_modes_registered =3D 0; +} + static int iaa_register_compression_device(void) { - int ret; + struct iaa_compression_mode *mode; + int i, idx; + + for (i =3D 0; i < IAA_COMP_MODES_MAX; ++i) { + iaa_mode_registered[i] =3D false; + mode =3D find_iaa_compression_mode(iaa_compression_mode_names[i], &idx); + if (mode) { + iaa_ctx[i] =3D kmalloc(sizeof(struct iaa_compression_ctx), GFP_KERNEL); + if (!iaa_ctx[i]) + goto err; + + compression_ctx_init(iaa_ctx[i], (enum iaa_mode)i); + iaa_mode_registered[i] =3D true; + } + } + + BUG_ON(!iaa_mode_registered[IAA_MODE_FIXED]); + return 0; + +err: + iaa_unregister_compression_device(); + return -ENODEV; +} + +static int iaa_register_acomp_compression_device(void) +{ + int ret =3D -ENOMEM; + + deflate_crypto_acomp =3D crypto_alloc_acomp("deflate", 0, 0); + if (IS_ERR_OR_NULL(deflate_crypto_acomp)) + goto err_deflate_acomp; =20 ret =3D crypto_register_acomp(&iaa_acomp_fixed_deflate); if (ret) { pr_err("deflate algorithm acomp fixed registration failed (%d)\n", ret); - goto out; + goto err_fixed; } =20 - iaa_crypto_registered =3D true; -out: + return 0; + +err_fixed: + if (!IS_ERR_OR_NULL(deflate_crypto_acomp)) { + crypto_free_acomp(deflate_crypto_acomp); + deflate_crypto_acomp =3D NULL; + } + +err_deflate_acomp: + iaa_unregister_compression_device(); return ret; } =20 -static int iaa_unregister_compression_device(void) +static void iaa_unregister_acomp_compression_device(void) { atomic_set(&iaa_crypto_enabled, 0); =20 - if (iaa_crypto_registered) 
+ if (iaa_mode_registered[IAA_MODE_FIXED]) crypto_unregister_acomp(&iaa_acomp_fixed_deflate); =20 - return 0; + if (!IS_ERR_OR_NULL(deflate_crypto_acomp)) { + crypto_free_acomp(deflate_crypto_acomp); + deflate_crypto_acomp =3D NULL; + } } =20 static int iaa_crypto_probe(struct idxd_dev *idxd_dev) @@ -2264,6 +2519,12 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_de= v) goto err_register; } =20 + ret =3D iaa_register_acomp_compression_device(); + if (ret !=3D 0) { + dev_dbg(dev, "IAA compression device acomp registration failed\n"); + goto err_register; + } + if (!rebalance_wq_table()) { dev_dbg(dev, "%s: Rerun after registration: IAA rebalancing device wq t= ables failed\n", __func__); goto err_register; @@ -2340,6 +2601,8 @@ static void iaa_crypto_remove(struct idxd_dev *idxd_d= ev) pkg_global_wqs_dealloc(); free_wq_tables(); BUG_ON(!list_empty(&iaa_devices)); + iaa_unregister_acomp_compression_device(); + iaa_unregister_compression_device(); INIT_LIST_HEAD(&iaa_devices); module_put(THIS_MODULE); =20 @@ -2456,8 +2719,8 @@ static int __init iaa_crypto_init_module(void) =20 static void __exit iaa_crypto_cleanup_module(void) { - if (iaa_unregister_compression_device()) - pr_debug("IAA compression device unregister failed\n"); + iaa_unregister_acomp_compression_device(); + iaa_unregister_compression_device(); =20 iaa_crypto_debugfs_cleanup(); driver_remove_file(&iaa_crypto_driver.drv, diff --git a/include/linux/iaa_comp.h b/include/linux/iaa_comp.h new file mode 100644 index 000000000000..ec061315f477 --- /dev/null +++ b/include/linux/iaa_comp.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2021 Intel Corporation. All rights rsvd. */ + +#ifndef __IAA_COMP_H__ +#define __IAA_COMP_H__ + +#if IS_ENABLED(CONFIG_CRYPTO_DEV_IAA_CRYPTO) + +#include + +#define IAA_COMP_MODES_MAX IAA_MODE_NONE + +enum iaa_mode { + IAA_MODE_FIXED =3D 0, + IAA_MODE_NONE =3D 1, +}; + +struct iaa_req { + struct scatterlist *src; + struct scatterlist *dst; + unsigned int slen; + unsigned int dlen; + u32 flags; + u32 compression_crc; + void *drv_data; /* for driver internal use */ +}; + +extern bool iaa_comp_enabled(void); + +extern enum iaa_mode iaa_comp_get_compressor_mode(const char *compressor_n= ame); + +extern bool iaa_comp_mode_is_registered(enum iaa_mode mode); + +extern u8 iaa_comp_get_modes(char **iaa_mode_names, enum iaa_mode *iaa_mod= es); + +extern void iaa_comp_put_modes(char **iaa_mode_names, enum iaa_mode *iaa_m= odes, u8 nr_modes); + +extern int iaa_comp_compress(enum iaa_mode mode, struct iaa_req *req); + +extern int iaa_comp_decompress(enum iaa_mode mode, struct iaa_req *req); + +#else /* CONFIG_CRYPTO_DEV_IAA_CRYPTO */ + +enum iaa_mode { + IAA_MODE_NONE =3D 1, +}; + +struct iaa_req {}; + +static inline bool iaa_comp_enabled(void) +{ + return false; +} + +static inline enum iaa_mode iaa_comp_get_compressor_mode(const char *compr= essor_name) +{ + return IAA_MODE_NONE; +} + +static inline bool iaa_comp_mode_is_registered(enum iaa_mode mode) +{ + return false; +} + +static inline u8 iaa_comp_get_modes(char **iaa_mode_names, enum iaa_mode *= iaa_modes) +{ + return 0; +} + +static inline void iaa_comp_put_modes(char **iaa_mode_names, enum iaa_mode= *iaa_modes, u8 nr_modes) +{ +} + +static inline int iaa_comp_compress(enum iaa_mode mode, struct iaa_req *re= q) +{ + return -EINVAL; +} + +static inline int iaa_comp_decompress(enum iaa_mode mode, struct iaa_req *= req) +{ + return -EINVAL; +} + +#endif /* CONFIG_CRYPTO_DEV_IAA_CRYPTO */ + +#endif --=20 2.27.0 From nobody Thu Oct 
2 00:50:47 2025 From: Kanchana P Sridhar To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, hannes@cmpxchg.org, yosry.ahmed@linux.dev, nphamcs@gmail.com, chengming.zhou@linux.dev, usamaarif642@gmail.com, ryan.roberts@arm.com, 21cnbao@gmail.com, ying.huang@linux.alibaba.com, akpm@linux-foundation.org, senozhatsky@chromium.org, sj@kernel.org, kasong@tencent.com, linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au, davem@davemloft.net, clabbe@baylibre.com, ardb@kernel.org, ebiggers@google.com, surenb@google.com, kristen.c.accardi@intel.com, vinicius.gomes@intel.com Cc: wajdi.k.feghali@intel.com, 
vinodh.gopal@intel.com, kanchana.p.sridhar@intel.com Subject: [PATCH v12 12/23] crypto: iaa - Enablers for submitting descriptors then polling for completion. Date: Thu, 25 Sep 2025 20:34:51 -0700 Message-Id: <20250926033502.7486-13-kanchana.p.sridhar@intel.com> X-Mailer: git-send-email 2.27.0 In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" This patch adds the ability for kernel users of the IAA driver to compress/decompress multiple jobs in parallel using IAA hardware acceleration, without the use of interrupts. Instead, this is accomplished using an async "submit-poll" mechanism. To achieve this, we break down a compress/decompress job into two separate activities if the driver is configured for non-irq async mode: 1) Submit a descriptor after caching the "idxd_desc" descriptor in req->drv_data, and return -EINPROGRESS. 2) Poll: Given a request, retrieve the descriptor and poll its completion status for success/error. This is enabled by the following additions in the driver: 1) The idxd_desc is cached in the "drv_data" member of "struct iaa_req". 2) IAA_REQ_POLL_FLAG: if set in the iaa_req's flags, this tells the driver that it should submit the descriptor and return -EINPROGRESS. If not set, the driver will proceed to call check_completion() in fully synchronous mode, until the hardware returns a completion status. 3) iaa_comp_poll() procedure: This routine is intended to be called after submission returns -EINPROGRESS. It will check the completion status once, and return -EAGAIN if the job has not completed. If the job has completed, it will return the completion status. The purpose of this commit is to allow kernel users of iaa_crypto, such as zswap, to invoke the crypto_acomp_compress() API in fully synchronous mode for sequential/non-batching use cases (i.e. today's status quo), wherein zswap calls: crypto_wait_req(crypto_acomp_compress(req), wait); and to non-intrusively invoke the fully asynchronous batch compress/decompress functionality that will be introduced in subsequent patches. Both use cases need to reuse the same code paths in the driver to interface with hardware: the IAA_REQ_POLL_FLAG allows this shared code to determine whether an iaa_req should be processed synchronously or asynchronously. The idea is to simplify iaa_crypto's sequential/batching interfaces for use by zswap and zram. Thus, regardless of the iaa_crypto driver's 'sync_mode' setting, it can still be forced to use synchronous mode by *not setting* the IAA_REQ_POLL_FLAG in iaa_req->flags: this is the default, and supports sequential use cases in zswap today. When IAA batching functionality is introduced subsequently, it will set the IAA_REQ_POLL_FLAG for the requests in a batch. We will submit the descriptors for each request in the batch in iaa_[de]compress(), and return -EINPROGRESS. The hardware begins processing each request as soon as it is submitted, so essentially all compress/decompress jobs in a batch are parallelized. The polling function, "iaa_comp_poll()", will retrieve the descriptor from each iaa_req->drv_data to check its completion status. 
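Conceptually, the driver-internal submit-poll flow looks like the following sketch (a simplification of the batching loop added later in this series; variable names and error handling are abbreviated):

	/* Submit each request without waiting for its completion. */
	for (i = 0; i < n; i++) {
		reqs[i]->flags |= IAA_REQ_POLL_FLAG;
		errors[i] = iaa_comp_acompress(ctx, reqs[i]); /* -EINPROGRESS */
	}

	/* Poll all in-flight requests until every one has completed. */
	do {
		done = true;
		for (i = 0; i < n; i++) {
			if (errors[i] != -EINPROGRESS && errors[i] != -EAGAIN)
				continue; /* finished, success or error */
			errors[i] = iaa_comp_poll(ctx, reqs[i]);
			if (errors[i] == -EAGAIN)
				done = false; /* still pending in hardware */
		}
	} while (!done);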
This enables the iaa_crypto driver to implement true async "submit-polling" for parallel compressions and decompressions in the IAA hardware accelerator. To summarize, both these conditions need to be met for a request to be processed in fully async submit-poll mode: 1) use_irq should be "false" 2) iaa_req->flags & IAA_REQ_POLL_FLAG should be "true" Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/iaa_crypto.h | 6 ++ drivers/crypto/intel/iaa/iaa_crypto_main.c | 71 +++++++++++++++++++++- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/i= aa/iaa_crypto.h index 190157967e3b..1cc383c94fb8 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto.h +++ b/drivers/crypto/intel/iaa/iaa_crypto.h @@ -41,6 +41,12 @@ IAA_DECOMP_CHECK_FOR_EOB | \ IAA_DECOMP_STOP_ON_EOB) =20 +/* + * If set, the driver must have a way to submit the req, then + * poll its completion status for success/error. + */ +#define IAA_REQ_POLL_FLAG 0x00000002 + /* Representation of IAA workqueue */ struct iaa_wq { struct list_head list; diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index f5abad950371..7395822430b1 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -1891,13 +1891,14 @@ static int iaa_compress(struct iaa_compression_ctx = *ctx, struct iaa_req *req, ctx->mode, iaa_device->compression_modes[ctx->mode]); =20 if (likely(!ctx->use_irq)) { + req->drv_data =3D idxd_desc; iaa_submit_desc_movdir64b(wq, idxd_desc); =20 /* Update stats */ update_total_comp_calls(); update_wq_comp_calls(wq); =20 - if (ctx->async_mode) + if (req->flags & IAA_REQ_POLL_FLAG) return -EINPROGRESS; =20 ret =3D check_completion(dev, idxd_desc->iax_completion, true, false); @@ -1979,13 +1980,14 @@ static int iaa_decompress(struct iaa_compression_ct= x *ctx, struct iaa_req *req, desc =3D iaa_setup_decompress_hw_desc(idxd_desc, src_addr, slen, dst_addr= , *dlen); =20 if (likely(!ctx->use_irq)) { + req->drv_data =3D idxd_desc; iaa_submit_desc_movdir64b(wq, idxd_desc); =20 /* Update stats */ update_total_decomp_calls(); update_wq_decomp_calls(wq); =20 - if (ctx->async_mode) + if (req->flags & IAA_REQ_POLL_FLAG) return -EINPROGRESS; =20 ret =3D check_completion(dev, idxd_desc->iax_completion, false, false); @@ -2188,6 +2190,71 @@ static int iaa_comp_adecompress(struct iaa_compressi= on_ctx *ctx, struct iaa_req return ret; } =20 +static int __maybe_unused iaa_comp_poll(struct iaa_compression_ctx *ctx, s= truct iaa_req *req) +{ + struct idxd_desc *idxd_desc; + struct idxd_device *idxd; + struct iaa_wq *iaa_wq; + struct pci_dev *pdev; + struct device *dev; + struct idxd_wq *wq; + bool compress_op; + int ret; + + idxd_desc =3D req->drv_data; + if (!idxd_desc) + return -EAGAIN; + + compress_op =3D (idxd_desc->iax_hw->opcode =3D=3D IAX_OPCODE_COMPRESS); + wq =3D idxd_desc->wq; + iaa_wq =3D idxd_wq_get_private(wq); + idxd =3D iaa_wq->iaa_device->idxd; + pdev =3D idxd->pdev; + dev =3D &pdev->dev; + + ret =3D check_completion(dev, idxd_desc->iax_completion, compress_op, tru= e); + if (ret =3D=3D -EAGAIN) + return ret; + if (ret) + goto out; + + req->dlen =3D idxd_desc->iax_completion->output_size; + + /* Update stats */ + if (compress_op) { + update_total_comp_bytes_out(req->dlen); + update_wq_comp_bytes(wq, req->dlen); + } else { + update_total_decomp_bytes_in(req->slen); + update_wq_decomp_bytes(wq, req->slen); + } + + if (compress_op && ctx->verify_compress) { + 
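+		/*
+		 * Verify the compress result: stash the hardware-computed
+		 * CRC, sync the DMA buffers back to the device, and let
+		 * iaa_compress_verify() decompress the output and compare
+		 * CRCs.
+		 */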
dma_addr_t src_addr, dst_addr; + + req->compression_crc =3D idxd_desc->iax_completion->crc; + + dma_sync_sg_for_device(dev, req->dst, 1, DMA_FROM_DEVICE); + dma_sync_sg_for_device(dev, req->src, 1, DMA_TO_DEVICE); + + src_addr =3D sg_dma_address(req->src); + dst_addr =3D sg_dma_address(req->dst); + + ret =3D iaa_compress_verify(ctx, req, wq, src_addr, req->slen, + dst_addr, req->dlen); + } + +out: + /* caller doesn't call crypto_wait_req, so no acomp_request_complete() */ + dma_unmap_sg(dev, req->dst, 1, DMA_FROM_DEVICE); + dma_unmap_sg(dev, req->src, 1, DMA_TO_DEVICE); + + idxd_free_desc(idxd_desc->wq, idxd_desc); + percpu_ref_put(&iaa_wq->ref); + + return ret; +} + static void compression_ctx_init(struct iaa_compression_ctx *ctx, enum iaa=_mode mode) { ctx->mode =3D mode; --=20 2.27.0 From nobody Thu Oct 2 00:50:47 2025 
From: Kanchana P Sridhar To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, hannes@cmpxchg.org, yosry.ahmed@linux.dev, nphamcs@gmail.com, chengming.zhou@linux.dev, usamaarif642@gmail.com, ryan.roberts@arm.com, 21cnbao@gmail.com, ying.huang@linux.alibaba.com, akpm@linux-foundation.org, senozhatsky@chromium.org, sj@kernel.org, kasong@tencent.com, linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au, davem@davemloft.net, clabbe@baylibre.com, ardb@kernel.org, ebiggers@google.com, surenb@google.com, kristen.c.accardi@intel.com, vinicius.gomes@intel.com Cc: wajdi.k.feghali@intel.com, vinodh.gopal@intel.com, kanchana.p.sridhar@intel.com Subject: [PATCH v12 13/23] crypto: acomp - Define a unit_size in struct acomp_req to enable batching. Date: Thu, 25 Sep 2025 20:34:52 -0700 Message-Id: <20250926033502.7486-14-kanchana.p.sridhar@intel.com> X-Mailer: git-send-email 2.27.0 In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" We add a new @unit_size data member to struct acomp_req along with a helper function acomp_request_set_unit_size() for kernel modules to set the unit size to use while breaking down the request's src/dst scatterlists. An acomp_alg can implement batching by using the @req->unit_size to break down the SG lists passed in via @req->dst and/or @req->src, to submit individual @req->slen/@req->unit_size compress jobs or @req->dlen/@req->unit_size decompress jobs, for batch compression and batch decompression respectively. In case of batch compression, the folio's pages for the batch can be retrieved from the @req->src scatterlist by using a struct sg_page_iter after determining the number of pages as @req->slen/@req->unit_size. Suggested-by: Herbert Xu Signed-off-by: Kanchana P Sridhar --- include/crypto/acompress.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h index 9eacb9fa375d..0f1334168f1b 100644 --- a/include/crypto/acompress.h +++ b/include/crypto/acompress.h @@ -79,6 +79,7 @@ struct acomp_req_chain { * @dvirt: Destination virtual address * @slen: Size of the input buffer * @dlen: Size of the output buffer and number of bytes produced + * @unit_size: Unit size for the request for use in batching * @chain: Private API code data, do not use * @__ctx: Start of private context data */ @@ -94,6 +95,7 @@ struct acomp_req { }; unsigned int slen; unsigned int dlen; + unsigned int unit_size; =20 struct acomp_req_chain chain; =20 @@ -328,9 +330,43 @@ static inline void acomp_request_set_callback(struct a= comp_req *req, { flgs &=3D ~CRYPTO_ACOMP_REQ_PRIVATE; flgs |=3D req->base.flags & CRYPTO_ACOMP_REQ_PRIVATE; + req->unit_size =3D 0; crypto_request_set_callback(&req->base, flgs, cmpl, data); } =20 +/** + * acomp_request_set_unit_size() -- Sets the unit size for the request. + * + * As suggested by Herbert Xu, this is a new helper function that enables + * batching for zswap, IPComp, etc. 
+ * + * Example usage model: + * + * A module like zswap that wants to use batch compression of @nr_pages wi= th + * crypto_acomp must create an output SG table for the batch, initialized = to + * contain @nr_pages SG lists. Each scatterlist is mapped to the nth + * destination buffer for the batch. + * + * An acomp_alg can implement batching by using the @req->unit_size to + * break down the SG lists passed in via @req->dst and/or @req->src, to + * submit individual @req->slen/@req->unit_size compress jobs or + * @req->dlen/@req->unit_size decompress jobs, for batch compression and + * batch decompression respectively. + * + * This API must be called after acomp_request_set_callback(), + * which sets @req->unit_size to 0. + * + * @du would be PAGE_SIZE for zswap; it could be the MTU for IPsec. + * + * @req: asynchronous compress request + * @du: data unit size of the input buffer scatterlist. + */ +static inline void acomp_request_set_unit_size(struct acomp_req *req, + unsigned int du) +{ + req->unit_size =3D du; +} + /** * acomp_request_set_params() -- Sets request parameters * --=20 2.27.0 From nobody Thu Oct 2 00:50:47 2025 
From: Kanchana P Sridhar To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, hannes@cmpxchg.org, yosry.ahmed@linux.dev, nphamcs@gmail.com, chengming.zhou@linux.dev, usamaarif642@gmail.com, ryan.roberts@arm.com, 21cnbao@gmail.com, ying.huang@linux.alibaba.com, akpm@linux-foundation.org, senozhatsky@chromium.org, sj@kernel.org, kasong@tencent.com, linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au, davem@davemloft.net, clabbe@baylibre.com, ardb@kernel.org, ebiggers@google.com, surenb@google.com, kristen.c.accardi@intel.com, vinicius.gomes@intel.com Cc: wajdi.k.feghali@intel.com, vinodh.gopal@intel.com, kanchana.p.sridhar@intel.com Subject: [PATCH v12 14/23] crypto: iaa - IAA Batching for parallel compressions/decompressions. Date: Thu, 25 Sep 2025 20:34:53 -0700 Message-Id: <20250926033502.7486-15-kanchana.p.sridhar@intel.com> X-Mailer: git-send-email 2.27.0 In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" This patch introduces batch compressions/decompressions in iaa_crypto. Two new interfaces are provided for use in the kernel, either directly in the zram/zcomp backend, or by calling crypto_acomp_[de]compress() in the case of zswap. IAA batching allows kernel swap modules to compress/decompress multiple pages/buffers in parallel in hardware, significantly improving swapout/swapin latency and throughput. The patch defines an iaa_crypto constant, IAA_CRYPTO_MAX_BATCH_SIZE (set to 8U currently). This is the maximum batch-size for IAA, and represents the maximum number of pages/buffers that can be compressed/decompressed in parallel. In order to support IAA batching, the iaa_crypto driver allocates a per-CPU array of IAA_CRYPTO_MAX_BATCH_SIZE "struct iaa_req" pointers upon initialization. Notably, the task of allocating multiple requests to submit to the hardware for parallel [de]compressions is taken over by iaa_crypto, so that zswap/zram don't need to allocate the reqs. Compress batching is expected to be called by kernel modules such as zswap by passing the folio pages in the "source" SG list of the acomp_req, and by constructing an SG table of SG lists for the output buffers and setting the acomp_req's "dst" to the head of this list of scatterlists. Thanks to Herbert Xu for suggesting this batching architecture. Within the iaa_crypto driver's compress batching function: 1) The per-CPU iaa_reqs are populated from the acomp_req's src/dst SG lists. 2) All iaa_reqs are submitted to the hardware in async mode, using movdir64b. This enables hardware parallelism, because we don't wait for one compress/decompress job to finish before submitting the next one. 
3) The iaa_reqs submitted are polled for completion statuses in a non-blocking manner in a while loop: each request that is still pending is polled once, and this repeats, until all requests have completed. IAA's maximum batch-size can be queried with the following API: unsigned int iaa_comp_get_max_batch_size(void); This allows swap modules such as zram to allocate required batching dst buffers and then invoke fully asynchronous batch parallel compression/decompression of pages/buffers on systems with Intel IAA, by invoking these batching API, respectively: int iaa_comp_compress_batch( enum iaa_mode mode, struct iaa_req *parent_req, unsigned int unit_size); int iaa_comp_decompress_batch( enum iaa_mode mode, struct iaa_req *parent_req, unsigned int unit_size); The parameter @unit_size represents the unit size in bytes, for dis-assembling the source/destination @parent_req->slen/@parent_req->dlen and SG lists passed in through @parent_req->src and @parent_req->dst. A zram/zcomp backend_deflate_iaa.c will be submitted as a separate patch series, and will enable single-page and batch IAA compress/decompress ops. The zswap interface to these batching API will be done by setting up the acomp_req through these crypto API: acomp_request_set_src_folio() acomp_request_set_dst_sg() acomp_request_set_unit_size() before proceeding to invoke batch compression/decompression using the existing crypto_acomp_compress()/crypto_acomp_decompress() interfaces. The new crypto_acomp-agnostic iaa_comp_[de]compress_batch() API result in impressive latency improvements for zswap batch [de]compression, as compared to a crypto_acomp based batching interface, most likely because we avoid the overhead of crypto_acomp: we observe 17.78 micro-seconds p99 latency savings for a decompress batch of 8 with the new iaa_comp_decompress_batch() API. Suggested-by: Herbert Xu Signed-off-by: Kanchana P Sridhar --- drivers/crypto/intel/iaa/iaa_crypto.h | 15 + drivers/crypto/intel/iaa/iaa_crypto_main.c | 346 ++++++++++++++++++++- include/linux/iaa_comp.h | 35 +++ 3 files changed, 388 insertions(+), 8 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/i= aa/iaa_crypto.h index 1cc383c94fb8..db1e50574662 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto.h +++ b/drivers/crypto/intel/iaa/iaa_crypto.h @@ -47,6 +47,21 @@ */ #define IAA_REQ_POLL_FLAG 0x00000002 =20 +/* + * The maximum compress/decompress batch size for IAA's batch compression + * and batch decompression functionality. + */ +#define IAA_CRYPTO_MAX_BATCH_SIZE 8U + +/* + * Used to create per-CPU structure comprising of IAA_CRYPTO_MAX_BATCH_SIZE + * reqs for batch [de]compressions. + */ +struct iaa_batch_ctx { + struct iaa_req **reqs; + struct mutex mutex; +}; + /* Representation of IAA workqueue */ struct iaa_wq { struct list_head list; diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index 7395822430b1..0a620f2dc58e 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -55,6 +55,9 @@ static struct wq_table_entry **pkg_global_comp_wqs; /* For software deflate fallback compress/decompress. */ static struct crypto_acomp *deflate_crypto_acomp; =20 +/* Per-cpu iaa_reqs for batching. 
*/ +static struct iaa_batch_ctx __percpu *iaa_batch_ctx; + LIST_HEAD(iaa_devices); DEFINE_MUTEX(iaa_devices_lock); =20 @@ -2190,7 +2193,7 @@ static int iaa_comp_adecompress(struct iaa_compressio= n_ctx *ctx, struct iaa_req return ret; } =20 -static int __maybe_unused iaa_comp_poll(struct iaa_compression_ctx *ctx, s= truct iaa_req *req) +static int iaa_comp_poll(struct iaa_compression_ctx *ctx, struct iaa_req *= req) { struct idxd_desc *idxd_desc; struct idxd_device *idxd; @@ -2255,6 +2258,234 @@ static int __maybe_unused iaa_comp_poll(struct iaa_= compression_ctx *ctx, struct return ret; } =20 +static __always_inline void iaa_set_req_poll( + struct iaa_req *reqs[], + int nr_reqs, + bool set_flag) +{ + int i; + + for (i =3D 0; i < nr_reqs; ++i) { + set_flag ? (reqs[i]->flags |=3D IAA_REQ_POLL_FLAG) : + (reqs[i]->flags &=3D ~IAA_REQ_POLL_FLAG); + } +} + +/** + * This API provides IAA compress batching functionality for use by swap + * modules. + * + * @ctx: compression ctx for the requested IAA mode (fixed/dynamic). + * @parent_req: The "parent" iaa_req that contains SG lists for the batch's + * inputs and outputs. + * @unit_size: The unit size to apply to @parent_req->slen to get the numb= er of + * scatterlists it contains. + * + * The caller should check the individual sg->lengths in the @parent_req f= or + * errors, including incompressible page errors. + * + * Returns 0 if all compress requests in the batch complete successfully, + * -EINVAL otherwise. + */ +static int iaa_comp_acompress_batch( + struct iaa_compression_ctx *ctx, + struct iaa_req *parent_req, + unsigned int unit_size) +{ + struct iaa_batch_ctx *cpu_ctx =3D raw_cpu_ptr(iaa_batch_ctx); + int nr_reqs =3D parent_req->slen / unit_size; + int errors[IAA_CRYPTO_MAX_BATCH_SIZE]; + bool compressions_done =3D false; + struct sg_page_iter sgiter; + struct scatterlist *sg; + struct iaa_req **reqs; + int i, err =3D 0; + + mutex_lock(&cpu_ctx->mutex); + + reqs =3D cpu_ctx->reqs; + + __sg_page_iter_start(&sgiter, parent_req->src, nr_reqs, + parent_req->src->offset/unit_size); + + for (i =3D 0; i < nr_reqs; ++i, ++sgiter.sg_pgoffset) { + sg_set_page(reqs[i]->src, sg_page_iter_page(&sgiter), PAGE_SIZE, 0); + reqs[i]->slen =3D PAGE_SIZE; + } + + for_each_sg(parent_req->dst, sg, nr_reqs, i) { + sg->length =3D PAGE_SIZE; + parent_req->dlens[i] =3D &sg->length; + reqs[i]->dst =3D sg; + reqs[i]->dlen =3D PAGE_SIZE; + } + + iaa_set_req_poll(reqs, nr_reqs, true); + + /* + * Prepare and submit the batch of iaa_reqs to IAA. IAA will process + * these compress jobs in parallel. + */ + for (i =3D 0; i < nr_reqs; ++i) { + errors[i] =3D iaa_comp_acompress(ctx, reqs[i]); + + if (likely(errors[i] =3D=3D -EINPROGRESS)) { + errors[i] =3D -EAGAIN; + } else if (unlikely(errors[i])) { + *parent_req->dlens[i] =3D errors[i]; + err =3D -EINVAL; + } else { + *parent_req->dlens[i] =3D reqs[i]->dlen; + } + } + + /* + * Asynchronously poll for and process IAA compress job completions. + */ + while (!compressions_done) { + compressions_done =3D true; + + for (i =3D 0; i < nr_reqs; ++i) { + /* + * Skip, if the compression has already completed + * successfully or with an error. 
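+ * A request whose status is still -EAGAIN is in flight and
+ * will be polled again on the next pass.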
+ */ + if (errors[i] !=3D -EAGAIN) + continue; + + errors[i] =3D iaa_comp_poll(ctx, reqs[i]); + + if (errors[i]) { + if (likely(errors[i] =3D=3D -EAGAIN)) { + compressions_done =3D false; + } else { + *parent_req->dlens[i] =3D errors[i]; + err =3D -EINVAL; + } + } else { + *parent_req->dlens[i] =3D reqs[i]->dlen; + } + } + } + + /* + * For the same 'reqs[]' to be usable by + * iaa_comp_acompress()/iaa_comp_adecompress(), + * clear the IAA_REQ_POLL_FLAG bit on all iaa_reqs. + */ + iaa_set_req_poll(reqs, nr_reqs, false); + + mutex_unlock(&cpu_ctx->mutex); + return err; +} + +/** + * This API provides IAA decompress batching functionality for use by swap + * modules. + * + * @ctx: compression ctx for the requested IAA mode (fixed/dynamic). + * @parent_req: The "parent" iaa_req that contains SG lists for the batch's + * inputs and outputs. + * @unit_size: The unit size to apply to @parent_req->dlen to get the numb= er of + * scatterlists it contains. + * + * The caller should check @parent_req->dst scatterlist's component SG lis= ts' + * @length for errors and handle @length !=3D PAGE_SIZE. + * + * Returns 0 if all decompress requests complete successfully, + * -EINVAL otherwise. + */ +static int iaa_comp_adecompress_batch( + struct iaa_compression_ctx *ctx, + struct iaa_req *parent_req, + unsigned int unit_size) +{ + struct iaa_batch_ctx *cpu_ctx =3D raw_cpu_ptr(iaa_batch_ctx); + int nr_reqs =3D parent_req->dlen / unit_size; + int errors[IAA_CRYPTO_MAX_BATCH_SIZE]; + bool decompressions_done =3D false; + struct scatterlist *sg; + struct iaa_req **reqs; + int i, err =3D 0; + + mutex_lock(&cpu_ctx->mutex); + + reqs =3D cpu_ctx->reqs; + + for_each_sg(parent_req->src, sg, nr_reqs, i) { + reqs[i]->src =3D sg; + reqs[i]->slen =3D sg->length; + } + + for_each_sg(parent_req->dst, sg, nr_reqs, i) { + parent_req->dlens[i] =3D &sg->length; + reqs[i]->dst =3D sg; + reqs[i]->dlen =3D PAGE_SIZE; + } + + iaa_set_req_poll(reqs, nr_reqs, true); + + /* + * Prepare and submit the batch of iaa_reqs to IAA. IAA will process + * these decompress jobs in parallel. + */ + for (i =3D 0; i < nr_reqs; ++i) { + errors[i] =3D iaa_comp_adecompress(ctx, reqs[i]); + + /* + * If it failed desc allocation/submission, errors[i] can + * be 0 or error value from software decompress. + */ + if (likely(errors[i] =3D=3D -EINPROGRESS)) { + errors[i] =3D -EAGAIN; + } else if (unlikely(errors[i])) { + *parent_req->dlens[i] =3D errors[i]; + err =3D -EINVAL; + } else { + *parent_req->dlens[i] =3D reqs[i]->dlen; + } + } + + /* + * Asynchronously poll for and process IAA decompress job completions. + */ + while (!decompressions_done) { + decompressions_done =3D true; + + for (i =3D 0; i < nr_reqs; ++i) { + /* + * Skip, if the decompression has already completed + * successfully or with an error. + */ + if (errors[i] !=3D -EAGAIN) + continue; + + errors[i] =3D iaa_comp_poll(ctx, reqs[i]); + + if (errors[i]) { + if (likely(errors[i] =3D=3D -EAGAIN)) { + decompressions_done =3D false; + } else { + *parent_req->dlens[i] =3D errors[i]; + err =3D -EINVAL; + } + } else { + *parent_req->dlens[i] =3D reqs[i]->dlen; + } + } + } + + /* + * For the same 'reqs[]' to be usable by + * iaa_comp_acompress()/iaa_comp_adecompress(), + * clear the IAA_REQ_POLL_FLAG bit on all iaa_reqs. 
+ */ + iaa_set_req_poll(reqs, nr_reqs, false); + + mutex_unlock(&cpu_ctx->mutex); + return err; +} + static void compression_ctx_init(struct iaa_compression_ctx *ctx, enum iaa= _mode mode) { ctx->mode =3D mode; @@ -2357,6 +2588,12 @@ u8 iaa_comp_get_modes(char **iaa_mode_names, enum ia= a_mode *iaa_modes) } EXPORT_SYMBOL_GPL(iaa_comp_get_modes); =20 +__always_inline unsigned int iaa_comp_get_max_batch_size(void) +{ + return IAA_CRYPTO_MAX_BATCH_SIZE; +} +EXPORT_SYMBOL_GPL(iaa_comp_get_max_batch_size); + __always_inline int iaa_comp_compress(enum iaa_mode mode, struct iaa_req *= req) { return iaa_comp_acompress(iaa_ctx[mode], req); @@ -2369,6 +2606,24 @@ __always_inline int iaa_comp_decompress(enum iaa_mod= e mode, struct iaa_req *req) } EXPORT_SYMBOL_GPL(iaa_comp_decompress); =20 +__always_inline int iaa_comp_compress_batch( + enum iaa_mode mode, + struct iaa_req *parent_req, + unsigned int unit_size) +{ + return iaa_comp_acompress_batch(iaa_ctx[mode], parent_req, unit_size); +} +EXPORT_SYMBOL_GPL(iaa_comp_compress_batch); + +__always_inline int iaa_comp_decompress_batch( + enum iaa_mode mode, + struct iaa_req *parent_req, + unsigned int unit_size) +{ + return iaa_comp_adecompress_batch(iaa_ctx[mode], parent_req, unit_size); +} +EXPORT_SYMBOL_GPL(iaa_comp_decompress_batch); + /********************************************* * Interfaces to crypto_alg and crypto_acomp. *********************************************/ @@ -2383,9 +2638,16 @@ static int iaa_comp_acompress_main(struct acomp_req = *areq) if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) { ctx =3D iaa_ctx[idx]; =20 - acomp_to_iaa(areq, &req, ctx); - ret =3D iaa_comp_acompress(ctx, &req); - iaa_to_acomp(unlikely(ret) ? ret : req.dlen, areq); + if (likely(areq->slen =3D=3D areq->unit_size)) { + acomp_to_iaa(areq, &req, ctx); + ret =3D iaa_comp_acompress(ctx, &req); + iaa_to_acomp(unlikely(ret) ? ret : req.dlen, areq); + return ret; + } else { + acomp_to_iaa(areq, &req, ctx); + ret =3D iaa_comp_acompress_batch(ctx, &req, areq->unit_size); + return ret; + } } =20 return ret; @@ -2401,9 +2663,16 @@ static int iaa_comp_adecompress_main(struct acomp_re= q *areq) if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) { ctx =3D iaa_ctx[idx]; =20 - acomp_to_iaa(areq, &req, ctx); - ret =3D iaa_comp_adecompress(ctx, &req); - iaa_to_acomp(unlikely(ret) ? ret : req.dlen, areq); + if (likely(areq->dlen =3D=3D areq->unit_size)) { + acomp_to_iaa(areq, &req, ctx); + ret =3D iaa_comp_adecompress(ctx, &req); + iaa_to_acomp(unlikely(ret) ? ret : req.dlen, areq); + return ret; + } else { + acomp_to_iaa(areq, &req, ctx); + ret =3D iaa_comp_adecompress_batch(ctx, &req, areq->unit_size); + return ret; + } } =20 return ret; @@ -2699,9 +2968,31 @@ static struct idxd_device_driver iaa_crypto_driver = =3D { * Module init/exit. ********************/ =20 +static void iaa_batch_ctx_dealloc(void) +{ + int cpu; + u8 i; + + if (!iaa_batch_ctx) + return; + + for (cpu =3D 0; cpu < nr_cpus; cpu++) { + struct iaa_batch_ctx *cpu_ctx =3D per_cpu_ptr(iaa_batch_ctx, cpu); + + if (cpu_ctx && cpu_ctx->reqs) { + for (i =3D 0; i < IAA_CRYPTO_MAX_BATCH_SIZE; ++i) + kfree(cpu_ctx->reqs[i]); + kfree(cpu_ctx->reqs); + } + } + + free_percpu(iaa_batch_ctx); +} + static int __init iaa_crypto_init_module(void) { - int ret =3D 0; + int cpu, ret =3D 0; + u8 i; =20 INIT_LIST_HEAD(&iaa_devices); =20 @@ -2756,6 +3047,39 @@ static int __init iaa_crypto_init_module(void) goto err_sync_attr_create; } =20 + /* Allocate batching resources for iaa_crypto. 
*/ + iaa_batch_ctx =3D alloc_percpu_gfp(struct iaa_batch_ctx, GFP_KERNEL | __G= FP_ZERO); + if (!iaa_batch_ctx) { + pr_debug("Failed to allocate per-cpu iaa_batch_ctx\n"); + goto batch_ctx_fail; + } + + for (cpu =3D 0; cpu < nr_cpus; cpu++) { + struct iaa_batch_ctx *cpu_ctx =3D per_cpu_ptr(iaa_batch_ctx, cpu); + int cpu_node =3D cpu_to_node(cpu); + + cpu_ctx->reqs =3D kcalloc_node(IAA_CRYPTO_MAX_BATCH_SIZE, + sizeof(struct iaa_req *), + GFP_KERNEL, cpu_node); + + if (!cpu_ctx->reqs) + goto reqs_fail; + + for (i =3D 0; i < IAA_CRYPTO_MAX_BATCH_SIZE; ++i) { + cpu_ctx->reqs[i] =3D kzalloc_node(sizeof(struct iaa_req), + GFP_KERNEL, cpu_node); + if (!cpu_ctx->reqs[i]) { + pr_debug("Could not alloc iaa_req reqs[%d]\n", i); + goto reqs_fail; + } + + sg_init_table(&cpu_ctx->reqs[i]->sg_src, 1); + cpu_ctx->reqs[i]->src =3D &cpu_ctx->reqs[i]->sg_src; + } + + mutex_init(&cpu_ctx->mutex); + } + if (iaa_crypto_debugfs_init()) pr_warn("debugfs init failed, stats not available\n"); =20 @@ -2763,6 +3087,11 @@ static int __init iaa_crypto_init_module(void) out: return ret; =20 +reqs_fail: + iaa_batch_ctx_dealloc(); +batch_ctx_fail: + driver_remove_file(&iaa_crypto_driver.drv, + &driver_attr_sync_mode); err_sync_attr_create: driver_remove_file(&iaa_crypto_driver.drv, &driver_attr_verify_compress); @@ -2789,6 +3118,7 @@ static void __exit iaa_crypto_cleanup_module(void) iaa_unregister_acomp_compression_device(); iaa_unregister_compression_device(); =20 + iaa_batch_ctx_dealloc(); iaa_crypto_debugfs_cleanup(); driver_remove_file(&iaa_crypto_driver.drv, &driver_attr_sync_mode); diff --git a/include/linux/iaa_comp.h b/include/linux/iaa_comp.h index ec061315f477..7b765760485c 100644 --- a/include/linux/iaa_comp.h +++ b/include/linux/iaa_comp.h @@ -18,11 +18,13 @@ enum iaa_mode { struct iaa_req { struct scatterlist *src; struct scatterlist *dst; + struct scatterlist sg_src; unsigned int slen; unsigned int dlen; u32 flags; u32 compression_crc; void *drv_data; /* for driver internal use */ + int **dlens; }; =20 extern bool iaa_comp_enabled(void); @@ -35,10 +37,22 @@ extern u8 iaa_comp_get_modes(char **iaa_mode_names, enu= m iaa_mode *iaa_modes); =20 extern void iaa_comp_put_modes(char **iaa_mode_names, enum iaa_mode *iaa_m= odes, u8 nr_modes); =20 +extern unsigned int iaa_comp_get_max_batch_size(void); + extern int iaa_comp_compress(enum iaa_mode mode, struct iaa_req *req); =20 extern int iaa_comp_decompress(enum iaa_mode mode, struct iaa_req *req); =20 +extern int iaa_comp_compress_batch( + enum iaa_mode mode, + struct iaa_req *parent_req, + unsigned int unit_size); + +extern int iaa_comp_decompress_batch( + enum iaa_mode mode, + struct iaa_req *parent_req, + unsigned int unit_size); + #else /* CONFIG_CRYPTO_DEV_IAA_CRYPTO */ =20 enum iaa_mode { @@ -71,6 +85,11 @@ static inline void iaa_comp_put_modes(char **iaa_mode_na= mes, enum iaa_mode *iaa_ { } =20 +static inline unsigned int iaa_comp_get_max_batch_size(void) +{ + return 0; +} + static inline int iaa_comp_compress(enum iaa_mode mode, struct iaa_req *re= q) { return -EINVAL; @@ -81,6 +100,22 @@ static inline int iaa_comp_decompress(enum iaa_mode mod= e, struct iaa_req *req) return -EINVAL; } =20 +static inline int iaa_comp_compress_batch( + enum iaa_mode mode, + struct iaa_req *parent_req, + unsigned int unit_size) +{ + return -EINVAL; +} + +static inline int iaa_comp_decompress_batch( + enum iaa_mode mode, + struct iaa_req *parent_req, + unsigned int unit_size) +{ + return -EINVAL; +} + #endif /* CONFIG_CRYPTO_DEV_IAA_CRYPTO */ =20 #endif --=20 2.27.0 From 
nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, hannes@cmpxchg.org,
 yosry.ahmed@linux.dev, nphamcs@gmail.com, chengming.zhou@linux.dev,
 usamaarif642@gmail.com, ryan.roberts@arm.com, 21cnbao@gmail.com,
 ying.huang@linux.alibaba.com, akpm@linux-foundation.org,
 senozhatsky@chromium.org, sj@kernel.org, kasong@tencent.com,
 linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au,
 davem@davemloft.net, clabbe@baylibre.com, ardb@kernel.org,
 ebiggers@google.com, surenb@google.com, kristen.c.accardi@intel.com,
 vinicius.gomes@intel.com
Cc: 
wajdi.k.feghali@intel.com, vinodh.gopal@intel.com,
 kanchana.p.sridhar@intel.com
Subject: [PATCH v12 15/23] crypto: iaa - Enable async mode and make it the default.
Date: Thu, 25 Sep 2025 20:34:54 -0700
Message-Id: <20250926033502.7486-16-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

This patch enables the 'async' sync_mode in the driver and makes 'async'
the default sync_mode. With this, the iaa_crypto driver loads by default
in the most efficient/recommended mode for parallel
compressions/decompressions: asynchronous submission of descriptors,
followed by polling for job completions. Earlier, the "sync" mode was
the default.

The iaa_crypto driver documentation has been updated with these changes.

This way, anyone who wants to use IAA for zswap/zram can do so after
building the kernel, without having to go through these steps to use
async mode:

1) disable all the IAA device/wq bindings that happen at boot time
2) rmmod iaa_crypto
3) modprobe iaa_crypto
4) echo async > /sys/bus/dsa/drivers/crypto/sync_mode
5) re-run initialization of the IAA devices and wqs

Signed-off-by: Kanchana P Sridhar
---
 Documentation/driver-api/crypto/iaa/iaa-crypto.rst | 11 ++---------
 drivers/crypto/intel/iaa/iaa_crypto_main.c         |  4 ++--
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documenta=
tion/driver-api/crypto/iaa/iaa-crypto.rst
index 0ff4ec603b43..d5e610ef4612 100644
--- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
+++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
@@ -272,7 +272,7 @@ The available attributes are:
       echo async_irq > /sys/bus/dsa/drivers/crypto/sync_mode
=20
    Async mode without interrupts (caller must poll) can be enabled by
-   writing 'async' to it (please see Caveat)::
+   writing 'async' to it::
=20
       echo async > /sys/bus/dsa/drivers/crypto/sync_mode
=20
@@ -281,14 +281,7 @@ The available attributes are:
=20
       echo sync > /sys/bus/dsa/drivers/crypto/sync_mode
=20
-   The default mode is 'sync'.
-
-   Caveat: since the only mechanism that iaa_crypto currently implements
-   for async polling without interrupts is via the 'sync' mode as
-   described earlier, writing 'async' to
-   '/sys/bus/dsa/drivers/crypto/sync_mode' will internally enable the
-   'sync' mode. This is to ensure correct iaa_crypto behavior until true
-   async polling without interrupts is enabled in iaa_crypto.
+   The default mode is 'async'.
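For reference, the resulting name-to-flag mapping in set_iaa_sync_mode()
after this change is summarized in this condensed sketch (adapted from the
driver logic in the diff below; the -EINVAL fallthrough is assumed from the
existing driver and is not part of this hunk):

	/* Condensed sketch: sync_mode name -> driver flags after this patch. */
	static int set_iaa_sync_mode(const char *name)
	{
		if (sysfs_streq(name, "sync")) {
			async_mode = false;	/* synchronous submit + wait */
			use_irq = false;
		} else if (sysfs_streq(name, "async")) {
			async_mode = true;	/* async submit, caller polls (new default) */
			use_irq = false;
		} else if (sysfs_streq(name, "async_irq")) {
			async_mode = true;	/* async submit, interrupt on completion */
			use_irq = true;
		} else {
			return -EINVAL;		/* assumed fallthrough */
		}

		return 0;
	}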
=20
 - g_comp_wqs_per_iaa
=20
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in=
tel/iaa/iaa_crypto_main.c
index 0a620f2dc58e..c4f40984e9bf 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -152,7 +152,7 @@ static bool iaa_verify_compress =3D true;
  */
=20
 /* Use async mode */
-static bool async_mode;
+static bool async_mode =3D true;
 /* Use interrupts */
 static bool use_irq;
=20
@@ -206,7 +206,7 @@ static int set_iaa_sync_mode(const char *name)
 		async_mode =3D false;
 		use_irq =3D false;
 	} else if (sysfs_streq(name, "async")) {
-		async_mode =3D false;
+		async_mode =3D true;
 		use_irq =3D false;
 	} else if (sysfs_streq(name, "async_irq")) {
 		async_mode =3D true;
--=20
2.27.0

From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, hannes@cmpxchg.org,
 yosry.ahmed@linux.dev, nphamcs@gmail.com, chengming.zhou@linux.dev,
 usamaarif642@gmail.com, ryan.roberts@arm.com, 21cnbao@gmail.com,
 ying.huang@linux.alibaba.com, akpm@linux-foundation.org,
 senozhatsky@chromium.org, sj@kernel.org, kasong@tencent.com,
 linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au,
 davem@davemloft.net, clabbe@baylibre.com, ardb@kernel.org,
 ebiggers@google.com, surenb@google.com, kristen.c.accardi@intel.com,
 vinicius.gomes@intel.com
Cc: wajdi.k.feghali@intel.com, vinodh.gopal@intel.com,
 kanchana.p.sridhar@intel.com
Subject: [PATCH v12 16/23] crypto: iaa - Disable iaa_verify_compress by default.
Date: Thu, 25 Sep 2025 20:34:55 -0700
Message-Id: <20250926033502.7486-17-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

This patch makes it easier for IAA hardware acceleration in the
iaa_crypto driver to be loaded by default with "iaa_verify_compress"
disabled, to facilitate performance comparisons with software
compressors (which also do not run compress verification by default).
Earlier, iaa_crypto compress verification was enabled by default.

The iaa_crypto driver documentation has been updated with this change.

With this patch, if users want to enable compress verification, they can
do so with these steps:

1) disable all the IAA device/wq bindings that happen at boot time
2) rmmod iaa_crypto
3) modprobe iaa_crypto
4) echo 1 > /sys/bus/dsa/drivers/crypto/verify_compress
5) re-run initialization of the IAA devices and wqs

Signed-off-by: Kanchana P Sridhar
---
 Documentation/driver-api/crypto/iaa/iaa-crypto.rst | 2 +-
 drivers/crypto/intel/iaa/iaa_crypto_main.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documenta=
tion/driver-api/crypto/iaa/iaa-crypto.rst
index d5e610ef4612..81a7dbd15f8b 100644
--- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
+++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
@@ -239,7 +239,7 @@ The available attributes are:
=20
       echo 0 > /sys/bus/dsa/drivers/crypto/verify_compress
=20
-   The default setting is '1' - verify all compresses.
+   The default setting is '0' - do not verify compresses.
=20
 - sync_mode
=20
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in=
tel/iaa/iaa_crypto_main.c
index c4f40984e9bf..5b933c138e50 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -120,7 +120,7 @@ static bool iaa_distribute_decomps;
 static bool iaa_distribute_comps =3D true;
=20
 /* Verify results of IAA compress or not */
-static bool iaa_verify_compress =3D true;
+static bool iaa_verify_compress;
=20
 /*
  * The iaa crypto driver supports three 'sync' methods determining how
--=20
2.27.0

From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, hannes@cmpxchg.org,
 yosry.ahmed@linux.dev, nphamcs@gmail.com, chengming.zhou@linux.dev,
 usamaarif642@gmail.com, ryan.roberts@arm.com, 21cnbao@gmail.com,
 ying.huang@linux.alibaba.com, akpm@linux-foundation.org,
 senozhatsky@chromium.org, sj@kernel.org, kasong@tencent.com,
 linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au,
 davem@davemloft.net, clabbe@baylibre.com, ardb@kernel.org,
 ebiggers@google.com, surenb@google.com, kristen.c.accardi@intel.com,
 vinicius.gomes@intel.com
Cc: wajdi.k.feghali@intel.com, vinodh.gopal@intel.com,
 kanchana.p.sridhar@intel.com
Subject: [PATCH v12 17/23] crypto: iaa - Submit the two largest source buffers first in decompress batching.
Date: Thu, 25 Sep 2025 20:34:56 -0700
Message-Id: <20250926033502.7486-18-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

This patch finds the two largest source buffers in a given decompression
batch, and submits them first to the IAA decompress engines. This improves
decompress batching latency because the hardware gets a head start on
decompressing the highest-latency source buffers in the batch. Workload
performance is also significantly improved as a result of this
optimization.

Signed-off-by: Kanchana P Sridhar
---
 drivers/crypto/intel/iaa/iaa_crypto_main.c | 61 +++++++++++++++++++++-
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in=
tel/iaa/iaa_crypto_main.c
index 5b933c138e50..0669ae155e90 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -2379,6 +2379,36 @@ static int iaa_comp_acompress_batch(
 	return err;
 }
=20
+/*
+ * Find the two largest source buffers in @reqs for a decompress batch,
+ * and pass their indices back in @idx_max and @idx_next_max.
+ *
+ * Returns true if there is no second largest source buffer, only a max bu=
ffer.
+ */
+static bool decomp_batch_get_max_slens_idx(
+	struct iaa_req *reqs[],
+	int nr_pages,
+	int *idx_max,
+	int *idx_next_max)
+{
+	int i, max_i =3D 0, next_max_i =3D 0;
+
+	for (i =3D 0; i < nr_pages; ++i) {
+		if (reqs[i]->slen >=3D reqs[max_i]->slen) {
+			next_max_i =3D max_i;
+			max_i =3D i;
+		} else if ((next_max_i =3D=3D max_i) ||
+			   (reqs[i]->slen > reqs[next_max_i]->slen)) {
+			next_max_i =3D i;
+		}
+	}
+
+	*idx_max =3D max_i;
+	*idx_next_max =3D next_max_i;
+
+	return (next_max_i =3D=3D max_i);
+}
+
 /**
  * This API provides IAA decompress batching functionality for use by swap
  * modules.
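To illustrate the selection scan above, here is a minimal, self-contained
user-space sketch of the same max/next-max pass (the plain slens[] array
and main() harness are illustrative adaptations, not driver code):

	#include <stdbool.h>
	#include <stdio.h>

	/* Same scan as decomp_batch_get_max_slens_idx(), over a plain array. */
	static bool get_max_slens_idx(const unsigned int *slens, int n,
				      int *idx_max, int *idx_next_max)
	{
		int i, max_i = 0, next_max_i = 0;

		for (i = 0; i < n; ++i) {
			if (slens[i] >= slens[max_i]) {
				/* New (or equal) maximum: old max becomes next-max. */
				next_max_i = max_i;
				max_i = i;
			} else if ((next_max_i == max_i) ||
				   (slens[i] > slens[next_max_i])) {
				next_max_i = i;
			}
		}

		*idx_max = max_i;
		*idx_next_max = next_max_i;

		/* true: no distinct second buffer (only when n <= 1). */
		return next_max_i == max_i;
	}

	int main(void)
	{
		unsigned int slens[] = { 2048, 4096, 1024, 4096 };
		int max_i, next_max_i;

		get_max_slens_idx(slens, 4, &max_i, &next_max_i);
		/* Prints "max: 3, next-max: 1": both 4096-byte buffers go first. */
		printf("max: %d, next-max: %d\n", max_i, next_max_i);
		return 0;
	}

Note that a tie for the maximum is resolved in favor of the later index,
so the two equally-sized largest buffers are both submitted ahead of the
rest of the batch.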
@@ -2401,12 +2431,13 @@ static int iaa_comp_adecompress_batch(
 	unsigned int unit_size)
 {
 	struct iaa_batch_ctx *cpu_ctx =3D raw_cpu_ptr(iaa_batch_ctx);
+	bool max_processed =3D false, next_max_processed =3D false;
 	int nr_reqs =3D parent_req->dlen / unit_size;
 	int errors[IAA_CRYPTO_MAX_BATCH_SIZE];
+	int i =3D 0, max_i, next_max_i, err =3D 0;
 	bool decompressions_done =3D false;
 	struct scatterlist *sg;
 	struct iaa_req **reqs;
-	int i, err =3D 0;
=20
 	mutex_lock(&cpu_ctx->mutex);
=20
@@ -2425,11 +2456,28 @@ static int iaa_comp_adecompress_batch(
=20
 	iaa_set_req_poll(reqs, nr_reqs, true);
=20
+	/*
+	 * Get the indices of the two largest decomp buffers in the batch.
+	 * Submit them first. This improves latency of the batch.
+	 */
+	next_max_processed =3D decomp_batch_get_max_slens_idx(reqs, nr_reqs,
+							     &max_i, &next_max_i);
+
+	i =3D max_i;
+
 	/*
 	 * Prepare and submit the batch of iaa_reqs to IAA. IAA will process
 	 * these decompress jobs in parallel.
 	 */
-	for (i =3D 0; i < nr_reqs; ++i) {
+	for (; i < nr_reqs; ++i) {
+		if ((i =3D=3D max_i) && max_processed)
+			continue;
+		if ((i =3D=3D next_max_i) && max_processed && next_max_processed)
+			continue;
+
+		if (max_processed && !next_max_processed)
+			i =3D next_max_i;
+
 		errors[i] =3D iaa_comp_adecompress(ctx, reqs[i]);
=20
 		/*
@@ -2444,6 +2492,15 @@ static int iaa_comp_adecompress_batch(
 		} else {
 			*parent_req->dlens[i] =3D reqs[i]->dlen;
 		}
+
+		if (i =3D=3D max_i) {
+			max_processed =3D true;
+			i =3D -1;
+		}
+		if (i =3D=3D next_max_i) {
+			next_max_processed =3D true;
+			i =3D -1;
+		}
 	}
=20
 	/*
--=20
2.27.0

From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, hannes@cmpxchg.org,
 yosry.ahmed@linux.dev, nphamcs@gmail.com, chengming.zhou@linux.dev,
 usamaarif642@gmail.com, ryan.roberts@arm.com, 21cnbao@gmail.com,
 ying.huang@linux.alibaba.com, akpm@linux-foundation.org,
 senozhatsky@chromium.org, sj@kernel.org, kasong@tencent.com,
 linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au,
 davem@davemloft.net, clabbe@baylibre.com, ardb@kernel.org,
 ebiggers@google.com, surenb@google.com, kristen.c.accardi@intel.com,
 vinicius.gomes@intel.com
Cc: wajdi.k.feghali@intel.com, vinodh.gopal@intel.com,
 kanchana.p.sridhar@intel.com
Subject: [PATCH v12 18/23] crypto: iaa - Add deflate-iaa-dynamic compression mode.
Date: Thu, 25 Sep 2025 20:34:57 -0700
Message-Id: <20250926033502.7486-19-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

Some versions of Intel IAA support dynamic compression, where the
hardware dynamically computes the Huffman tables and generates a Deflate
header if the input size is no larger than 4KB. This patch will use IAA
for dynamic compression if an appropriate IAA is present and the input
size is not too big. If an IAA is not present, the algorithm will not be
available. Otherwise, if the size of the input is greater than PAGE_SIZE,
zlib is used to do the compression. If the algorithm is selected, IAA
will be used for decompression. If the compressed stream contains a
reference whose distance is greater than 4KB, hardware decompression
will fail, and the decompression will be done with zlib.

Intel IAA dynamic compression results in a compression ratio that is
better than or equal to the currently supported "fixed" compression mode
on the same data set. Compressing a data set of 4300 4KB pages sampled
from SPEC CPU17 workloads produces a compression ratio of 3.14 for IAA
dynamic compression and 2.69 for IAA fixed compression.

If an appropriate IAA exists, dynamic mode can be chosen as the IAA
compression mode by selecting the corresponding algorithm.
For example, to use IAA dynamic mode in zswap:

  echo deflate-iaa-dynamic > /sys/module/zswap/parameters/compressor

This patch also adds a deflate_generic_compress() fallback when dynamic
mode is selected and the input size is over 4KB, along with stats support
that will count these software fallback calls as "total_sw_comp_calls"
in the driver's global_stats.

Furthermore, we define IAA_DYN_ALLOC_DESC_COMP_TIMEOUT as 2000 for
dynamic mode compression on Granite Rapids.

Signed-off-by: Andre Glover
Signed-off-by: Kanchana P Sridhar
---
 .../driver-api/crypto/iaa/iaa-crypto.rst      | 21 ++++
 crypto/testmgr.c                              | 10 ++
 crypto/testmgr.h                              | 74 ++++++++++++++
 drivers/crypto/intel/iaa/Makefile             |  2 +-
 drivers/crypto/intel/iaa/iaa_crypto.h         |  5 +
 .../intel/iaa/iaa_crypto_comp_dynamic.c       | 22 +++++
 drivers/crypto/intel/iaa/iaa_crypto_main.c    | 98 +++++++++++++++++--
 drivers/crypto/intel/iaa/iaa_crypto_stats.c   |  8 ++
 drivers/crypto/intel/iaa/iaa_crypto_stats.h   |  2 +
 include/linux/iaa_comp.h                      |  5 +-
 10 files changed, 236 insertions(+), 11 deletions(-)
 create mode 100644 drivers/crypto/intel/iaa/iaa_crypto_comp_dynamic.c

diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documenta=
tion/driver-api/crypto/iaa/iaa-crypto.rst
index 81a7dbd15f8b..e841a33564db 100644
--- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
+++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
@@ -33,6 +33,8 @@ compresses and decompresses.
 Currently, there is only one compression mode available, 'fixed' mode.
=20
+'dynamic' mode is available on certain generations of IAA hardware.
+
 The 'fixed' compression mode implements the compression scheme specified
 by RFC 1951 and is given the crypto algorithm name 'deflate-iaa'. (Because
 the IAA hardware has a 4k history-window
@@ -43,6 +45,25 @@ the IAA fixed mode deflate algorithm is given its own al=
gorithm name
 rather than simply 'deflate').
=20
=20
+The 'dynamic' compression mode implements a compression scheme where
+the IAA hardware will internally do one pass through the data, compute the
+Huffman tables and generate a Deflate header, then automatically do a
+second pass through the data, generating the final compressed output. IAA
+dynamic compression can be used if an appropriate IAA is present and the
+input size is not too big. If an appropriate IAA is not present, the
+algorithm will not be available. Otherwise, if the size of the input is too
+big, zlib is used to do the compression. If the algorithm is selected,
+IAA will be used for decompression. If the compressed stream contains a
+reference whose distance is greater than 4KB, hardware decompression will
+fail, and the decompression will be done with zlib. If an appropriate IAA
+exists, 'dynamic' compression is implemented by the
+'deflate-iaa-dynamic' crypto algorithm.
+ +A zswap device can select the IAA 'dynamic' mode represented by +selecting the 'deflate-iaa-dynamic' crypto compression algorithm:: + + # echo deflate-iaa-dynamic> /sys/module/zswap/parameters/compressor + Config options and other setup =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D =20 diff --git a/crypto/testmgr.c b/crypto/testmgr.c index ee33ba21ae2b..d0f271ea1201 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4665,6 +4665,16 @@ static const struct alg_test_desc alg_test_descs[] = =3D { .decomp =3D __VECS(deflate_decomp_tv_template) } } + }, { + .alg =3D "deflate-iaa-dynamic", + .test =3D alg_test_comp, + .fips_allowed =3D 1, + .suite =3D { + .comp =3D { + .comp =3D __VECS(deflate_iaa_dynamic_comp_tv_template), + .decomp =3D __VECS(deflate_iaa_dynamic_decomp_tv_template) + } + } }, { .alg =3D "dh", .test =3D alg_test_kpp, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 32d099ac9e73..42db2399013e 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -34575,6 +34575,80 @@ static const struct comp_testvec deflate_decomp_tv= _template[] =3D { }, }; =20 +static const struct comp_testvec deflate_iaa_dynamic_comp_tv_template[] = =3D { + { + .inlen =3D 70, + .outlen =3D 46, + .input =3D "Join us now and share the software " + "Join us now and share the software ", + .output =3D "\x85\xca\xc1\x09\x00\x20\x08\x05" + "\xd0\x55\xfe\x3c\x6e\x21\x64\xd8" + "\x45\x21\x0d\xd7\xb7\x26\xe8\xf8" + "\xe0\x91\x2f\xc3\x09\x98\x17\xd8" + "\x06\x42\x79\x0b\x52\x05\xe1\x33" + "\xeb\x81\x3e\xe5\xa2\x01", + }, { + .inlen =3D 191, + .outlen =3D 121, + .input =3D "This document describes a compression method based on the DE= FLATE" + "compression algorithm. This document defines the application of " + "the DEFLATE algorithm to the IP Payload Compression Protocol.", + .output =3D "\x5d\x8d\xc1\x0d\xc2\x30\x10\x04" + "\x5b\xd9\x0a\xd2\x03\x82\x20\x21" + "\xf1\xf0\x23\x0d\x5c\xec\x0b\xb6" + "\x64\xfb\x2c\xdf\xf1\xa0\x7b\x12" + "\x3e\x58\x79\xae\x76\x67\x76\x89" + "\x49\x11\xc4\xbf\x0b\x57\x43\x60" + "\xf5\x3d\xad\xac\x20\x78\x29\xad" + "\xb3\x6a\x92\x8a\xc2\x16\x25\x60" + "\x25\xe5\x80\x3d\x5b\x64\xdc\xe6" + "\xfb\xf3\xb2\xcc\xe3\x8c\xf2\x4b" + "\x7a\xb2\x58\x26\xe0\x2c\xde\x52" + "\xdd\xb5\x07\x48\xad\xe5\xe4\xc9" + "\x0e\x42\xb6\xd1\xf5\x17\xc0\xe4" + "\x57\x3c\x1c\x1c\x7d\xb2\x50\xc0" + "\x75\x38\x72\x5d\x4c\xbc\xe4\xe9" + "\x0b", + }, +}; + +static const struct comp_testvec deflate_iaa_dynamic_decomp_tv_template[] = =3D { + { + .inlen =3D 121, + .outlen =3D 191, + .input =3D "\x5d\x8d\xc1\x0d\xc2\x30\x10\x04" + "\x5b\xd9\x0a\xd2\x03\x82\x20\x21" + "\xf1\xf0\x23\x0d\x5c\xec\x0b\xb6" + "\x64\xfb\x2c\xdf\xf1\xa0\x7b\x12" + "\x3e\x58\x79\xae\x76\x67\x76\x89" + "\x49\x11\xc4\xbf\x0b\x57\x43\x60" + "\xf5\x3d\xad\xac\x20\x78\x29\xad" + "\xb3\x6a\x92\x8a\xc2\x16\x25\x60" + "\x25\xe5\x80\x3d\x5b\x64\xdc\xe6" + "\xfb\xf3\xb2\xcc\xe3\x8c\xf2\x4b" + "\x7a\xb2\x58\x26\xe0\x2c\xde\x52" + "\xdd\xb5\x07\x48\xad\xe5\xe4\xc9" + "\x0e\x42\xb6\xd1\xf5\x17\xc0\xe4" + "\x57\x3c\x1c\x1c\x7d\xb2\x50\xc0" + "\x75\x38\x72\x5d\x4c\xbc\xe4\xe9" + "\x0b", + .output =3D "This document describes a compression method based on the D= EFLATE" + "compression algorithm. 
This document defines the application of " + "the DEFLATE algorithm to the IP Payload Compression Protocol.", + }, { + .inlen =3D 46, + .outlen =3D 70, + .input =3D "\x85\xca\xc1\x09\x00\x20\x08\x05" + "\xd0\x55\xfe\x3c\x6e\x21\x64\xd8" + "\x45\x21\x0d\xd7\xb7\x26\xe8\xf8" + "\xe0\x91\x2f\xc3\x09\x98\x17\xd8" + "\x06\x42\x79\x0b\x52\x05\xe1\x33" + "\xeb\x81\x3e\xe5\xa2\x01", + .output =3D "Join us now and share the software " + "Join us now and share the software ", + }, +}; + /* * LZO test vectors (null-terminated strings). */ diff --git a/drivers/crypto/intel/iaa/Makefile b/drivers/crypto/intel/iaa/M= akefile index ebfa1a425f80..96f22cd39924 100644 --- a/drivers/crypto/intel/iaa/Makefile +++ b/drivers/crypto/intel/iaa/Makefile @@ -7,6 +7,6 @@ ccflags-y +=3D -I $(srctree)/drivers/dma/idxd -DDEFAULT_SYM= BOL_NAMESPACE=3D'"CRYPTO_ =20 obj-$(CONFIG_CRYPTO_DEV_IAA_CRYPTO) :=3D iaa_crypto.o =20 -iaa_crypto-y :=3D iaa_crypto_main.o iaa_crypto_comp_fixed.o +iaa_crypto-y :=3D iaa_crypto_main.o iaa_crypto_comp_fixed.o iaa_crypto_com= p_dynamic.o =20 iaa_crypto-$(CONFIG_CRYPTO_DEV_IAA_CRYPTO_STATS) +=3D iaa_crypto_stats.o diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/i= aa/iaa_crypto.h index db1e50574662..5a8dec3be9fa 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto.h +++ b/drivers/crypto/intel/iaa/iaa_crypto.h @@ -19,12 +19,15 @@ =20 #define IAA_COMP_FLUSH_OUTPUT BIT(1) #define IAA_COMP_APPEND_EOB BIT(2) +#define IAA_COMP_GEN_HDR_1_PASS (BIT(12) | BIT(13)) =20 #define IAA_COMPLETION_TIMEOUT 1000000 =20 #define IAA_ALLOC_DESC_COMP_TIMEOUT 1000 #define IAA_ALLOC_DESC_DECOMP_TIMEOUT 500 =20 +#define IAA_DYN_ALLOC_DESC_COMP_TIMEOUT 2000 + #define IAA_ANALYTICS_ERROR 0x0a #define IAA_ERROR_DECOMP_BUF_OVERFLOW 0x0b #define IAA_ERROR_COMP_BUF_OVERFLOW 0x19 @@ -134,6 +137,8 @@ struct aecs_comp_table_record { =20 int iaa_aecs_init_fixed(void); void iaa_aecs_cleanup_fixed(void); +int iaa_aecs_init_dynamic(void); +void iaa_aecs_cleanup_dynamic(void); =20 typedef int (*iaa_dev_comp_init_fn_t) (struct iaa_device_compression_mode = *mode); typedef int (*iaa_dev_comp_free_fn_t) (struct iaa_device_compression_mode = *mode); diff --git a/drivers/crypto/intel/iaa/iaa_crypto_comp_dynamic.c b/drivers/c= rypto/intel/iaa/iaa_crypto_comp_dynamic.c new file mode 100644 index 000000000000..3a93d7913443 --- /dev/null +++ b/drivers/crypto/intel/iaa/iaa_crypto_comp_dynamic.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Intel Corporation. All rights rsvd. 
*/ + +#include "idxd.h" +#include "iaa_crypto.h" + +int iaa_aecs_init_dynamic(void) +{ + int ret; + + ret =3D add_iaa_compression_mode("dynamic", NULL, 0, NULL, 0, NULL, NULL); + + if (!ret) + pr_debug("IAA dynamic compression mode initialized\n"); + + return ret; +} + +void iaa_aecs_cleanup_dynamic(void) +{ + remove_iaa_compression_mode("dynamic"); +} diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/in= tel/iaa/iaa_crypto_main.c index 0669ae155e90..cbe3a2457253 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -102,10 +102,12 @@ DEFINE_MUTEX(first_wq_found_lock); =20 const char *iaa_compression_mode_names[IAA_COMP_MODES_MAX] =3D { "fixed", + "dynamic", }; =20 const char *iaa_compression_alg_names[IAA_COMP_MODES_MAX] =3D { "deflate-iaa", + "deflate-iaa-dynamic", }; =20 static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_M= AX]; @@ -1482,6 +1484,23 @@ static int deflate_generic_decompress(struct iaa_req= *req) return ret; } =20 +static int deflate_generic_compress(struct iaa_req *req) +{ + ACOMP_REQUEST_ON_STACK(fbreq, deflate_crypto_acomp); + int ret; + + acomp_request_set_callback(fbreq, 0, NULL, NULL); + acomp_request_set_params(fbreq, req->src, req->dst, req->slen, + PAGE_SIZE); + + ret =3D crypto_acomp_compress(fbreq); + req->dlen =3D fbreq->dlen; + + update_total_sw_comp_calls(); + + return ret; +} + static __always_inline void acomp_to_iaa(struct acomp_req *areq, struct iaa_req *req, struct iaa_compression_ctx *ctx) @@ -1808,9 +1827,13 @@ iaa_setup_compress_hw_desc(struct idxd_desc *idxd_de= sc, desc->src1_size =3D slen; desc->dst_addr =3D (u64)dst_addr; desc->max_dst_size =3D dlen; - desc->flags |=3D IDXD_OP_FLAG_RD_SRC2_AECS; - desc->src2_addr =3D active_compression_mode->aecs_comp_table_dma_addr; - desc->src2_size =3D sizeof(struct aecs_comp_table_record); + if (mode =3D=3D IAA_MODE_DYNAMIC) { + desc->compr_flags |=3D IAA_COMP_GEN_HDR_1_PASS; + } else { + desc->flags |=3D IDXD_OP_FLAG_RD_SRC2_AECS; + desc->src2_addr =3D active_compression_mode->aecs_comp_table_dma_addr; + desc->src2_size =3D sizeof(struct aecs_comp_table_record); + } desc->completion_addr =3D idxd_desc->compl_dma; =20 return desc; @@ -2064,6 +2087,9 @@ static int iaa_comp_acompress(struct iaa_compression_= ctx *ctx, struct iaa_req *r return -EINVAL; } =20 + if (ctx->mode =3D=3D IAA_MODE_DYNAMIC && req->slen > PAGE_SIZE) + return deflate_generic_compress(req); + cpu =3D get_cpu(); wq =3D comp_wq_table_next_wq(cpu); put_cpu(); @@ -2546,7 +2572,9 @@ static int iaa_comp_adecompress_batch( static void compression_ctx_init(struct iaa_compression_ctx *ctx, enum iaa= _mode mode) { ctx->mode =3D mode; - ctx->alloc_comp_desc_timeout =3D IAA_ALLOC_DESC_COMP_TIMEOUT; + ctx->alloc_comp_desc_timeout =3D (mode =3D=3D IAA_MODE_DYNAMIC ? 
+ IAA_DYN_ALLOC_DESC_COMP_TIMEOUT : + IAA_ALLOC_DESC_COMP_TIMEOUT); ctx->alloc_decomp_desc_timeout =3D IAA_ALLOC_DESC_DECOMP_TIMEOUT; ctx->verify_compress =3D iaa_verify_compress; ctx->async_mode =3D async_mode; @@ -2760,6 +2788,30 @@ static struct acomp_alg iaa_acomp_fixed_deflate =3D { } }; =20 +static int iaa_comp_init_dynamic(struct crypto_acomp *acomp_tfm) +{ + struct crypto_tfm *tfm =3D crypto_acomp_tfm(acomp_tfm); + struct iaa_compression_ctx *ctx =3D crypto_tfm_ctx(tfm); + + ctx =3D iaa_ctx[IAA_MODE_DYNAMIC]; + + return 0; +} + +static struct acomp_alg iaa_acomp_dynamic_deflate =3D { + .init =3D iaa_comp_init_dynamic, + .compress =3D iaa_comp_acompress_main, + .decompress =3D iaa_comp_adecompress_main, + .base =3D { + .cra_name =3D "deflate", + .cra_driver_name =3D "deflate-iaa-dynamic", + .cra_flags =3D CRYPTO_ALG_ASYNC, + .cra_ctxsize =3D sizeof(struct iaa_compression_ctx), + .cra_module =3D THIS_MODULE, + .cra_priority =3D IAA_ALG_PRIORITY + 1, + } +}; + /******************************************* * Implement idxd_device_driver interfaces. *******************************************/ @@ -2779,7 +2831,7 @@ static void iaa_unregister_compression_device(void) num_iaa_modes_registered =3D 0; } =20 -static int iaa_register_compression_device(void) +static int iaa_register_compression_device(struct idxd_device *idxd) { struct iaa_compression_mode *mode; int i, idx; @@ -2788,6 +2840,13 @@ static int iaa_register_compression_device(void) iaa_mode_registered[i] =3D false; mode =3D find_iaa_compression_mode(iaa_compression_mode_names[i], &idx); if (mode) { + /* Header Generation Capability is required for the dynamic algorithm. = */ + if ((!strcmp(mode->name, "dynamic")) && !idxd->hw.iaa_cap.header_gen) { + if (num_iaa_modes_registered > 0) + --num_iaa_modes_registered; + continue; + } + iaa_ctx[i] =3D kmalloc(sizeof(struct iaa_compression_ctx), GFP_KERNEL); if (!iaa_ctx[i]) goto err; @@ -2805,7 +2864,7 @@ static int iaa_register_compression_device(void) return -ENODEV; } =20 -static int iaa_register_acomp_compression_device(void) +static int iaa_register_acomp_compression_device(struct idxd_device *idxd) { int ret =3D -ENOMEM; =20 @@ -2819,8 +2878,19 @@ static int iaa_register_acomp_compression_device(voi= d) goto err_fixed; } =20 + if (iaa_mode_registered[IAA_MODE_DYNAMIC]) { + ret =3D crypto_register_acomp(&iaa_acomp_dynamic_deflate); + if (ret) { + pr_err("deflate algorithm acomp dynamic registration failed (%d)\n", re= t); + goto err_dynamic; + } + } + return 0; =20 +err_dynamic: + crypto_unregister_acomp(&iaa_acomp_fixed_deflate); + err_fixed: if (!IS_ERR_OR_NULL(deflate_crypto_acomp)) { crypto_free_acomp(deflate_crypto_acomp); @@ -2839,6 +2909,9 @@ static void iaa_unregister_acomp_compression_device(v= oid) if (iaa_mode_registered[IAA_MODE_FIXED]) crypto_unregister_acomp(&iaa_acomp_fixed_deflate); =20 + if (iaa_mode_registered[IAA_MODE_DYNAMIC]) + crypto_unregister_acomp(&iaa_acomp_dynamic_deflate); + if (!IS_ERR_OR_NULL(deflate_crypto_acomp)) { crypto_free_acomp(deflate_crypto_acomp); deflate_crypto_acomp =3D NULL; @@ -2906,13 +2979,13 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_d= ev) atomic_set(&iaa_crypto_enabled, 1); =20 if (first_wq) { - ret =3D iaa_register_compression_device(); + ret =3D iaa_register_compression_device(idxd); if (ret !=3D 0) { dev_dbg(dev, "IAA compression device registration failed\n"); goto err_register; } =20 - ret =3D iaa_register_acomp_compression_device(); + ret =3D iaa_register_acomp_compression_device(idxd); if (ret !=3D 0) { dev_dbg(dev, 
"IAA compression device acomp registration failed\n"); goto err_register; @@ -3063,6 +3136,12 @@ static int __init iaa_crypto_init_module(void) goto err_aecs_init; } =20 + ret =3D iaa_aecs_init_dynamic(); + if (ret < 0) { + pr_debug("IAA dynamic compression mode init failed\n"); + goto err_dynamic; + } + ret =3D idxd_driver_register(&iaa_crypto_driver); if (ret) { pr_debug("IAA wq sub-driver registration failed\n"); @@ -3164,6 +3243,8 @@ static int __init iaa_crypto_init_module(void) err_g_comp_wqs_per_iaa_attr_create: idxd_driver_unregister(&iaa_crypto_driver); err_driver_reg: + iaa_aecs_cleanup_dynamic(); +err_dynamic: iaa_aecs_cleanup_fixed(); err_aecs_init: =20 @@ -3188,6 +3269,7 @@ static void __exit iaa_crypto_cleanup_module(void) driver_remove_file(&iaa_crypto_driver.drv, &driver_attr_g_comp_wqs_per_iaa); idxd_driver_unregister(&iaa_crypto_driver); + iaa_aecs_cleanup_dynamic(); iaa_aecs_cleanup_fixed(); =20 pr_debug("cleaned up\n"); diff --git a/drivers/crypto/intel/iaa/iaa_crypto_stats.c b/drivers/crypto/i= ntel/iaa/iaa_crypto_stats.c index f5cc3d29ca19..42aae8a738ac 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_stats.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_stats.c @@ -19,6 +19,7 @@ =20 static atomic64_t total_comp_calls; static atomic64_t total_decomp_calls; +static atomic64_t total_sw_comp_calls; static atomic64_t total_sw_decomp_calls; static atomic64_t total_comp_bytes_out; static atomic64_t total_decomp_bytes_in; @@ -43,6 +44,11 @@ void update_total_decomp_calls(void) atomic64_inc(&total_decomp_calls); } =20 +void update_total_sw_comp_calls(void) +{ + atomic64_inc(&total_sw_comp_calls); +} + void update_total_sw_decomp_calls(void) { atomic64_inc(&total_sw_decomp_calls); @@ -174,6 +180,8 @@ static int global_stats_show(struct seq_file *m, void *= v) atomic64_read(&total_comp_calls)); seq_printf(m, " total_decomp_calls: %llu\n", atomic64_read(&total_decomp_calls)); + seq_printf(m, " total_sw_comp_calls: %llu\n", + atomic64_read(&total_sw_comp_calls)); seq_printf(m, " total_sw_decomp_calls: %llu\n", atomic64_read(&total_sw_decomp_calls)); seq_printf(m, " total_comp_bytes_out: %llu\n", diff --git a/drivers/crypto/intel/iaa/iaa_crypto_stats.h b/drivers/crypto/i= ntel/iaa/iaa_crypto_stats.h index 3787a5f507eb..6e0c6f9939bf 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_stats.h +++ b/drivers/crypto/intel/iaa/iaa_crypto_stats.h @@ -11,6 +11,7 @@ void iaa_crypto_debugfs_cleanup(void); void update_total_comp_calls(void); void update_total_comp_bytes_out(int n); void update_total_decomp_calls(void); +void update_total_sw_comp_calls(void); void update_total_sw_decomp_calls(void); void update_total_decomp_bytes_in(int n); void update_completion_einval_errs(void); @@ -29,6 +30,7 @@ static inline void iaa_crypto_debugfs_cleanup(void) {} static inline void update_total_comp_calls(void) {} static inline void update_total_comp_bytes_out(int n) {} static inline void update_total_decomp_calls(void) {} +static inline void update_total_sw_comp_calls(void) {} static inline void update_total_sw_decomp_calls(void) {} static inline void update_total_decomp_bytes_in(int n) {} static inline void update_completion_einval_errs(void) {} diff --git a/include/linux/iaa_comp.h b/include/linux/iaa_comp.h index 7b765760485c..ec28a9d9f2b4 100644 --- a/include/linux/iaa_comp.h +++ b/include/linux/iaa_comp.h @@ -12,7 +12,8 @@ =20 enum iaa_mode { IAA_MODE_FIXED =3D 0, - IAA_MODE_NONE =3D 1, + IAA_MODE_DYNAMIC =3D 1, + IAA_MODE_NONE =3D 2, }; =20 struct iaa_req { @@ -56,7 +57,7 @@ extern int 
iaa_comp_decompress_batch(
 #else /* CONFIG_CRYPTO_DEV_IAA_CRYPTO */
=20
 enum iaa_mode {
-	IAA_MODE_NONE =3D 1,
+	IAA_MODE_NONE =3D 2,
 };
=20
 struct iaa_req {};
--=20
2.27.0

From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org, hannes@cmpxchg.org,
 yosry.ahmed@linux.dev, nphamcs@gmail.com, chengming.zhou@linux.dev,
 usamaarif642@gmail.com, ryan.roberts@arm.com, 21cnbao@gmail.com,
 ying.huang@linux.alibaba.com, akpm@linux-foundation.org,
 senozhatsky@chromium.org, sj@kernel.org, kasong@tencent.com,
 linux-crypto@vger.kernel.org, herbert@gondor.apana.org.au,
 davem@davemloft.net, clabbe@baylibre.com, ardb@kernel.org,
 ebiggers@google.com, surenb@google.com, kristen.c.accardi@intel.com,
 vinicius.gomes@intel.com
Cc: wajdi.k.feghali@intel.com, vinodh.gopal@intel.com,
 kanchana.p.sridhar@intel.com
Subject: [PATCH v12 19/23] crypto: acomp - Add crypto_acomp_batch_size() to get an algorithm's batch-size.
Date: Thu, 25 Sep 2025 20:34:58 -0700
Message-Id: <20250926033502.7486-20-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

This commit adds a @batch_size data member to:

 struct acomp_alg
 struct crypto_acomp

A crypto_acomp compression algorithm that supports batching of
compressions and decompressions must provide a @batch_size greater than
one, representing the maximum batch-size that the compressor supports.
This allows kernel users of crypto_acomp, such as zswap, to allocate
resources for submitting multiple compress/decompress jobs that can be
batched, and to invoke batching of [de]compressions.

A new helper function acomp_has_async_batching() can be invoked to query
if a crypto_acomp has defined a @batch_size. The new
crypto_acomp_batch_size() API uses this helper function to return the
batch-size for compressors that have registered a @batch_size. If the
algorithm does not define a @batch_size, a default of "1" is returned.

zswap can invoke crypto_acomp_batch_size() to query the maximum number
of requests that can be batch [de]compressed. Based on this, zswap can
use the minimum of any zswap-specific upper limits for batch-size and
the compressor's max @batch_size to allocate batching resources.

The IAA acomp_algs Fixed ("deflate-iaa") and Dynamic
("deflate-iaa-dynamic") register @batch_size as
IAA_CRYPTO_MAX_BATCH_SIZE. This enables zswap to compress/decompress
pages in parallel in the IAA hardware accelerator to improve
swapout/swapin performance and memory savings.
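As an illustration of the intended calling pattern (the pool-init
function, its name, and the SWAP_CRYPTO_BATCH_SIZE limit below are
hypothetical, not part of this patch), a swap module might size its
per-CPU batching resources like this:

	/*
	 * Hypothetical caller-side sketch: cap the compressor's advertised
	 * batch size with a module-local limit before allocating requests.
	 */
	#include <crypto/acompress.h>
	#include <linux/err.h>
	#include <linux/printk.h>

	#define SWAP_CRYPTO_BATCH_SIZE	8U	/* hypothetical module limit */

	static int swap_batch_pool_init(const char *alg_name)
	{
		struct crypto_acomp *tfm;
		unsigned int nr_reqs;

		tfm = crypto_alloc_acomp(alg_name, 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		/* 1 for purely serial compressors, > 1 for batching ones. */
		nr_reqs = min(SWAP_CRYPTO_BATCH_SIZE, crypto_acomp_batch_size(tfm));
		pr_info("allocating %u batch requests per CPU\n", nr_reqs);

		/* ... allocate nr_reqs requests/buffers per CPU here ... */

		crypto_free_acomp(tfm);
		return 0;
	}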
Signed-off-by: Kanchana P Sridhar
---
 crypto/acompress.c                         |  1 +
 drivers/crypto/intel/iaa/iaa_crypto_main.c |  2 ++
 include/crypto/acompress.h                 | 27 ++++++++++++++++++++++
 include/crypto/internal/acompress.h        |  3 +++
 4 files changed, 33 insertions(+)

diff --git a/crypto/acompress.c b/crypto/acompress.c
index be28cbfd22e3..a1bdfa21e688 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -105,6 +105,7 @@ static int crypto_acomp_init_tfm(struct crypto_tfm *tfm)
 
 	acomp->compress = alg->compress;
 	acomp->decompress = alg->decompress;
+	acomp->batch_size = alg->batch_size;
 	acomp->reqsize = alg->base.cra_reqsize;
 
 	acomp->base.exit = crypto_acomp_exit_tfm;
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index cbe3a2457253..2fa38176034d 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -2777,6 +2777,7 @@ static struct acomp_alg iaa_acomp_fixed_deflate = {
 	.init			= iaa_comp_init_fixed,
 	.compress		= iaa_comp_acompress_main,
 	.decompress		= iaa_comp_adecompress_main,
+	.batch_size		= IAA_CRYPTO_MAX_BATCH_SIZE,
 	.base			= {
 		.cra_name		= "deflate",
 		.cra_driver_name	= "deflate-iaa",
@@ -2802,6 +2803,7 @@ static struct acomp_alg iaa_acomp_dynamic_deflate = {
 	.init			= iaa_comp_init_dynamic,
 	.compress		= iaa_comp_acompress_main,
 	.decompress		= iaa_comp_adecompress_main,
+	.batch_size		= IAA_CRYPTO_MAX_BATCH_SIZE,
 	.base			= {
 		.cra_name		= "deflate",
 		.cra_driver_name	= "deflate-iaa-dynamic",
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 0f1334168f1b..e94046529e46 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -108,6 +108,8 @@ struct acomp_req {
  *
  * @compress:	Function performs a compress operation
  * @decompress:	Function performs a de-compress operation
+ * @batch_size: Maximum batch-size for batching compress/decompress
+ *              operations.
  * @reqsize:	Context size for (de)compression requests
  * @fb:		Synchronous fallback tfm
  * @base:	Common crypto API algorithm data structure
@@ -115,6 +117,7 @@ struct acomp_req {
 struct crypto_acomp {
 	int (*compress)(struct acomp_req *req);
 	int (*decompress)(struct acomp_req *req);
+	unsigned int batch_size;
 	unsigned int reqsize;
 	struct crypto_tfm base;
 };
@@ -205,6 +208,13 @@ static inline bool acomp_is_async(struct crypto_acomp *tfm)
 		CRYPTO_ALG_ASYNC;
 }
 
+static inline bool acomp_has_async_batching(struct crypto_acomp *tfm)
+{
+	return (acomp_is_async(tfm) &&
+		(crypto_comp_alg_common(tfm)->base.cra_flags & CRYPTO_ALG_TYPE_ACOMPRESS) &&
+		(tfm->batch_size > 1));
+}
+
 static inline struct crypto_acomp *crypto_acomp_reqtfm(struct acomp_req *req)
 {
 	return __crypto_acomp_tfm(req->base.tfm);
@@ -578,6 +588,23 @@ int crypto_acomp_compress(struct acomp_req *req);
  */
 int crypto_acomp_decompress(struct acomp_req *req);
 
+/**
+ * crypto_acomp_batch_size() -- Get the algorithm's batch size
+ *
+ * Function returns the algorithm's batch size for batching operations
+ *
+ * @tfm: ACOMPRESS tfm handle allocated with crypto_alloc_acomp()
+ *
+ * Return: crypto_acomp's batch size.
+ */
+static inline unsigned int crypto_acomp_batch_size(struct crypto_acomp *tfm)
+{
+	if (acomp_has_async_batching(tfm))
+		return tfm->batch_size;
+
+	return 1;
+}
+
 static inline struct acomp_req *acomp_request_on_stack_init(
 	char *buf, struct crypto_acomp *tfm)
 {
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index 2d97440028ff..e451e0ae3b9b 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -28,6 +28,8 @@
  *
  * @compress:	Function performs a compress operation
  * @decompress:	Function performs a de-compress operation
+ * @batch_size: Maximum batch-size for batching compress/decompress
+ *              operations.
  * @init:	Initialize the cryptographic transformation object.
  *		This function is used to initialize the cryptographic
  *		transformation object. This function is called only once at
@@ -46,6 +48,7 @@
 struct acomp_alg {
 	int (*compress)(struct acomp_req *req);
 	int (*decompress)(struct acomp_req *req);
+	unsigned int batch_size;
 	int (*init)(struct crypto_acomp *tfm);
 	void (*exit)(struct crypto_acomp *tfm);
 
-- 
2.27.0

From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Subject: [PATCH v12 20/23] mm: zswap: Per-CPU acomp_ctx resources exist from pool creation to deletion.
Date: Thu, 25 Sep 2025 20:34:59 -0700
Message-Id: <20250926033502.7486-21-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

This patch simplifies the zswap_pool's per-CPU acomp_ctx resource management. Similar to the per-CPU acomp_ctx itself, the per-CPU acomp_ctx's resources (acomp, req, buffer) now also live from pool creation to pool deletion. These resources persist through CPU hotplug operations.

The zswap_cpu_comp_dead() teardown callback has been deleted from the call to cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE). As a result, CPU offline hotplug operations will be no-ops as far as the acomp_ctx resources are concerned.

This commit refactors the code from zswap_cpu_comp_dead() into a new function, acomp_ctx_dealloc(), that preserves the IS_ERR_OR_NULL() checks on acomp_ctx, req and acomp from the existing mainline implementation of zswap_cpu_comp_dead(). acomp_ctx_dealloc() is called to clean up acomp_ctx resources from all of these procedures:

 1) zswap_cpu_comp_prepare(), when an error is encountered,
 2) zswap_pool_create(), when an error is encountered, and
 3) zswap_pool_destroy().

The main benefit of using the CPU hotplug multi state instance startup callback to allocate the acomp_ctx resources is that it prevents the cores from being offlined until the multi state instance addition call returns.

From Documentation/core-api/cpu_hotplug.rst: "The node list add/remove operations and the callback invocations are serialized against CPU hotplug operations."

Furthermore, zswap_[de]compress() cannot contend with zswap_cpu_comp_prepare() because:

 - During pool creation/deletion, the pool is not in the zswap_pools list.
 - During CPU hot[un]plug, the CPU is not yet online, as Yosry pointed out. zswap_cpu_comp_prepare() will be executed on a control CPU, since CPUHP_MM_ZSWP_POOL_PREPARE is in the PREPARE section of "enum cpuhp_state". Thanks Yosry for sharing this observation!

In both these cases, any recursions into zswap reclaim from zswap_cpu_comp_prepare() will be handled by the old pool.

The above two observations enable the following simplifications:

1) zswap_cpu_comp_prepare(): the CPU cannot be offlined, and reclaim cannot use the pool.

   Considerations for mutex init/locking and handling subsequent CPU hotplug online-offlines: should we lock the mutex of the current CPU's acomp_ctx from start to end? This does not appear to be required. CPU hotplug operations acquire "cpuhp_state_mutex" before proceeding; hence zswap_cpu_comp_prepare() is serialized against CPU hotplug operations.

   If the process gets migrated while zswap_cpu_comp_prepare() is running, it will complete on the new CPU. In case of failures, we pass the acomp_ctx pointer obtained at the start of zswap_cpu_comp_prepare() to acomp_ctx_dealloc(), which, again, can only undergo migration.

   There appear to be no contention scenarios that might cause inconsistent values of the acomp_ctx's members. Hence, there seems to be no need for mutex_lock(&acomp_ctx->mutex) in zswap_cpu_comp_prepare().

   Since the pool is not yet on the zswap_pools list, we don't need to initialize the per-CPU acomp_ctx mutex in zswap_pool_create(). This has been restored to occur in zswap_cpu_comp_prepare().

   zswap_cpu_comp_prepare() checks upfront if acomp_ctx->acomp is valid. If so, it returns success. This should handle any CPU hotplug online-offline transitions after pool creation is done.

2) CPU offline vis-a-vis zswap ops: suppose the process is migrated to another CPU before the current CPU is dysfunctional. If zswap_[de]compress() holds the acomp_ctx->mutex lock of the offlined CPU, that mutex will be released once it completes on the new CPU. Since there is no teardown callback, there is no possibility of UAF.

3) Pool creation/deletion and process migration to another CPU:

   - During pool creation/deletion, the pool is not in the zswap_pools list. Hence it cannot contend with zswap ops on that CPU. However, the process can get migrated.

   Pool creation --> zswap_cpu_comp_prepare() --> process migrated:
     * CPU offline: no-op.
     * zswap_cpu_comp_prepare() continues to run on the new CPU to finish allocating acomp_ctx resources for the offlined CPU.

   Pool deletion --> acomp_ctx_dealloc() --> process migrated:
     * CPU offline: no-op.
     * acomp_ctx_dealloc() continues to run on the new CPU to finish de-allocating acomp_ctx resources for the offlined CPU.

4) Pool deletion vis-a-vis CPU onlining: to prevent race conditions between acomp_ctx_dealloc() freeing the acomp_ctx resources and the initial check for a valid acomp_ctx->acomp in zswap_cpu_comp_prepare(), we need to delete the multi state instance right after it is added, in zswap_pool_create().

Summary of changes based on the above:
--------------------------------------

1) Zero-initialization of pool->acomp_ctx in zswap_pool_create() to simplify and share common code for different error handling/cleanup related to the acomp_ctx.

2) Remove the node list instance right after the node list add function call in zswap_pool_create(). This prevents race conditions between CPU onlining after initial pool creation, and acomp_ctx_dealloc() freeing the acomp_ctx resources.

3) zswap_pool_destroy() will call acomp_ctx_dealloc() to de-allocate the per-CPU acomp_ctx resources.

4) Changes to zswap_cpu_comp_prepare():
   a) Check if acomp_ctx->acomp is valid at the beginning and, if so, return, because the acomp_ctx is already initialized.
   b) Move the mutex_init to happen in this procedure, before it returns.
   c) All error conditions are handled by calling acomp_ctx_dealloc().

5) New procedure acomp_ctx_dealloc() for common error/cleanup code.

6) No more multi state instance teardown callback. CPU offlining is a no-op as far as acomp_ctx resources are concerned.

7) Delete acomp_ctx_get_cpu_lock()/acomp_ctx_put_unlock(). Directly call mutex_lock(&acomp_ctx->mutex)/mutex_unlock(&acomp_ctx->mutex) in zswap_[de]compress().

The per-CPU memory cost of not deleting the acomp_ctx resources upon CPU offlining, and only deleting them when the pool is destroyed, is as follows, on x86_64:

 IAA with 8 dst buffers for batching:     64.34 KB
 Software compressors with 1 dst buffer:   8.28 KB

Signed-off-by: Kanchana P Sridhar
---
 mm/zswap.c | 194 +++++++++++++++++++++++++----------------------
 1 file changed, 93 insertions(+), 101 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index c1af782e54ec..27665eaa3f89 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -242,6 +242,30 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
 **********************************/
 static void __zswap_pool_empty(struct percpu_ref *ref);
 
+/*
+ * The per-cpu pool->acomp_ctx is zero-initialized on allocation. This makes
+ * it easy for different error conditions/cleanup related to the acomp_ctx
+ * to be handled by acomp_ctx_dealloc():
+ * - Errors during zswap_cpu_comp_prepare().
+ * - Partial success/error of cpuhp_state_add_instance() call in
+ *   zswap_pool_create(). Only some cores could have executed
+ *   zswap_cpu_comp_prepare(), not others.
+ * - Cleanup acomp_ctx resources on all cores in zswap_pool_destroy().
+ */
+static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
+{
+	if (IS_ERR_OR_NULL(acomp_ctx))
+		return;
+
+	if (!IS_ERR_OR_NULL(acomp_ctx->req))
+		acomp_request_free(acomp_ctx->req);
+
+	if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+		crypto_free_acomp(acomp_ctx->acomp);
+
+	kfree(acomp_ctx->buffer);
+}
+
 static struct zswap_pool *zswap_pool_create(char *compressor)
 {
 	struct zswap_pool *pool;
@@ -263,19 +287,43 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
 
 	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
 
-	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
+	/* Many things rely on the zero-initialization. */
+	pool->acomp_ctx = alloc_percpu_gfp(*pool->acomp_ctx,
+					   GFP_KERNEL | __GFP_ZERO);
 	if (!pool->acomp_ctx) {
 		pr_err("percpu alloc failed\n");
 		goto error;
 	}
 
-	for_each_possible_cpu(cpu)
-		mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
-
+	/*
+	 * This is serialized against CPU hotplug operations. Hence, cores
+	 * cannot be offlined until this finishes.
+	 * In case of errors, we need to goto "ref_fail" instead of "error"
+	 * because there is no teardown callback registered anymore, for
+	 * cpuhp_state_add_instance() to de-allocate resources as it rolls back
+	 * state on cores before the CPU on which error was encountered.
+	 */
 	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+
+	/*
+	 * We only needed the multi state instance add operation to invoke the
+	 * startup callback for all cores without cores getting offlined. Since
+	 * the acomp_ctx resources will now only be de-allocated when the pool
+	 * is destroyed, we can safely remove the multi state instance. This
+	 * minimizes (but does not eliminate) the possibility of
+	 * zswap_cpu_comp_prepare() being invoked again due to a CPU
+	 * offline-online transition. Removing the instance also prevents race
+	 * conditions between CPU onlining after initial pool creation, and
+	 * acomp_ctx_dealloc() freeing the acomp_ctx resources.
+	 * Note that we delete the instance before checking the error status of
+	 * the node list add operation because we want the instance removal even
+	 * in case of errors in the former.
+	 */
+	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+
 	if (ret)
-		goto error;
+		goto ref_fail;
 
 	/* being the current pool takes 1 ref; this func expects the
 	 * caller to always add the new pool as the current pool
@@ -291,7 +339,8 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
 	return pool;
 
 ref_fail:
-	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+	for_each_possible_cpu(cpu)
+		acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
 error:
 	if (pool->acomp_ctx)
 		free_percpu(pool->acomp_ctx);
@@ -322,9 +371,13 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
 
 static void zswap_pool_destroy(struct zswap_pool *pool)
 {
+	int cpu;
+
 	zswap_pool_debug("destroying", pool);
 
-	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+	for_each_possible_cpu(cpu)
+		acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
+
 	free_percpu(pool->acomp_ctx);
 
 	zs_destroy_pool(pool->zs_pool);
@@ -736,39 +789,39 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 {
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
-	struct crypto_acomp *acomp = NULL;
-	struct acomp_req *req = NULL;
-	u8 *buffer = NULL;
-	int ret;
+	int ret = -ENOMEM;
 
-	buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
-	if (!buffer) {
-		ret = -ENOMEM;
-		goto fail;
-	}
+	/*
+	 * The per-CPU pool->acomp_ctx is zero-initialized on allocation.
+	 * Even though we delete the multi state instance right after successful
+	 * addition of the instance in zswap_pool_create(), we cannot eliminate
+	 * the possibility of the CPU going through offline-online transitions.
+	 * If this does happen, we check if the acomp_ctx has already been
+	 * initialized, and return.
+	 */
+	if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+		return 0;
 
-	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
-	if (IS_ERR(acomp)) {
+	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
+	if (!acomp_ctx->buffer)
+		return ret;
+
+	acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+	if (IS_ERR(acomp_ctx->acomp)) {
 		pr_err("could not alloc crypto acomp %s : %ld\n",
-		       pool->tfm_name, PTR_ERR(acomp));
-		ret = PTR_ERR(acomp);
+		       pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
+		ret = PTR_ERR(acomp_ctx->acomp);
 		goto fail;
 	}
+	acomp_ctx->is_sleepable = acomp_is_async(acomp_ctx->acomp);
 
-	req = acomp_request_alloc(acomp);
-	if (!req) {
+	acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
+	if (!acomp_ctx->req) {
 		pr_err("could not alloc crypto acomp_request %s\n",
 		       pool->tfm_name);
-		ret = -ENOMEM;
 		goto fail;
 	}
 
-	/*
-	 * Only hold the mutex after completing allocations, otherwise we may
-	 * recurse into zswap through reclaim and attempt to hold the mutex
-	 * again resulting in a deadlock.
-	 */
-	mutex_lock(&acomp_ctx->mutex);
 	crypto_init_wait(&acomp_ctx->wait);
 
 	/*
@@ -776,81 +829,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
 	 * won't be called, crypto_wait_req() will return without blocking.
 	 */
-	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+	acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   crypto_req_done, &acomp_ctx->wait);
 
-	acomp_ctx->buffer = buffer;
-	acomp_ctx->acomp = acomp;
-	acomp_ctx->is_sleepable = acomp_is_async(acomp);
-	acomp_ctx->req = req;
-	mutex_unlock(&acomp_ctx->mutex);
+	mutex_init(&acomp_ctx->mutex);
 	return 0;
 
 fail:
-	if (acomp)
-		crypto_free_acomp(acomp);
-	kfree(buffer);
+	acomp_ctx_dealloc(acomp_ctx);
 	return ret;
 }
 
-static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
-{
-	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
-	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
-	struct acomp_req *req;
-	struct crypto_acomp *acomp;
-	u8 *buffer;
-
-	if (IS_ERR_OR_NULL(acomp_ctx))
-		return 0;
-
-	mutex_lock(&acomp_ctx->mutex);
-	req = acomp_ctx->req;
-	acomp = acomp_ctx->acomp;
-	buffer = acomp_ctx->buffer;
-	acomp_ctx->req = NULL;
-	acomp_ctx->acomp = NULL;
-	acomp_ctx->buffer = NULL;
-	mutex_unlock(&acomp_ctx->mutex);
-
-	/*
-	 * Do the actual freeing after releasing the mutex to avoid subtle
-	 * locking dependencies causing deadlocks.
-	 */
-	if (!IS_ERR_OR_NULL(req))
-		acomp_request_free(req);
-	if (!IS_ERR_OR_NULL(acomp))
-		crypto_free_acomp(acomp);
-	kfree(buffer);
-
-	return 0;
-}
-
-static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
-{
-	struct crypto_acomp_ctx *acomp_ctx;
-
-	for (;;) {
-		acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
-		mutex_lock(&acomp_ctx->mutex);
-		if (likely(acomp_ctx->req))
-			return acomp_ctx;
-		/*
-		 * It is possible that we were migrated to a different CPU after
-		 * getting the per-CPU ctx but before the mutex was acquired. If
-		 * the old CPU got offlined, zswap_cpu_comp_dead() could have
-		 * already freed ctx->req (among other things) and set it to
-		 * NULL. Just try again on the new CPU that we ended up on.
-		 */
-		mutex_unlock(&acomp_ctx->mutex);
-	}
-}
-
-static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
-{
-	mutex_unlock(&acomp_ctx->mutex);
-}
-
 static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 			   struct zswap_pool *pool)
 {
@@ -863,7 +852,9 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	u8 *dst;
 	bool mapped = false;
 
-	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+	mutex_lock(&acomp_ctx->mutex);
+
 	dst = acomp_ctx->buffer;
 	sg_init_table(&input, 1);
 	sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -927,7 +918,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	else if (alloc_ret)
 		zswap_reject_alloc_fail++;
 
-	acomp_ctx_put_unlock(acomp_ctx);
+	mutex_unlock(&acomp_ctx->mutex);
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
@@ -939,7 +930,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	int decomp_ret = 0, dlen = PAGE_SIZE;
 	u8 *src, *obj;
 
-	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+	mutex_lock(&acomp_ctx->mutex);
 	obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer);
 
 	/* zswap entries of length PAGE_SIZE are not compressed. */
@@ -970,7 +962,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 
 read_done:
 	zs_obj_read_end(pool->zs_pool, entry->handle, obj);
-	acomp_ctx_put_unlock(acomp_ctx);
+	mutex_unlock(&acomp_ctx->mutex);
 
 	if (!decomp_ret && dlen == PAGE_SIZE)
 		return true;
@@ -1796,7 +1788,7 @@ static int zswap_setup(void)
 	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
 				      "mm/zswap_pool:prepare",
 				      zswap_cpu_comp_prepare,
-				      zswap_cpu_comp_dead);
+				      NULL);
 	if (ret)
 		goto hp_fail;
 
-- 
2.27.0

From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Subject: [PATCH v12 21/23] mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx resources.
Date: Thu, 25 Sep 2025 20:35:00 -0700
Message-Id: <20250926033502.7486-22-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

This patch uses IS_ERR_OR_NULL() in zswap_cpu_comp_prepare() to check for a valid acomp/req, thereby making it consistent with acomp_ctx_dealloc(). This is based on this earlier comment [1] from Yosry, when reviewing v8.
[1] https://patchwork.kernel.org/comment/26282128/

Signed-off-by: Kanchana P Sridhar
---
 mm/zswap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 27665eaa3f89..3b3716808d7d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -807,7 +807,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 		return ret;
 
 	acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
-	if (IS_ERR(acomp_ctx->acomp)) {
+	if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
 		pr_err("could not alloc crypto acomp %s : %ld\n",
 		       pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
 		ret = PTR_ERR(acomp_ctx->acomp);
@@ -816,7 +816,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	acomp_ctx->is_sleepable = acomp_is_async(acomp_ctx->acomp);
 
 	acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
-	if (!acomp_ctx->req) {
+	if (IS_ERR_OR_NULL(acomp_ctx->req)) {
 		pr_err("could not alloc crypto acomp_request %s\n",
 		       pool->tfm_name);
 		goto fail;
-- 
2.27.0

From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Subject: [PATCH v12 22/23] mm: zswap: zswap_store() will process a large folio in batches.
Date: Thu, 25 Sep 2025 20:35:01 -0700
Message-Id: <20250926033502.7486-23-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

This patch makes two major changes:

First, we allocate pool batching resources if the compressor supports batching:

This patch sets up zswap to allocate per-CPU resources optimally for non-batching and batching compressors.

A new ZSWAP_MAX_BATCH_SIZE constant is defined as 8U, to set an upper limit on the number of pages in large folios that will be batch compressed.

It is up to the compressor to manage multiple requests, as needed, to accomplish batch parallelism. zswap only needs to allocate the per-CPU dst buffers according to the batch size supported by the compressor.

A "u8 compr_batch_size" member is added to "struct zswap_pool", as per Yosry's suggestion. pool->compr_batch_size is set as the minimum of the compressor's max batch-size and ZSWAP_MAX_BATCH_SIZE. Accordingly, we proceed to allocate the necessary compression dst buffers in the per-CPU acomp_ctx.

Another "u8 store_batch_size" member is added to "struct zswap_pool" to store the unit for batching large folio stores: for batching compressors, this is pool->compr_batch_size; for non-batching compressors, this is ZSWAP_MAX_BATCH_SIZE.

zswap does not use more than one dst buffer yet. Follow-up patches will actually utilize the multiple acomp_ctx buffers for batch compression/decompression of multiple pages.

Thus, ZSWAP_MAX_BATCH_SIZE limits the amount of extra memory used for batching. There is a small extra memory overhead of allocating the acomp_ctx->buffers array for compressors that do not support batching: on x86_64, the overhead is 1 pointer per-CPU (i.e. 8 bytes).

Next, we store the folio in batches:

This patch modifies zswap_store() to store a batch of pages in large folios at a time, instead of storing one page at a time. It does this by calling a new procedure zswap_store_pages() with a range of "pool->store_batch_size" indices in the folio, as illustrated below.
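To make the batching unit concrete, here is illustrative arithmetic (assumed values, not measurements), given ZSWAP_MAX_BATCH_SIZE == 8U:

	/*
	 * zstd (non-batching):  compr_batch_size == 1  =>  store_batch_size == 8
	 * deflate-iaa:          compr_batch_size == 8  =>  store_batch_size == 8
	 *
	 * Either way, a PMD-sized (512-page) folio is stored as 64 calls to
	 * zswap_store_pages(), each covering 8 pages; only the batching
	 * compressor compresses the 8 pages of a call in parallel.
	 */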
zswap_store_pages() implements all the computes done earlier in zswap_store_page() for a single page, for multiple pages in a folio, namely the "batch":

1) It starts by allocating all zswap entries required to store the batch. New procedures, zswap_entries_cache_alloc_batch() and zswap_entries_cache_free_batch(), call kmem_cache_alloc_bulk() and kmem_cache_free_bulk() to optimize the performance of this step.

2) Next, the entries' fields are written; these are computes that need to happen anyway, and they do not modify the zswap xarray/LRU publishing order. This improves latency by avoiding having to bring the entries into the cache for writing in different code blocks within this procedure.

3) Next, it calls zswap_compress() to sequentially compress each page in the batch.

4) Finally, it adds the batch's zswap entries to the xarray and LRU, charges zswap memory and increments zswap stats.

5) The error handling and cleanup required for all failure scenarios that can occur while storing a batch in zswap are consolidated to a single "store_pages_failed" label in zswap_store_pages(). Here again, we optimize performance by calling kmem_cache_free_bulk().

This commit also makes a minor optimization in zswap_compress(): the info on whether or not the page's folio has memcg writeback enabled is passed in via a "bool folio_wb" flag from zswap_store(). The intent is to not re-compute this for every page in a folio. Since zswap_compress() is a static function, I figured this should be safe. A repetition of "dlen = PAGE_SIZE" is deleted.

Signed-off-by: Kanchana P Sridhar
---
 mm/zswap.c | 319 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 224 insertions(+), 95 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 3b3716808d7d..9e0e7887de33 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -82,6 +82,9 @@ static bool zswap_pool_reached_full;
 
 #define ZSWAP_PARAM_UNSET ""
 
+/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
+#define ZSWAP_MAX_BATCH_SIZE 8U
+
 static int zswap_setup(void);
 
 /* Enable/disable zswap */
@@ -139,7 +142,7 @@ struct crypto_acomp_ctx {
 	struct crypto_acomp *acomp;
 	struct acomp_req *req;
 	struct crypto_wait wait;
-	u8 *buffer;
+	u8 **buffers;
 	struct mutex mutex;
 	bool is_sleepable;
 };
@@ -158,6 +161,8 @@ struct zswap_pool {
 	struct work_struct release_work;
 	struct hlist_node node;
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
+	u8 compr_batch_size;
+	u8 store_batch_size;
 };
 
 /* Global LRU lists shared by all zswap pools. */
@@ -252,8 +257,10 @@ static void __zswap_pool_empty(struct percpu_ref *ref);
  *   zswap_cpu_comp_prepare(), not others.
  * - Cleanup acomp_ctx resources on all cores in zswap_pool_destroy().
  */
-static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
+static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
 {
+	u8 i;
+
 	if (IS_ERR_OR_NULL(acomp_ctx))
 		return;
 
@@ -263,7 +270,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
 	if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
 		crypto_free_acomp(acomp_ctx->acomp);
 
-	kfree(acomp_ctx->buffer);
+	if (acomp_ctx->buffers) {
+		for (i = 0; i < nr_buffers; ++i)
+			kfree(acomp_ctx->buffers[i]);
+		kfree(acomp_ctx->buffers);
+	}
 }
 
 static struct zswap_pool *zswap_pool_create(char *compressor)
@@ -275,6 +286,7 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
 	if (!zswap_has_pool && !strcmp(compressor, ZSWAP_PARAM_UNSET))
 		return NULL;
 
+	/* Many things rely on the zero-initialization. */
 	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
 	if (!pool)
 		return NULL;
@@ -334,13 +346,28 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
 		goto ref_fail;
 	INIT_LIST_HEAD(&pool->list);
 
+	/*
+	 * Set the unit of compress batching for large folios, for quick
+	 * retrieval in the zswap_compress() fast path:
+	 * If the compressor is sequential (@pool->compr_batch_size is 1),
+	 * large folios will be compressed in batches of ZSWAP_MAX_BATCH_SIZE
+	 * pages, where each page in the batch is compressed sequentially.
+	 * We see better performance by processing the folio in batches of
+	 * ZSWAP_MAX_BATCH_SIZE, due to cache locality of working set
+	 * structures.
+	 */
+	pool->store_batch_size = (pool->compr_batch_size > 1) ?
+				 pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
+
 	zswap_pool_debug("created", pool);
 
 	return pool;
 
 ref_fail:
 	for_each_possible_cpu(cpu)
-		acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
+		acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
+				  pool->compr_batch_size);
+
 error:
 	if (pool->acomp_ctx)
 		free_percpu(pool->acomp_ctx);
@@ -376,7 +403,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
 	zswap_pool_debug("destroying", pool);
 
 	for_each_possible_cpu(cpu)
-		acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
+		acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
+				  pool->compr_batch_size);
 
 	free_percpu(pool->acomp_ctx);
 
@@ -763,6 +791,24 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
 	kmem_cache_free(zswap_entry_cache, entry);
 }
 
+/*
+ * Returns 0 if kmem_cache_alloc_bulk() failed and a positive number otherwise.
+ * The code for __kmem_cache_alloc_bulk() indicates that this positive number
+ * will be the @size requested, i.e., @nr_entries.
+ */
+static __always_inline int zswap_entries_cache_alloc_batch(void **entries,
+							   unsigned int nr_entries,
+							   gfp_t gfp)
+{
+	return kmem_cache_alloc_bulk(zswap_entry_cache, gfp, nr_entries, entries);
+}
+
+static __always_inline void zswap_entries_cache_free_batch(void **entries,
+							   unsigned int nr_entries)
+{
+	kmem_cache_free_bulk(zswap_entry_cache, nr_entries, entries);
+}
+
 /*
  * Carries out the common pattern of freeing an entry's zsmalloc allocation,
  * freeing the entry itself, and decrementing the number of stored pages.
@@ -789,7 +835,9 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 {
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+	int cpu_node = cpu_to_node(cpu);
 	int ret = -ENOMEM;
+	u8 i;
 
 	/*
 	 * The per-CPU pool->acomp_ctx is zero-initialized on allocation.
@@ -802,11 +850,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
 		return 0;
 
-	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
-	if (!acomp_ctx->buffer)
-		return ret;
-
-	acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+	acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_node);
 	if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
 		pr_err("could not alloc crypto acomp %s : %ld\n",
 		       pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
@@ -815,20 +859,40 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	}
 	acomp_ctx->is_sleepable = acomp_is_async(acomp_ctx->acomp);
 
+	/*
+	 * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
+	 * compressor supports batching.
+	 */
+	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
+				     crypto_acomp_batch_size(acomp_ctx->acomp));
+
 	acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
+
 	if (IS_ERR_OR_NULL(acomp_ctx->req)) {
 		pr_err("could not alloc crypto acomp_request %s\n",
-		       pool->tfm_name);
+			pool->tfm_name);
 		goto fail;
 	}
 
-	crypto_init_wait(&acomp_ctx->wait);
+	acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
+					  GFP_KERNEL, cpu_node);
+	if (!acomp_ctx->buffers)
+		goto fail;
+
+	for (i = 0; i < pool->compr_batch_size; ++i) {
+		acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE, GFP_KERNEL,
+						     cpu_node);
+		if (!acomp_ctx->buffers[i])
+			goto fail;
+	}
 
 	/*
 	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
 	 * won't be called, crypto_wait_req() will return without blocking.
 	 */
+	crypto_init_wait(&acomp_ctx->wait);
+
 	acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   crypto_req_done, &acomp_ctx->wait);
 
@@ -836,12 +900,12 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	return 0;
 
 fail:
-	acomp_ctx_dealloc(acomp_ctx);
+	acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
 	return ret;
 }
 
 static bool zswap_compress(struct page *page, struct zswap_entry *entry,
-			   struct zswap_pool *pool)
+			   struct zswap_pool *pool, bool folio_wb)
 {
 	struct crypto_acomp_ctx *acomp_ctx;
 	struct scatterlist input, output;
@@ -855,7 +919,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
 	mutex_lock(&acomp_ctx->mutex);
 
-	dst = acomp_ctx->buffer;
+	dst = acomp_ctx->buffers[0];
 	sg_init_table(&input, 1);
 	sg_set_page(&input, page, PAGE_SIZE, 0);
 
@@ -886,13 +950,11 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	 */
 	if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
 		dlen = PAGE_SIZE;
-		if (!mem_cgroup_zswap_writeback_enabled(
-					folio_memcg(page_folio(page)))) {
+		if (!folio_wb) {
 			comp_ret = comp_ret ? comp_ret : -EINVAL;
 			goto unlock;
 		}
 		comp_ret = 0;
-		dlen = PAGE_SIZE;
 		dst = kmap_local_page(page);
 		mapped = true;
 	}
@@ -932,7 +994,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 
 	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
 	mutex_lock(&acomp_ctx->mutex);
-	obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer);
+	obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffers[0]);
 
 	/* zswap entries of length PAGE_SIZE are not compressed. */
 	if (entry->length == PAGE_SIZE) {
@@ -942,15 +1004,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 
 	/*
 	 * zs_obj_read_begin() might return a kmap address of highmem when
-	 * acomp_ctx->buffer is not used. However, sg_init_one() does not
-	 * handle highmem addresses, so copy the object to acomp_ctx->buffer.
+	 * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
+	 * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
 	 */
 	if (virt_addr_valid(obj)) {
 		src = obj;
 	} else {
-		WARN_ON_ONCE(obj == acomp_ctx->buffer);
-		memcpy(acomp_ctx->buffer, obj, entry->length);
-		src = acomp_ctx->buffer;
+		WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
+		memcpy(acomp_ctx->buffers[0], obj, entry->length);
+		src = acomp_ctx->buffers[0];
 	}
 
 	sg_init_one(&input, src, entry->length);
@@ -1404,95 +1466,160 @@ static void shrink_worker(struct work_struct *w)
  * main API
  **********************************/
 
-static bool zswap_store_page(struct page *page,
-			     struct obj_cgroup *objcg,
-			     struct zswap_pool *pool)
+/*
+ * Store multiple pages in @folio, starting from the page at index @start up to
+ * the page at index @end-1.
+ */
+static bool zswap_store_pages(struct folio *folio,
+			      long start,
+			      long end,
+			      struct obj_cgroup *objcg,
+			      struct zswap_pool *pool,
+			      int node_id,
+			      bool folio_wb)
 {
-	swp_entry_t page_swpentry = page_swap_entry(page);
-	struct zswap_entry *entry, *old;
-
-	/* allocate entry */
-	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
-	if (!entry) {
-		zswap_reject_kmemcache_fail++;
-		return false;
+	struct zswap_entry *entries[ZSWAP_MAX_BATCH_SIZE];
+	u8 i, store_fail_idx = 0, nr_pages = end - start;
+
+	VM_WARN_ON_ONCE(nr_pages > ZSWAP_MAX_BATCH_SIZE);
+
+	if (unlikely(!zswap_entries_cache_alloc_batch((void **)&entries[0],
+						      nr_pages, GFP_KERNEL))) {
+		for (i = 0; i < nr_pages; ++i) {
+			entries[i] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
+
+			if (unlikely(!entries[i])) {
+				zswap_reject_kmemcache_fail++;
+				/*
+				 * While handling this error, we only need to
+				 * call zswap_entries_cache_free_batch() for
+				 * entries[0 .. i-1].
+				 */
+				nr_pages = i;
+				goto store_pages_failed;
+			}
+		}
 	}
 
-	if (!zswap_compress(page, entry, pool))
-		goto compress_failed;
+	/*
+	 * Three sets of initializations are done to minimize bringing
+	 * @entries into the cache for writing at different parts of this
+	 * procedure, since doing so regresses performance:
+	 *
+	 * 1) Do all the writes to each entry in one code block. These
+	 *    writes need to be done anyway upon success which is more likely
+	 *    than not.
+	 *
+	 * 2) Initialize the handle to an error value. This facilitates
+	 *    having a consolidated failure handling
+	 *    'goto store_pages_failed' that can inspect the value of the
+	 *    handle to determine whether zsmalloc memory needs to be
+	 *    de-allocated.
+	 *
+	 * 3) The page_swap_entry() is obtained once and stored in the entry.
+	 *    Subsequent store in xarray gets the entry->swpentry instead of
+	 *    calling page_swap_entry(), minimizing computes.
+	 */
+	for (i = 0; i < nr_pages; ++i) {
+		entries[i]->handle = (unsigned long)ERR_PTR(-EINVAL);
+		entries[i]->pool = pool;
+		entries[i]->swpentry = page_swap_entry(folio_page(folio, start + i));
+		entries[i]->objcg = objcg;
+		entries[i]->referenced = true;
+		INIT_LIST_HEAD(&entries[i]->lru);
+	}
 
-	old = xa_store(swap_zswap_tree(page_swpentry),
-		       swp_offset(page_swpentry),
-		       entry, GFP_KERNEL);
-	if (xa_is_err(old)) {
-		int err = xa_err(old);
+	for (i = 0; i < nr_pages; ++i) {
+		struct page *page = folio_page(folio, start + i);
 
-		WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
-		zswap_reject_alloc_fail++;
-		goto store_failed;
+		if (!zswap_compress(page, entries[i], pool, folio_wb))
+			goto store_pages_failed;
 	}
 
-	/*
-	 * We may have had an existing entry that became stale when
-	 * the folio was redirtied and now the new version is being
-	 * swapped out. Get rid of the old.
-	 */
-	if (old)
-		zswap_entry_free(old);
+	for (i = 0; i < nr_pages; ++i) {
+		struct zswap_entry *old, *entry = entries[i];
 
-	/*
-	 * The entry is successfully compressed and stored in the tree, there is
-	 * no further possibility of failure. Grab refs to the pool and objcg,
-	 * charge zswap memory, and increment zswap_stored_pages.
-	 * The opposite actions will be performed by zswap_entry_free()
-	 * when the entry is removed from the tree.
-	 */
-	zswap_pool_get(pool);
-	if (objcg) {
-		obj_cgroup_get(objcg);
-		obj_cgroup_charge_zswap(objcg, entry->length);
-	}
-	atomic_long_inc(&zswap_stored_pages);
-	if (entry->length == PAGE_SIZE)
-		atomic_long_inc(&zswap_stored_incompressible_pages);
+		old = xa_store(swap_zswap_tree(entry->swpentry),
+			       swp_offset(entry->swpentry),
+			       entry, GFP_KERNEL);
+		if (unlikely(xa_is_err(old))) {
+			int err = xa_err(old);
 
-	/*
-	 * We finish initializing the entry while it's already in xarray.
-	 * This is safe because:
-	 *
-	 * 1. Concurrent stores and invalidations are excluded by folio lock.
-	 *
-	 * 2. Writeback is excluded by the entry not being on the LRU yet.
-	 *    The publishing order matters to prevent writeback from seeing
-	 *    an incoherent entry.
-	 */
-	entry->pool = pool;
-	entry->swpentry = page_swpentry;
-	entry->objcg = objcg;
-	entry->referenced = true;
-	if (entry->length) {
-		INIT_LIST_HEAD(&entry->lru);
-		zswap_lru_add(&zswap_list_lru, entry);
+			WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
+			zswap_reject_alloc_fail++;
+			/*
+			 * Entries up to this point have been stored in the
+			 * xarray. zswap_store() will erase them from the xarray
+			 * and call zswap_entry_free(). Local cleanup in
+			 * 'store_pages_failed' only needs to happen for
+			 * entries from [@i to @nr_pages).
+			 */
+			store_fail_idx = i;
+			goto store_pages_failed;
+		}
+
+		/*
+		 * We may have had an existing entry that became stale when
+		 * the folio was redirtied and now the new version is being
+		 * swapped out. Get rid of the old.
+		 */
+		if (unlikely(old))
+			zswap_entry_free(old);
+
+		/*
+		 * The entry is successfully compressed and stored in the tree, there is
+		 * no further possibility of failure. Grab refs to the pool and objcg,
+		 * charge zswap memory, and increment zswap_stored_pages.
+		 * The opposite actions will be performed by zswap_entry_free()
+		 * when the entry is removed from the tree.
+		 */
+		zswap_pool_get(pool);
+		if (objcg) {
+			obj_cgroup_get(objcg);
+			obj_cgroup_charge_zswap(objcg, entry->length);
+		}
+		atomic_long_inc(&zswap_stored_pages);
+		if (entry->length == PAGE_SIZE)
+			atomic_long_inc(&zswap_stored_incompressible_pages);
+
+		/*
+		 * We finish by adding the entry to the LRU while it's already
+		 * in xarray. This is safe because:
+		 *
		 * 1. Concurrent stores and invalidations are excluded by folio lock.
+		 *
+		 * 2. Writeback is excluded by the entry not being on the LRU yet.
+		 *    The publishing order matters to prevent writeback from seeing
+		 *    an incoherent entry.
+		 */
+		if (likely(entry->length))
+			zswap_lru_add(&zswap_list_lru, entry);
 	}
 
 	return true;
 
-store_failed:
-	zs_free(pool->zs_pool, entry->handle);
-compress_failed:
-	zswap_entry_cache_free(entry);
+store_pages_failed:
+	for (i = store_fail_idx; i < nr_pages; ++i) {
+		if (!IS_ERR_VALUE(entries[i]->handle))
+			zs_free(pool->zs_pool, entries[i]->handle);
+	}
+
+	zswap_entries_cache_free_batch((void **)&entries[store_fail_idx],
+				       nr_pages - store_fail_idx);
+
 	return false;
 }
 
 bool zswap_store(struct folio *folio)
 {
+	bool folio_wb = mem_cgroup_zswap_writeback_enabled(folio_memcg(folio));
 	long nr_pages = folio_nr_pages(folio);
+	int node_id = folio_nid(folio);
 	swp_entry_t swp = folio->swap;
 	struct obj_cgroup *objcg = NULL;
 	struct mem_cgroup *memcg = NULL;
 	struct zswap_pool *pool;
 	bool ret = false;
-	long index;
+	long start, end;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1526,10 +1653,12 @@ bool zswap_store(struct folio *folio)
 		mem_cgroup_put(memcg);
 	}
 
-	for (index = 0; index < nr_pages; ++index) {
-		struct page *page = folio_page(folio, index);
+	/* Store the folio in batches of @pool->store_batch_size pages. */
+	for (start = 0; start < nr_pages; start += pool->store_batch_size) {
+		end = min(start + pool->store_batch_size, nr_pages);
 
-		if (!zswap_store_page(page, objcg, pool))
+		if (!zswap_store_pages(folio, start, end, objcg, pool,
+				       node_id, folio_wb))
 			goto put_pool;
 	}
 
@@ -1559,9 +1688,9 @@ bool zswap_store(struct folio *folio)
 		struct zswap_entry *entry;
 		struct xarray *tree;
 
-		for (index = 0; index < nr_pages; ++index) {
-			tree = swap_zswap_tree(swp_entry(type, offset + index));
-			entry = xa_erase(tree, offset + index);
+		for (start = 0; start < nr_pages; ++start) {
+			tree = swap_zswap_tree(swp_entry(type, offset + start));
+			entry = xa_erase(tree, offset + start);
 			if (entry)
 				zswap_entry_free(entry);
 		}
-- 
2.27.0

From nobody Thu Oct 2 00:50:47 2025
From: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Subject: [PATCH v12 23/23] mm: zswap: Batched zswap_compress() with compress batching of large folios.
Date: Thu, 25 Sep 2025 20:35:02 -0700
Message-Id: <20250926033502.7486-24-kanchana.p.sridhar@intel.com>
In-Reply-To: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>
References: <20250926033502.7486-1-kanchana.p.sridhar@intel.com>

crypto: iaa - Use the memory allocated in acomp_req->__ctx[] for batching.

This patch introduces a new unified implementation of zswap_compress() for compressors that do and do not support batching. This eliminates code duplication and facilitates maintainability of the code with the introduction of compress batching.

The vectorized implementation of calling the earlier zswap_compress() sequentially, one page at a time in zswap_store_pages(), is replaced with this new version of zswap_compress() that accepts multiple pages to compress as a batch.

If the compressor does not support batching, each page in the batch is compressed and stored sequentially. If the compressor supports batching, e.g., 'deflate-iaa', the Intel IAA hardware accelerator, the batch is compressed in parallel in hardware. If the batch is compressed without errors, the compressed buffers are then stored in zsmalloc.
In case of compression errors, the current behavior is preserved for
the batching zswap_compress(): if the folio's memcg is writeback
enabled, pages with compression errors are stored uncompressed in
zsmalloc; if not, we return an error for the folio in zswap_store().

As per Herbert's suggestion in [1] that batching be based on SG lists
to interface with the crypto API, a "struct sg_table *sg_outputs" is
added to the per-CPU acomp_ctx. In zswap_cpu_comp_prepare(), memory is
allocated for @pool->compr_batch_size scatterlists in
@acomp_ctx->sg_outputs. The per-CPU @acomp_ctx->buffers' addresses are
mapped to the respective SG in @acomp_ctx->sg_outputs. This is done
once and does not need to be repeated in zswap_compress(). The
existing non-NUMA sg_alloc_table() was found to give better
performance than a NUMA-aware allocation function, hence it is used in
this patch.

All that zswap_compress() needs to do for non-batching software
compressors is to set the singular output SG's length to PAGE_SIZE.
Batching compressors should initialize the output SG lengths to
PAGE_SIZE as part of the internal compress batching setup, to avoid
having to do multiple traversals over the @acomp_ctx->sg_outputs->sgl.
This is exactly how batching is implemented in the iaa_crypto driver's
compress batching procedure, iaa_comp_acompress_batch().

Another initialization-time optimization in zswap_cpu_comp_prepare(),
done only for batching compressors: we allocate extra memory for
"batch-size" int pointers in the @acomp_ctx->req->__ctx[] that are
statically set to track the output SG lists' lengths. This
optimization was necessary to avoid the latency cost of multiple
@acomp_ctx->sg_outputs->sgl traversals in zswap_compress() and in
iaa_comp_acompress_batch(), and to recover batching performance with
the SG-lists-based architecture. This optimization's per-CPU memory
cost for a batching compressor with a batch size of 8 is 64 bytes.
There is no memory cost for software compressors.

Consequently, batching compressors can use the memory allocated in
@acomp_ctx->req->__ctx[] to internally manage updates to the output
@sg->lengths for the batch. zswap_compress() does not need to traverse
@pool->compr_batch_size output SG list elements to get the compressed
output length/error for each page in the batch; this is readily
available in the per-CPU acomp_ctx->req->__ctx[].

On a related note, the code in zswap_compress() is generalized as much
as possible for software compressors by introducing a local @dstlen
int pointer to track @acomp_ctx->req->dlen, and assigning it the @err
return value from crypto_acomp_compress(), so that the subsequent
incompressible page handling, zs_pool writes, and error handling code
is seamless for software and batching compressors, i.e., it avoids
conditionals that switch to specialized code for either.

Finally, again as per Herbert's suggestion, we call:

  acomp_request_set_unit_size(acomp_ctx->req, PAGE_SIZE);

to set the unit size for zswap to PAGE_SIZE, after the call to
acomp_request_set_callback() in zswap_cpu_comp_prepare().

The new batching implementation of zswap_compress() is called with a
batch of @nr_pages sent to zswap_store_pages(). It sets up the
acomp_ctx->req's src/dst SG lists to contain the folio pages and
@nr_comps output buffers, where @nr_comps is
min(nr_pages, pool->compr_batch_size), before calling
crypto_acomp_compress().
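To make the @acomp_ctx->req->__ctx[] mapping concrete, here is a
condensed consumer-side sketch (simplified from the zswap_compress()
hunk below; the uncompressed-page fallback and error paths are
elided). Because each __ctx[] slot was statically pointed at an output
@sg->length at pool creation time, zswap_compress() reads per-page
results by direct dereference instead of re-walking the SG list:

	int **dlens = (int **)&acomp_ctx->req->__ctx[0];
	unsigned int k;
	int err;

	err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req),
			      &acomp_ctx->wait);

	for (k = 0; k < nr_comps; k++) {
		int dlen = *dlens[k];	/* aliases the k-th output sg->length */

		if (dlen < 0) {
			/* per-page compression error: negative errno from the driver */
		} else {
			/* dlen is the compressed length handed to zs_malloc() */
		}
	}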
An upfront mapping of @dlens to the @acomp_ctx->req->__ctx[] for
batching compressors, and to @acomp_ctx->req->dlen for software
compressors, simplifies the zs_pool writes and error handling after
compression.

Some important requirements of this batching architecture for batching
compressors (a short sketch of these driver-side responsibilities
follows at the end of this section):

1) The output SG lengths for each sg in the acomp_req->dst should be
   initialized to PAGE_SIZE as part of other batch setup in the batch
   compression function. zswap will not take care of this, in the
   interest of avoiding repetitive traversals of the
   @acomp_ctx->sg_outputs->sgl, so as to not lose the benefits of
   batching.

2) In case of a compression error for any page in the batch, the
   batching compressor should set the corresponding @sg->length to a
   negative error number, as suggested by Herbert. Otherwise, the
   @sg->length will contain the compressed output length.

Another important change this patch makes is to the acomp_ctx mutex
locking in zswap_compress(). Earlier, the mutex was held per page's
compression. With the new code, [un]locking the mutex per page caused
regressions for software compressors when testing with usemem (30
processes) and also kernel compilation with the 'allmod' config. The
regressions were more egregious when PMD folios were stored. The
implementation in this commit locks/unlocks the mutex once per batch,
which resolves the regression.

The use of likely()/unlikely() annotations prevents regressions with
software compressors like zstd, and generally improves non-batching
compressors' performance with the batching code by ~3%.

Architectural considerations for the zswap batching framework:
===============================================================

We have designed the zswap batching framework to be hardware-agnostic.
It has no dependencies on Intel-specific features and can be leveraged
by any hardware accelerator or software-based compressor. In other
words, the framework is open and inclusive by design.

Other ongoing work that can use batching:
=========================================

This patch-series demonstrates the performance benefits of compress
batching when used in zswap_store() of large folios.

shrink_folio_list() "reclaim batching" of any-order folios is the
major next work that uses the zswap compress batching framework: our
testing of kernel compilation with writeback and the zswap shrinker
indicates 10X fewer pages get written back when we reclaim 32 folios
as a batch, as compared to one folio at a time, with both deflate-iaa
and zstd. We expect to submit a patch-series with this data and the
resulting performance improvements shortly. Reclaim batching relieves
memory pressure faster than reclaiming one folio at a time, hence it
alleviates the need to scan slab memory for writeback.

Nhat has given ideas on using batching with the ongoing kcompressd
work, as well as beneficially using decompression batching & block IO
batching to improve zswap writeback efficiency. Experiments that
combine zswap compress batching, reclaim batching, swapin_readahead()
decompression batching of prefetched pages, and writeback batching
show that 0 pages are written back with deflate-iaa and zstd. For
comparison, the baselines for these compressors see 200K-800K pages
written to disk (kernel compilation 'allmod' config).
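The driver-side responsibilities in requirements 1) and 2) above can
be sketched as follows (a schematic only, not the iaa_crypto code;
comp_err() and comp_len() are hypothetical stand-ins for however a
driver retrieves its per-page completion status and output size):

	struct scatterlist *sg;
	unsigned int i;

	/*
	 * Batch setup: seed every output length to PAGE_SIZE in the single
	 * pass that walks the output SG list anyway, so that zswap does
	 * not have to traverse it again.
	 */
	for_each_sg(req->dst, sg, nr_comps, i)
		sg->length = PAGE_SIZE;

	/* ... submit the batch to hardware and wait for all completions ... */

	/*
	 * Completion: report per-page results in place. A negative errno
	 * in @sg->length marks a compression error for that page only.
	 */
	for_each_sg(req->dst, sg, nr_comps, i)
		sg->length = comp_err(i) ? comp_err(i) : comp_len(i);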
To summarize, these are future clients of the batching framework:

 - shrink_folio_list() reclaim batching of multiple folios:
   Implemented, will submit patch-series.
 - zswap writeback with decompress batching:
   Implemented, will submit patch-series.
 - zram: Implemented, will submit patch-series.
 - kcompressd: Not yet implemented.
 - file systems: Not yet implemented.
 - swapin_readahead() decompression batching of prefetched pages:
   Implemented, will submit patch-series.

Additionally, any place we have folios that need to be compressed can
potentially be parallelized.

Performance data:
=================

As suggested by Barry, this is the performance data gathered on Intel
Sapphire Rapids, with usemem (30 processes) running at 50% memory
pressure, and kernel compilation with the 'allmod' config run with a
2G limit using 32 threads. To keep comparisons simple, all testing was
done without the zswap shrinker.

usemem30, 64K folios:
=====================

 ------------------------------------------------------------------
 deflate-iaa                 mm-unstable-9-18-2025         v12  Change
 ------------------------------------------------------------------
 Total throughput (KB/s)                 7,191,949  10,702,115     49%
 Average throughput (KB/s)                 239,731     356,737     49%
 elapsed time (sec)                          93.21       69.98    -25%
 sys time (sec)                           2,190.52    1,651.51    -25%
 ------------------------------------------------------------------

 ------------------------------------------------------------------
 zstd                        mm-unstable-9-18-2025         v12  Change
 ------------------------------------------------------------------
 Total throughput (KB/s)                 6,258,312   6,269,511    0.2%
 Average throughput (KB/s)                 208,610     208,983    0.2%
 elapsed time (sec)                         100.01      100.50    0.5%
 sys time (sec)                           2,505.14    2,490.00     -1%
 ------------------------------------------------------------------

usemem30, PMD folios:
=====================

 ------------------------------------------------------------------
 deflate-iaa                 mm-unstable-9-18-2025         v12  Change
 ------------------------------------------------------------------
 Total throughput (KB/s)                 7,237,676  11,228,928     55%
 Average throughput (KB/s)                 241,255     374,297     55%
 elapsed time (sec)                          82.26       69.30    -16%
 sys time (sec)                           1,901.90    1,634.78    -14%
 ------------------------------------------------------------------

 ------------------------------------------------------------------
 zstd                        mm-unstable-9-18-2025         v12  Change
 ------------------------------------------------------------------
 Total throughput (KB/s)                 6,796,376   6,865,781    1.0%
 Average throughput (KB/s)                 226,545     228,859    1.0%
 elapsed time (sec)                          94.07       88.80     -6%
 sys time (sec)                           2,261.67    2,082.91     -8%
 ------------------------------------------------------------------

The main takeaway from usemem, a workload that is mostly compression
dominated (very few swapins), is that the higher the number of
batches, such as with larger folios, the greater the benefit of
batching cost amortization, as shown by the PMD usemem data. This
aligns well with the future direction noted earlier.
kernel_compilation/allmodconfig, 64K folios:
============================================

 -------------------------------------------------------
 deflate-iaa     mm-unstable-9-18-2025        v12  Change
 -------------------------------------------------------
 real_sec                       874.74     821.59   -6.1%
 sys_sec                      3,834.35   3,791.12     -1%
 -------------------------------------------------------

 -------------------------------------------------------
 zstd            mm-unstable-9-18-2025        v12  Change
 -------------------------------------------------------
 real_sec                       925.08     853.14   -7.8%
 sys_sec                      5,318.65   5,172.23   -2.8%
 -------------------------------------------------------

kernel_compilation/allmodconfig, PMD folios:
============================================

 -------------------------------------------------------
 deflate-iaa     mm-unstable-9-18-2025        v12  Change
 -------------------------------------------------------
 real_sec                       808.10     794.85   -1.6%
 sys_sec                      4,351.01   4,266.95     -2%
 -------------------------------------------------------

 -------------------------------------------------------
 zstd            mm-unstable-9-18-2025        v12  Change
 -------------------------------------------------------
 real_sec                       848.06     845.42   -0.3%
 sys_sec                      5,898.58   5,741.31   -2.7%
 -------------------------------------------------------

[1]: https://lore.kernel.org/all/aJ7Fk6RpNc815Ivd@gondor.apana.org.au/T/#m99aea2ce3d284e6c5a3253061d97b08c4752a798

Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
 drivers/crypto/intel/iaa/iaa_crypto_main.c |   4 +-
 mm/zswap.c                                 | 311 ++++++++++++++++-----
 2 files changed, 247 insertions(+), 68 deletions(-)

diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 2fa38176034d..019f80f87993 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -1510,7 +1510,9 @@ static __always_inline void acomp_to_iaa(struct acomp_req *areq,
 	req->slen = areq->slen;
 	req->dlen = areq->dlen;
 	req->flags = areq->base.flags;
-	if (unlikely(ctx->use_irq))
+	if (likely(!ctx->use_irq))
+		req->dlens = (int **)&areq->__ctx[0];
+	else
 		req->drv_data = areq;
 }
 
diff --git a/mm/zswap.c b/mm/zswap.c
index 9e0e7887de33..8104ecb30a1a 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -143,6 +143,7 @@ struct crypto_acomp_ctx {
 	struct acomp_req *req;
 	struct crypto_wait wait;
 	u8 **buffers;
+	struct sg_table *sg_outputs;
 	struct mutex mutex;
 	bool is_sleepable;
 };
@@ -275,6 +276,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
 			kfree(acomp_ctx->buffers[i]);
 		kfree(acomp_ctx->buffers);
 	}
+
+	if (acomp_ctx->sg_outputs) {
+		sg_free_table(acomp_ctx->sg_outputs);
+		kfree(acomp_ctx->sg_outputs);
+	}
 }
 
 static struct zswap_pool *zswap_pool_create(char *compressor)
@@ -836,6 +842,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
 	int cpu_node = cpu_to_node(cpu);
+	struct scatterlist *sg;
 	int ret = -ENOMEM;
 	u8 i;
 
@@ -866,7 +873,28 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
 				     crypto_acomp_batch_size(acomp_ctx->acomp));
 
-	acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
+	/*
+	 * For batching compressors, we allocate extra memory for "batch-size"
+	 * int pointers that will be statically set to track the output
+	 * SG lists' lengths later in this procedure. This optimization was
+	 * required to avoid the latency cost of SG lists traversal in
+	 * zswap_compress().
+	 *
+	 * Consequently, batching compressors can use the memory allocated in
+	 * acomp_ctx->req->__ctx[] to internally manage updates to the output
+	 * @sg->lengths for the batch. zswap_compress() does not need to
+	 * traverse ZSWAP_MAX_BATCH_SIZE number of output SG list elements to
+	 * get the compressed output length/error for each page in the batch.
+	 * This is readily available in the per-CPU acomp_ctx->req->__ctx[].
+	 * This optimization's per-CPU memory cost for a batching compressor
+	 * with batch-size of 8 is 64 bytes.
+	 */
+	if (pool->compr_batch_size > 1)
+		acomp_ctx->req = acomp_request_alloc_extra(acomp_ctx->acomp,
+				pool->compr_batch_size * sizeof(int *),
+				GFP_KERNEL | __GFP_ZERO);
+	else
+		acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
 
 	if (IS_ERR_OR_NULL(acomp_ctx->req)) {
 		pr_err("could not alloc crypto acomp_request %s\n",
@@ -886,6 +914,37 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 		goto fail;
 	}
 
+	acomp_ctx->sg_outputs = kmalloc(sizeof(*acomp_ctx->sg_outputs),
+					GFP_KERNEL);
+	if (!acomp_ctx->sg_outputs)
+		goto fail;
+
+	if (sg_alloc_table(acomp_ctx->sg_outputs, pool->compr_batch_size,
+			   GFP_KERNEL))
+		goto fail;
+
+	/*
+	 * Map the per-CPU destination buffers to the per-CPU SG list.
+	 * This only needs to be done once.
+	 */
+	for_each_sg(acomp_ctx->sg_outputs->sgl, sg, pool->compr_batch_size, i)
+		sg_set_buf(sg, acomp_ctx->buffers[i], PAGE_SIZE);
+
+	/*
+	 * Use the @pool->compr_batch_size number of int pointers for
+	 * which we allocated extra memory in the @acomp_ctx->req above, to
+	 * track the addresses of the @sg->length members of the individual
+	 * SG lists in @acomp_ctx->sg_outputs->sgl. This is a static mapping
+	 * that needs to be done only once, and saves latency by avoiding
+	 * traversing the SG lists in zswap_compress().
+	 */
+	if (pool->compr_batch_size > 1) {
+		for_each_sg(acomp_ctx->sg_outputs->sgl, sg, pool->compr_batch_size, i)
+			acomp_ctx->req->__ctx[i] = &sg->length;
+	} else {
+		acomp_ctx->req->dlen = PAGE_SIZE;
+	}
+
 	/*
 	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
 	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
@@ -896,6 +955,8 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   crypto_req_done, &acomp_ctx->wait);
 
+	acomp_request_set_unit_size(acomp_ctx->req, PAGE_SIZE);
+
 	mutex_init(&acomp_ctx->mutex);
 	return 0;
 
@@ -904,84 +965,203 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	return ret;
 }
 
-static bool zswap_compress(struct page *page, struct zswap_entry *entry,
-			   struct zswap_pool *pool, bool folio_wb)
+/*
+ * Unified code path for compressors that do and do not support batching. This
+ * procedure will compress multiple @nr_pages in @folio starting from the
+ * @start index.
+ *
+ * It is assumed that @nr_pages <= ZSWAP_MAX_BATCH_SIZE. zswap_store() makes
+ * sure of this by design and zswap_store_pages() warns once if this is not
+ * true.
+ *
+ * @nr_pages can be in (1, ZSWAP_MAX_BATCH_SIZE] even if the compressor does not
+ * support batching.
+ *
+ * If @pool->compr_batch_size is 1, each page is processed sequentially.
+ *
+ * If @pool->compr_batch_size is > 1, compression batching is invoked within
+ * the algorithm's driver, except if @nr_pages is 1: if so, the driver can
+ * choose to call the sequential/non-batching compress API.
+ *
+ * In both cases, if all compressions are successful, the compressed buffers
+ * are stored in zsmalloc.
+ *
+ * Traversing multiple SG lists when @nr_comps is > 1 is expensive, and impacts
+ * batching performance if we were to repeat this operation multiple times,
+ * such as:
+ *   - to map destination buffers to each SG list in the @acomp_ctx->sg_outputs
+ *     sg_table.
+ *   - to initialize each output SG list @sg->length to PAGE_SIZE.
+ *   - to get the compressed output length in each @sg->length.
+ *
+ * These are some design choices made to optimize batching with SG lists:
+ *
+ * 1) The source folio pages in the batch are directly submitted to
+ *    crypto_acomp via acomp_request_set_src_folio().
+ *
+ * 2) The per-CPU @acomp_ctx->sg_outputs scatterlists are used to set up
+ *    destination buffers for interfacing with crypto_acomp.
+ *
+ * 3) To optimize performance, we map the per-CPU @acomp_ctx->buffers to the
+ *    @acomp_ctx->sg_outputs->sgl SG lists at pool creation time. The only task
+ *    remaining to be done for the output SG lists in zswap_compress() is to
+ *    set the @sg->length to PAGE_SIZE. This is done in zswap_compress()
+ *    for non-batching compressors. This needs to be done within the compress
+ *    batching driver procedure as part of iterating through the SG lists for
+ *    batch setup, so as to minimize traversals through the SG lists.
+ *
+ * 4) For batching compressors, we allocate extra memory in the
+ *    @acomp_ctx->req->__ctx[] to store @pool->compr_batch_size number of
+ *    int pointers (the @dlens). These pointers are initialized to the
+ *    individual @sg->lengths' addresses in @acomp_ctx->sg_outputs->sgl at pool
+ *    creation time. zswap_compress() has this readily available without having
+ *    to re-traverse @acomp_ctx->sg_outputs->sgl to get the compressed output
+ *    lengths after batch compression.
+ *
+ * 5) An important requirement for compression errors and batching compressors:
+ *    the individual @sg->lengths in @acomp_ctx->req->sg_outputs->sgl should be
+ *    set to the error value for the respective batch page.
+ *
+ * A few important changes made to not regress and in fact improve
+ * compression performance with non-batching software compressors, using this
+ * new batching code:
+ *
+ * 1) acomp_ctx mutex locking:
+ *    Earlier, the mutex was held per page compression. With the new code,
+ *    [un]locking the mutex per page caused regressions for software
+ *    compressors. We now lock the mutex once per batch, which resolves the
+ *    regression.
+ *
+ * 2) The likely()/unlikely() annotations prevent regressions with software
+ *    compressors like zstd, and generally improve non-batching compressors'
+ *    performance with the batching code by ~3%.
+ */
+static bool zswap_compress(struct folio *folio, long start, unsigned int nr_pages,
+			   struct zswap_entry *entries[], struct zswap_pool *pool,
+			   int node_id, bool folio_wb)
 {
+	unsigned int nr_comps = min(nr_pages, pool->compr_batch_size);
 	struct crypto_acomp_ctx *acomp_ctx;
-	struct scatterlist input, output;
-	int comp_ret = 0, alloc_ret = 0;
-	unsigned int dlen = PAGE_SIZE;
+	int *dstlen[1], **dlens;
+	struct scatterlist *sg;
 	unsigned long handle;
+	unsigned int i, j, k;
+	void *dst;
 	gfp_t gfp;
-	u8 *dst;
-	bool mapped = false;
+	int err;
+
+	gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
 
 	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
 	mutex_lock(&acomp_ctx->mutex);
 
-	dst = acomp_ctx->buffers[0];
-	sg_init_table(&input, 1);
-	sg_set_page(&input, page, PAGE_SIZE, 0);
-
-	sg_init_one(&output, dst, PAGE_SIZE);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+	dstlen[0] = &acomp_ctx->req->dlen;
+	if (pool->compr_batch_size == 1)
+		dlens = &dstlen[0];
+	else
+		dlens = (int **)&acomp_ctx->req->__ctx[0];
 
 	/*
-	 * it maybe looks a little bit silly that we send an asynchronous request,
-	 * then wait for its completion synchronously. This makes the process look
-	 * synchronous in fact.
-	 * Theoretically, acomp supports users send multiple acomp requests in one
-	 * acomp instance, then get those requests done simultaneously. but in this
-	 * case, zswap actually does store and load page by page, there is no
-	 * existing method to send the second page before the first page is done
-	 * in one thread doing zwap.
-	 * but in different threads running on different cpu, we have different
-	 * acomp instance, so multiple threads can do (de)compression in parallel.
+	 * [i] refers to the incoming batch space and is used to
+	 * index into the folio pages.
+	 *
+	 * [j] refers to the incoming batch space and is used to
+	 * index into the @entries for the folio's pages in this
+	 * batch, per compress call while iterating over the output SG
+	 * lists. Also used to index into the folio's pages from @start, in
+	 * case of compress errors.
+	 *
+	 * [k] refers to the @acomp_ctx space, as determined by
+	 * @pool->compr_batch_size, and is used to index into
+	 * @acomp_ctx->sg_outputs->sgl, @acomp_ctx->buffers and @dlens.
 	 */
-	comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
-	dlen = acomp_ctx->req->dlen;
+	for (i = 0; i < nr_pages; i += nr_comps) {
+		acomp_request_set_src_folio(acomp_ctx->req, folio,
+					    (start + i) * PAGE_SIZE,
+					    nr_comps * PAGE_SIZE);
 
-	/*
-	 * If a page cannot be compressed into a size smaller than PAGE_SIZE,
-	 * save the content as is without a compression, to keep the LRU order
-	 * of writebacks. If writeback is disabled, reject the page since it
-	 * only adds metadata overhead. swap_writeout() will put the page back
-	 * to the active LRU list in the case.
-	 */
-	if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
-		dlen = PAGE_SIZE;
-		if (!folio_wb) {
-			comp_ret = comp_ret ? comp_ret : -EINVAL;
-			goto unlock;
+		acomp_ctx->sg_outputs->sgl->length = nr_comps * PAGE_SIZE;
+
+		acomp_request_set_dst_sg(acomp_ctx->req,
+					 acomp_ctx->sg_outputs->sgl,
+					 nr_comps * PAGE_SIZE);
+
+		err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req),
+				      &acomp_ctx->wait);
+
+		/*
+		 * If a page cannot be compressed into a size smaller than
+		 * PAGE_SIZE, save the content as is without a compression, to
		 * keep the LRU order of writebacks.
+		 * If writeback is disabled, reject the page since it only
+		 * adds metadata overhead. swap_writeout() will put the page
+		 * back to the active LRU list in the case.
+		 *
+		 * It is assumed that any compressor that sets the output length
+		 * to 0 or a value >= PAGE_SIZE will also return a negative
+		 * error status in @err; i.e., will not return a successful
+		 * compression status in @err in this case.
+		 */
+		if (unlikely(err)) {
+			*dstlen[0] = err;
+			if (!folio_wb)
+				goto compress_error;
 		}
-		comp_ret = 0;
-		dst = kmap_local_page(page);
-		mapped = true;
-	}
 
-	gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
-	handle = zs_malloc(pool->zs_pool, dlen, gfp, page_to_nid(page));
-	if (IS_ERR_VALUE(handle)) {
-		alloc_ret = PTR_ERR((void *)handle);
-		goto unlock;
-	}
+		/*
+		 * All @nr_comps pages were either successfully compressed, or,
+		 * writeback is enabled for the folio's memcg and there were
+		 * compression errors. In either case, store the pages in
+		 * zsmalloc:
+		 *
+		 * - For successful compressions, store the compressed outputs.
+		 * - For errors, store the page uncompressed.
+		 */
+		for_each_sg(acomp_ctx->sg_outputs->sgl, sg, nr_comps, k) {
+			j = k + i;
+
+			dst = acomp_ctx->buffers[k];
+
+			if (unlikely(*dlens[k] < 0)) {
+				*dlens[k] = PAGE_SIZE;
+				dst = kmap_local_page(folio_page(folio, start + j));
+			}
+
+			handle = zs_malloc(pool->zs_pool, *dlens[k], gfp, node_id);
 
-	zs_obj_write(pool->zs_pool, handle, dst, dlen);
-	entry->handle = handle;
-	entry->length = dlen;
+			if (unlikely(IS_ERR_VALUE(handle))) {
+				if (PTR_ERR((void *)handle) == -ENOSPC)
+					zswap_reject_compress_poor++;
+				else
+					zswap_reject_alloc_fail++;
 
-unlock:
-	if (mapped)
-		kunmap_local(dst);
-	if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
-		zswap_reject_compress_poor++;
-	else if (comp_ret)
-		zswap_reject_compress_fail++;
-	else if (alloc_ret)
-		zswap_reject_alloc_fail++;
+				goto err_unlock;
+			}
+
+			zs_obj_write(pool->zs_pool, handle, dst, *dlens[k]);
+			entries[j]->handle = handle;
+			entries[j]->length = *dlens[k];
+			if (unlikely(dst != acomp_ctx->buffers[k]))
+				kunmap_local(dst);
+		}
+	} /* finished compress and store nr_pages. */
 
 	mutex_unlock(&acomp_ctx->mutex);
-	return comp_ret == 0 && alloc_ret == 0;
+	return true;
+
+compress_error:
+	for (k = 0; k < nr_comps; ++k) {
+		if (*dlens[k] < 0) {
+			if (*dlens[k] == -ENOSPC)
+				zswap_reject_compress_poor++;
+			else
+				zswap_reject_compress_fail++;
+		}
+	}
+
+err_unlock:
+	mutex_unlock(&acomp_ctx->mutex);
+	return false;
 }
 
 static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
@@ -1529,12 +1709,9 @@ static bool zswap_store_pages(struct folio *folio,
 		INIT_LIST_HEAD(&entries[i]->lru);
 	}
 
-	for (i = 0; i < nr_pages; ++i) {
-		struct page *page = folio_page(folio, start + i);
-
-		if (!zswap_compress(page, entries[i], pool, folio_wb))
-			goto store_pages_failed;
-	}
+	if (unlikely(!zswap_compress(folio, start, nr_pages, entries, pool,
+				     node_id, folio_wb)))
+		goto store_pages_failed;
 
 	for (i = 0; i < nr_pages; ++i) {
 		struct zswap_entry *old, *entry = entries[i];
-- 
2.27.0