[PATCHv4 02/17] zram: do not use per-CPU compression streams

[PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
Similarly to the per-entry spin-lock, per-CPU compression streams
also have a number of shortcomings.

First, per-CPU stream access has to be done from a non-preemptible
(atomic) section, which imposes the same atomicity requirements on
compression backends as the entry spin-lock does and makes it
impossible to use algorithms that can schedule/wait/sleep during
compression and decompression.

Second, per-CPU streams noticeably increase memory usage (actually
more like wastage) of secondary compression streams.  The problem
is that secondary compression streams are allocated per-CPU, just
like the primary streams are.  Yet we never use more than one
secondary stream at a time, because recompression is a
single-threaded action.  This means that the remaining
num_online_cpus() - 1 streams are allocated for nothing, and this
happens for each priority list (we can have several secondary
compression algorithms).  Depending on the algorithm this may lead
to significant memory wastage; in addition, each stream also carries
a workmem buffer (2 physical pages).
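
For example (illustrative numbers): with 32 online CPUs and a single
secondary algorithm, 31 of the 32 secondary streams are never used,
yet their workmem buffers alone pin 31 * 2 = 62 pages (~248KB with
4K pages), before counting the per-stream context of the compression
algorithm itself.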

Instead of per-CPU streams, maintain a list of idle compression
streams and allocate new streams on demand (something that we used
to do many years ago).  This makes zram read() and write() non-atomic
and eases the requirements on the compression algorithm
implementation.  It also means that we now need only one secondary
stream per priority list.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 drivers/block/zram/zcomp.c    | 164 +++++++++++++++++++---------------
 drivers/block/zram/zcomp.h    |  17 ++--
 drivers/block/zram/zram_drv.c |  29 +++---
 include/linux/cpuhotplug.h    |   1 -
 4 files changed, 109 insertions(+), 102 deletions(-)

diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index bb514403e305..982c769d5831 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -6,7 +6,7 @@
 #include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
-#include <linux/cpu.h>
+#include <linux/cpumask.h>
 #include <linux/crypto.h>
 #include <linux/vmalloc.h>
 
@@ -43,31 +43,40 @@ static const struct zcomp_ops *backends[] = {
 	NULL
 };
 
-static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
+static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *strm)
 {
-	comp->ops->destroy_ctx(&zstrm->ctx);
-	vfree(zstrm->buffer);
-	zstrm->buffer = NULL;
+	comp->ops->destroy_ctx(&strm->ctx);
+	vfree(strm->buffer);
+	kfree(strm);
 }
 
-static int zcomp_strm_init(struct zcomp *comp, struct zcomp_strm *zstrm)
+static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
 {
+	struct zcomp_strm *strm;
 	int ret;
 
-	ret = comp->ops->create_ctx(comp->params, &zstrm->ctx);
-	if (ret)
-		return ret;
+	strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+	if (!strm)
+		return NULL;
+
+	INIT_LIST_HEAD(&strm->entry);
+
+	ret = comp->ops->create_ctx(comp->params, &strm->ctx);
+	if (ret) {
+		kfree(strm);
+		return NULL;
+	}
 
 	/*
-	 * allocate 2 pages. 1 for compressed data, plus 1 extra for the
-	 * case when compressed size is larger than the original one
+	 * allocate 2 pages. 1 for compressed data, plus 1 extra in case if
+	 * compressed data is larger than the original one.
 	 */
-	zstrm->buffer = vzalloc(2 * PAGE_SIZE);
-	if (!zstrm->buffer) {
-		zcomp_strm_free(comp, zstrm);
-		return -ENOMEM;
+	strm->buffer = vzalloc(2 * PAGE_SIZE);
+	if (!strm->buffer) {
+		zcomp_strm_free(comp, strm);
+		return NULL;
 	}
-	return 0;
+	return strm;
 }
 
 static const struct zcomp_ops *lookup_backend_ops(const char *comp)
@@ -109,13 +118,59 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
 
 struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
 {
-	local_lock(&comp->stream->lock);
-	return this_cpu_ptr(comp->stream);
+	struct zcomp_strm *strm;
+
+	might_sleep();
+
+	while (1) {
+		spin_lock(&comp->strm_lock);
+		if (!list_empty(&comp->idle_strm)) {
+			strm = list_first_entry(&comp->idle_strm,
+						struct zcomp_strm,
+						entry);
+			list_del(&strm->entry);
+			spin_unlock(&comp->strm_lock);
+			return strm;
+		}
+
+		/* cannot allocate new stream, wait for an idle one */
+		if (comp->avail_strm >= num_online_cpus()) {
+			spin_unlock(&comp->strm_lock);
+			wait_event(comp->strm_wait,
+				   !list_empty(&comp->idle_strm));
+			continue;
+		}
+
+		/* allocate new stream */
+		comp->avail_strm++;
+		spin_unlock(&comp->strm_lock);
+
+		strm = zcomp_strm_alloc(comp);
+		if (strm)
+			break;
+
+		spin_lock(&comp->strm_lock);
+		comp->avail_strm--;
+		spin_unlock(&comp->strm_lock);
+		wait_event(comp->strm_wait, !list_empty(&comp->idle_strm));
+	}
+
+	return strm;
 }
 
-void zcomp_stream_put(struct zcomp *comp)
+void zcomp_stream_put(struct zcomp *comp, struct zcomp_strm *strm)
 {
-	local_unlock(&comp->stream->lock);
+	spin_lock(&comp->strm_lock);
+	if (comp->avail_strm <= num_online_cpus()) {
+		list_add(&strm->entry, &comp->idle_strm);
+		spin_unlock(&comp->strm_lock);
+		wake_up(&comp->strm_wait);
+		return;
+	}
+
+	comp->avail_strm--;
+	spin_unlock(&comp->strm_lock);
+	zcomp_strm_free(comp, strm);
 }
 
 int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
@@ -148,61 +203,19 @@ int zcomp_decompress(struct zcomp *comp, struct zcomp_strm *zstrm,
 	return comp->ops->decompress(comp->params, &zstrm->ctx, &req);
 }
 
-int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
-{
-	struct zcomp *comp = hlist_entry(node, struct zcomp, node);
-	struct zcomp_strm *zstrm;
-	int ret;
-
-	zstrm = per_cpu_ptr(comp->stream, cpu);
-	local_lock_init(&zstrm->lock);
-
-	ret = zcomp_strm_init(comp, zstrm);
-	if (ret)
-		pr_err("Can't allocate a compression stream\n");
-	return ret;
-}
-
-int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node)
-{
-	struct zcomp *comp = hlist_entry(node, struct zcomp, node);
-	struct zcomp_strm *zstrm;
-
-	zstrm = per_cpu_ptr(comp->stream, cpu);
-	zcomp_strm_free(comp, zstrm);
-	return 0;
-}
-
-static int zcomp_init(struct zcomp *comp, struct zcomp_params *params)
-{
-	int ret;
-
-	comp->stream = alloc_percpu(struct zcomp_strm);
-	if (!comp->stream)
-		return -ENOMEM;
-
-	comp->params = params;
-	ret = comp->ops->setup_params(comp->params);
-	if (ret)
-		goto cleanup;
-
-	ret = cpuhp_state_add_instance(CPUHP_ZCOMP_PREPARE, &comp->node);
-	if (ret < 0)
-		goto cleanup;
-
-	return 0;
-
-cleanup:
-	comp->ops->release_params(comp->params);
-	free_percpu(comp->stream);
-	return ret;
-}
-
 void zcomp_destroy(struct zcomp *comp)
 {
-	cpuhp_state_remove_instance(CPUHP_ZCOMP_PREPARE, &comp->node);
+	struct zcomp_strm *strm;
+
+	while (!list_empty(&comp->idle_strm)) {
+		strm = list_first_entry(&comp->idle_strm,
+					struct zcomp_strm,
+					entry);
+		list_del(&strm->entry);
+		zcomp_strm_free(comp, strm);
+	}
+
 	comp->ops->release_params(comp->params);
-	free_percpu(comp->stream);
 	kfree(comp);
 }
 
@@ -229,7 +242,12 @@ struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params)
 		return ERR_PTR(-EINVAL);
 	}
 
-	error = zcomp_init(comp, params);
+	INIT_LIST_HEAD(&comp->idle_strm);
+	init_waitqueue_head(&comp->strm_wait);
+	spin_lock_init(&comp->strm_lock);
+
+	comp->params = params;
+	error = comp->ops->setup_params(comp->params);
 	if (error) {
 		kfree(comp);
 		return ERR_PTR(error);
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index ad5762813842..62330829db3f 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -3,10 +3,10 @@
 #ifndef _ZCOMP_H_
 #define _ZCOMP_H_
 
-#include <linux/local_lock.h>
-
 #define ZCOMP_PARAM_NO_LEVEL	INT_MIN
 
+#include <linux/wait.h>
+
 /*
  * Immutable driver (backend) parameters. The driver may attach private
  * data to it (e.g. driver representation of the dictionary, etc.).
@@ -31,7 +31,7 @@ struct zcomp_ctx {
 };
 
 struct zcomp_strm {
-	local_lock_t lock;
+	struct list_head entry;
 	/* compression buffer */
 	void *buffer;
 	struct zcomp_ctx ctx;
@@ -60,16 +60,15 @@ struct zcomp_ops {
 	const char *name;
 };
 
-/* dynamic per-device compression frontend */
 struct zcomp {
-	struct zcomp_strm __percpu *stream;
+	struct list_head idle_strm;
+	spinlock_t strm_lock;
+	u32 avail_strm;
+	wait_queue_head_t strm_wait;
 	const struct zcomp_ops *ops;
 	struct zcomp_params *params;
-	struct hlist_node node;
 };
 
-int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node);
-int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node);
 ssize_t zcomp_available_show(const char *comp, char *buf);
 bool zcomp_available_algorithm(const char *comp);
 
@@ -77,7 +76,7 @@ struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params);
 void zcomp_destroy(struct zcomp *comp);
 
 struct zcomp_strm *zcomp_stream_get(struct zcomp *comp);
-void zcomp_stream_put(struct zcomp *comp);
+void zcomp_stream_put(struct zcomp *comp, struct zcomp_strm *strm);
 
 int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
 		   const void *src, unsigned int *dst_len);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 1c2df2341704..8d5974ea8ff8 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -31,7 +31,6 @@
 #include <linux/idr.h>
 #include <linux/sysfs.h>
 #include <linux/debugfs.h>
-#include <linux/cpuhotplug.h>
 #include <linux/part_stat.h>
 #include <linux/kernel_read_file.h>
 
@@ -1601,7 +1600,7 @@ static int read_compressed_page(struct zram *zram, struct page *page, u32 index)
 	ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst);
 	kunmap_local(dst);
 	zs_unmap_object(zram->mem_pool, handle);
-	zcomp_stream_put(zram->comps[prio]);
+	zcomp_stream_put(zram->comps[prio], zstrm);
 
 	return ret;
 }
@@ -1762,14 +1761,14 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
 	kunmap_local(mem);
 
 	if (unlikely(ret)) {
-		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
+		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm);
 		pr_err("Compression failed! err=%d\n", ret);
 		zs_free(zram->mem_pool, handle);
 		return ret;
 	}
 
 	if (comp_len >= huge_class_size) {
-		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
+		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm);
 		return write_incompressible_page(zram, page, index);
 	}
 
@@ -1793,7 +1792,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
 				   __GFP_HIGHMEM |
 				   __GFP_MOVABLE);
 	if (IS_ERR_VALUE(handle)) {
-		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
+		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm);
 		atomic64_inc(&zram->stats.writestall);
 		handle = zs_malloc(zram->mem_pool, comp_len,
 				   GFP_NOIO | __GFP_HIGHMEM |
@@ -1805,7 +1804,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
 	}
 
 	if (!zram_can_store_page(zram)) {
-		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
+		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm);
 		zs_free(zram->mem_pool, handle);
 		return -ENOMEM;
 	}
@@ -1813,7 +1812,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
 	dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
 
 	memcpy(dst, zstrm->buffer, comp_len);
-	zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
+	zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm);
 	zs_unmap_object(zram->mem_pool, handle);
 
 	zram_slot_write_lock(zram, index);
@@ -1972,7 +1971,7 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page,
 		kunmap_local(src);
 
 		if (ret) {
-			zcomp_stream_put(zram->comps[prio]);
+			zcomp_stream_put(zram->comps[prio], zstrm);
 			return ret;
 		}
 
@@ -1982,7 +1981,7 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page,
 		/* Continue until we make progress */
 		if (class_index_new >= class_index_old ||
 		    (threshold && comp_len_new >= threshold)) {
-			zcomp_stream_put(zram->comps[prio]);
+			zcomp_stream_put(zram->comps[prio], zstrm);
 			continue;
 		}
 
@@ -2040,13 +2039,13 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page,
 			       __GFP_HIGHMEM |
 			       __GFP_MOVABLE);
 	if (IS_ERR_VALUE(handle_new)) {
-		zcomp_stream_put(zram->comps[prio]);
+		zcomp_stream_put(zram->comps[prio], zstrm);
 		return PTR_ERR((void *)handle_new);
 	}
 
 	dst = zs_map_object(zram->mem_pool, handle_new, ZS_MM_WO);
 	memcpy(dst, zstrm->buffer, comp_len_new);
-	zcomp_stream_put(zram->comps[prio]);
+	zcomp_stream_put(zram->comps[prio], zstrm);
 
 	zs_unmap_object(zram->mem_pool, handle_new);
 
@@ -2794,7 +2793,6 @@ static void destroy_devices(void)
 	zram_debugfs_destroy();
 	idr_destroy(&zram_index_idr);
 	unregister_blkdev(zram_major, "zram");
-	cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
 }
 
 static int __init zram_init(void)
@@ -2804,15 +2802,9 @@ static int __init zram_init(void)
 
 	BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8);
 
-	ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
-				      zcomp_cpu_up_prepare, zcomp_cpu_dead);
-	if (ret < 0)
-		return ret;
-
 	ret = class_register(&zram_control_class);
 	if (ret) {
 		pr_err("Unable to register zram-control class\n");
-		cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
 		return ret;
 	}
 
@@ -2821,7 +2813,6 @@ static int __init zram_init(void)
 	if (zram_major <= 0) {
 		pr_err("Unable to get major number\n");
 		class_unregister(&zram_control_class);
-		cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
 		return -EBUSY;
 	}
 
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 6cc5e484547c..092ace7db8ee 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -119,7 +119,6 @@ enum cpuhp_state {
 	CPUHP_MM_ZS_PREPARE,
 	CPUHP_MM_ZSWP_POOL_PREPARE,
 	CPUHP_KVM_PPC_BOOK3S_PREPARE,
-	CPUHP_ZCOMP_PREPARE,
 	CPUHP_TIMERS_PREPARE,
 	CPUHP_TMIGR_PREPARE,
 	CPUHP_MIPS_SOC_PREPARE,
-- 
2.48.1.362.g079036d154-goog
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Kairui Song 1 year ago
Hi Sergey,

On Fri, Jan 31, 2025 at 5:07 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
[..]

This seems to cause a huge performance regression on multi-core
systems; it becomes especially significant as the number of concurrent
tasks increases:

Test build linux kernel using ZRAM as SWAP (1G memcg):

Before:
+ /usr/bin/time make -s -j48
2495.77user 2604.77system 2:12.95elapsed 3836%CPU (0avgtext+0avgdata
863304maxresident)k

After:
+ /usr/bin/time make -s -j48
2403.60user 6676.09system 3:38.22elapsed 4160%CPU (0avgtext+0avgdata
863276maxresident)k

`perf lock contention -ab sleep 3` also indicates the big spin lock in
zcomp_stream_get/put is having significant contention:
contended   total wait     max wait     avg wait         type   caller

    793357     28.71 s       2.66 ms     36.19 us     spinlock   zcomp_stream_get+0x37
    793170     28.60 s       2.65 ms     36.06 us     spinlock   zcomp_stream_put+0x1f
    444007     15.26 s       2.58 ms     34.37 us     spinlock   zcomp_stream_put+0x1f
    443960     15.21 s       2.68 ms     34.25 us     spinlock   zcomp_stream_get+0x37
      5516    152.50 ms      3.30 ms     27.65 us     spinlock   evict_folios+0x7e
      4523    137.47 ms      3.66 ms     30.39 us     spinlock   folio_lruvec_lock_irqsave+0xc3
      4253    108.93 ms      2.92 ms     25.61 us     spinlock   folio_lruvec_lock_irqsave+0xc3
     49294     71.73 ms     15.87 us      1.46 us     spinlock   list_lru_del+0x7c
      2327     51.35 ms      3.48 ms     22.07 us     spinlock   evict_folios+0x5c0
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
On (25/02/01 17:21), Kairui Song wrote:
> This seems will cause a huge regression of performance on multi core
> systems, this is especially significant as the number of concurrent
> tasks increases:
> 
> Test build linux kernel using ZRAM as SWAP (1G memcg):
> 
> Before:
> + /usr/bin/time make -s -j48
> 2495.77user 2604.77system 2:12.95elapsed 3836%CPU (0avgtext+0avgdata
> 863304maxresident)k
> 
> After:
> + /usr/bin/time make -s -j48
> 2403.60user 6676.09system 3:38.22elapsed 4160%CPU (0avgtext+0avgdata
> 863276maxresident)k

How many CPUs do you have?  I assume preemption gets in the way, which
is sort of expected, to be honest...  Using per-CPU compression streams
disables preemption and uses the CPU exclusively, at the price of other
tasks not being able to run.  I do tend to think that I made a mistake
by switching zram to per-CPU compression streams.

What preemption model do you use and to what extent do you overload
your system?

My tests don't show anything unusual (but I don't overload the system)

CONFIG_PREEMPT

before
1371.96user 156.21system 1:30.91elapsed 1680%CPU (0avgtext+0avgdata 825636maxresident)k
32688inputs+1768416outputs (259major+51539861minor)pagefaults 0swaps

after
1372.05user 155.79system 1:30.82elapsed 1682%CPU (0avgtext+0avgdata 825684maxresident)k
32680inputs+1768416outputs (273major+51541815minor)pagefaults 0swaps

(I use zram as a block device with ext4 on it.)

> `perf lock contention -ab sleep 3` also indicates the big spin lock in
> zcomp_stream_get/put is having significant contention:

Hmm it's just

	spin_lock()
	list first entry
	spin_unlock()

Shouldn't be "a big spin lock", that's very odd.  I'm not familiar with
perf lock contention, let me take a look.
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Kairui Song 1 year ago
On Mon, Feb 3, 2025 at 11:49 AM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> On (25/02/01 17:21), Kairui Song wrote:
> > This seems will cause a huge regression of performance on multi core
> > systems, this is especially significant as the number of concurrent
> > tasks increases:
> >
> > Test build linux kernel using ZRAM as SWAP (1G memcg):
> >
> > Before:
> > + /usr/bin/time make -s -j48
> > 2495.77user 2604.77system 2:12.95elapsed 3836%CPU (0avgtext+0avgdata
> > 863304maxresident)k
> >
> > After:
> > + /usr/bin/time make -s -j48
> > 2403.60user 6676.09system 3:38.22elapsed 4160%CPU (0avgtext+0avgdata
> > 863276maxresident)k
>
> How many CPUs do you have?  I assume, preemption gets into way which is
> sort of expected, to be honest...  Using per-CPU compression streams
> disables preemption and uses CPU exclusively at a price of other tasks
> not being able to run.  I do tend to think that I made a mistake by
> switching zram to per-CPU compression streams.
>
> What preemption model do you use and to what extent do you overload
> your system?
>
> My tests don't show anything unusual (but I don't overload the system)
>
> CONFIG_PREEMPT

I'm using CONFIG_PREEMPT_VOLUNTARY=y, and there are 96 logical CPUs
(48c96t), make -j48 shouldn't be considered overload I think. make
-j32 also showed an obvious slow down.

>
> before
> 1371.96user 156.21system 1:30.91elapsed 1680%CPU (0avgtext+0avgdata 825636maxresident)k
> 32688inputs+1768416outputs (259major+51539861minor)pagefaults 0swaps
>
> after
> 1372.05user 155.79system 1:30.82elapsed 1682%CPU (0avgtext+0avgdata 825684maxresident)k
> 32680inputs+1768416outputs (273major+51541815minor)pagefaults 0swaps
>
> (I use zram as a block device with ext4 on it.)

I'm testing with ZRAM as SWAP, and tmpfs as storage for the kernel
source code, with memory pressure inside a 2G or smaller mem cgroup
(depending on make -j48 or -j32).

>
> > `perf lock contention -ab sleep 3` also indicates the big spin lock in
> > zcomp_stream_get/put is having significant contention:
>
> Hmm it's just
>
>         spin_lock()
>         list first entry
>         spin_unlock()
>
> Shouldn't be "a big spin lock", that's very odd.  I'm not familiar with
> perf lock contention, let me take a look.

I can debug this a bit more later to figure out why the contention is
so high, but my first thought is that, as Yosry also mentioned in
another reply, making it preemptible doesn't necessarily mean the
per-CPU stream has to be gone.
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
On (25/02/06 14:55), Kairui Song wrote:
> > On (25/02/01 17:21), Kairui Song wrote:
> > > This seems will cause a huge regression of performance on multi core
> > > systems, this is especially significant as the number of concurrent
> > > tasks increases:
> > >
> > > Test build linux kernel using ZRAM as SWAP (1G memcg):
> > >
> > > Before:
> > > + /usr/bin/time make -s -j48
> > > 2495.77user 2604.77system 2:12.95elapsed 3836%CPU (0avgtext+0avgdata
> > > 863304maxresident)k
> > >
> > > After:
> > > + /usr/bin/time make -s -j48
> > > 2403.60user 6676.09system 3:38.22elapsed 4160%CPU (0avgtext+0avgdata
> > > 863276maxresident)k
> >
> > How many CPUs do you have?  I assume, preemption gets into way which is
> > sort of expected, to be honest...  Using per-CPU compression streams
> > disables preemption and uses CPU exclusively at a price of other tasks
> > not being able to run.  I do tend to think that I made a mistake by
> > switching zram to per-CPU compression streams.
> >
> > What preemption model do you use and to what extent do you overload
> > your system?
> >
> > My tests don't show anything unusual (but I don't overload the system)
> >
> > CONFIG_PREEMPT
> 
> I'm using CONFIG_PREEMPT_VOLUNTARY=y, and there are 96 logical CPUs
> (48c96t), make -j48 shouldn't be considered overload I think. make
> -j32 also showed an obvious slow down.

Hmm, there should be more than enough compression streams then, the
limit is num_online_cpus().  That's strange.  I wonder if that's the
zsmalloc handle allocation ("remove two-staged handle allocation" in
the series).

[..]
> > Hmm it's just
> >
> >         spin_lock()
> >         list first entry
> >         spin_unlock()
> >
> > Shouldn't be "a big spin lock", that's very odd.  I'm not familiar with
> > perf lock contention, let me take a look.
> 
> I can debug this a bit more to figure out why the contention is huge
> later

That will be appreciated, thank you.

> but my first thought is that, as Yosry also mentioned in
> another reply, making it preemptable doesn't necessarily mean the per
> CPU stream has to be gone.

Was going to reply to Yosry's email today/tomorrow, didn't have time to
look into, but will reply here.


So for spin-lock contention - yes, but that lock really should not
be so visible.  Other than that, we limit the number of compression
streams to the number of CPUs and permit preemption, so it should
be roughly the same as "preemptible per-CPU" streams.  The
difference, perhaps, is that we don't pre-allocate streams, but
allocate them only as needed.  This has two sides: later allocations
can fail, but on the other hand we don't allocate streams that we
don't use, especially secondary streams (priority 1 and 2, which are
used for recompression).  I didn't know it was possible to use
per-CPU data and still have preemption enabled at the same time, so
I'm not opposed to the idea of still having per-CPU streams and
doing what the zswap folks did.
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Yosry Ahmed 1 year ago
On Thu, Feb 06, 2025 at 04:22:27PM +0900, Sergey Senozhatsky wrote:
> On (25/02/06 14:55), Kairui Song wrote:
> > > On (25/02/01 17:21), Kairui Song wrote:
> > > > This seems will cause a huge regression of performance on multi core
> > > > systems, this is especially significant as the number of concurrent
> > > > tasks increases:
> > > >
> > > > Test build linux kernel using ZRAM as SWAP (1G memcg):
> > > >
> > > > Before:
> > > > + /usr/bin/time make -s -j48
> > > > 2495.77user 2604.77system 2:12.95elapsed 3836%CPU (0avgtext+0avgdata
> > > > 863304maxresident)k
> > > >
> > > > After:
> > > > + /usr/bin/time make -s -j48
> > > > 2403.60user 6676.09system 3:38.22elapsed 4160%CPU (0avgtext+0avgdata
> > > > 863276maxresident)k
> > >
> > > How many CPUs do you have?  I assume, preemption gets into way which is
> > > sort of expected, to be honest...  Using per-CPU compression streams
> > > disables preemption and uses CPU exclusively at a price of other tasks
> > > not being able to run.  I do tend to think that I made a mistake by
> > > switching zram to per-CPU compression streams.
> > >
> > > What preemption model do you use and to what extent do you overload
> > > your system?
> > >
> > > My tests don't show anything unusual (but I don't overload the system)
> > >
> > > CONFIG_PREEMPT
> > 
> > I'm using CONFIG_PREEMPT_VOLUNTARY=y, and there are 96 logical CPUs
> > (48c96t), make -j48 shouldn't be considered overload I think. make
> > -j32 also showed an obvious slow down.
> 
> Hmm, there should be more than enough compression streams then, the
> limit is num_online_cpus.  That's strange.  I wonder if that's zsmalloc
> handle allocation ("remove two-staged handle allocation" in the series.)
> 
> [..]
> > > Hmm it's just
> > >
> > >         spin_lock()
> > >         list first entry
> > >         spin_unlock()
> > >
> > > Shouldn't be "a big spin lock", that's very odd.  I'm not familiar with
> > > perf lock contention, let me take a look.
> > 
> > I can debug this a bit more to figure out why the contention is huge
> > later
> 
> That will be appreciated, thank you.
> 
> > but my first thought is that, as Yosry also mentioned in
> > another reply, making it preemptable doesn't necessarily mean the per
> > CPU stream has to be gone.
> 
> Was going to reply to Yosry's email today/tomorrow, didn't have time to
> look into, but will reply here.
> 
> 
> So for spin-lock contention - yes, but that lock really should not
> be so visible.  Other than that we limit the number of compression
> streams to the number of the CPUs and permit preemption, so it should
> be the same as the "preemptible per-CPU" streams, roughly.

I think one other problem is that with a pool of streams guarded by a
single lock, all CPUs have to be serialized on that lock, even if there
are enough streams for all CPUs in theory.

> The difference, perhaps, is that we don't pre-allocate streams, but
> allocate only as needed.  This has two sides: one side is that later
> allocations can fail, but the other side is that we don't allocate
> streams that we don't use.  Especially secondary streams (priority 1
> and 2, which are used for recompression).  I didn't know it was possible
> to use per-CPU data and still have preemption enabled at the same time.
> So I'm not opposed to the idea of still having per-CPU streams and do
> what zswap folks did.

Note that it's not a free lunch. If preemption is allowed, there is
nothing keeping the CPU whose data you're using online, and it can be
offlined. I see that zcomp_cpu_dead() would free the compression
stream from under its user in this case.

We had a similar problem recently in zswap and it took me a couple of
iterations to properly fix it. In short, you need to synchronize the CPU
hotplug callbacks with the users of the compression stream to make sure
the stream is not freed under the user.
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
On (25/02/06 16:16), Yosry Ahmed wrote:
> > So for spin-lock contention - yes, but that lock really should not
> > be so visible.  Other than that we limit the number of compression
> > streams to the number of the CPUs and permit preemption, so it should
> > be the same as the "preemptible per-CPU" streams, roughly.
> 
> I think one other problem is that with a pool of streams guarded by a
> single lock all CPUs have to be serialized on that lock, even if there's
> enough streams for all CPUs in theory.

Yes, at the same time it guards list-first-entry, which is not
exceptionally expensive.  Yet, somehow, it still showed up on
Kairui's radar.

I think there was also a problem with how on-demand streams were
constructed - GFP_KERNEL allocations from a reclaim path, which
is a tiny bit problematic and deadlock-ish.

> > The difference, perhaps, is that we don't pre-allocate streams, but
> > allocate only as needed.  This has two sides: one side is that later
> > allocations can fail, but the other side is that we don't allocate
> > streams that we don't use.  Especially secondary streams (priority 1
> > and 2, which are used for recompression).  I didn't know it was possible
> > to use per-CPU data and still have preemption enabled at the same time.
> > So I'm not opposed to the idea of still having per-CPU streams and do
> > what zswap folks did.
> 
> Note that it's not a free lunch. If preemption is allowed there is
> nothing holding keeping the CPU that you're using its data, and it can
> be offlined. I see that zcomp_cpu_dead() would free the compression
> stream from under its user in this case.

Yes, I took the same approach as you did in zswap - we hold the mutex
that cpu-dead is blocked on for as long as the stream is being used.

struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
{
        for (;;) {
                struct zcomp_strm *zstrm = raw_cpu_ptr(comp->stream);

                /*
                 * Inspired by zswap
                 *
                 * stream is returned with ->mutex locked which prevents
                 * cpu_dead() from releasing this stream under us, however
                 * there is still a race window between raw_cpu_ptr() and
                 * mutex_lock(), during which we could have been migrated
                 * to a CPU that has already destroyed its stream.  If so
                 * then unlock and re-try on the current CPU.
                 */
                mutex_lock(&zstrm->lock);
                if (likely(zstrm->buffer))
                        return zstrm;
                mutex_unlock(&zstrm->lock);
        }
}

void zcomp_stream_put(struct zcomp_strm *zstrm)
{
        mutex_unlock(&zstrm->lock);
}

int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
        struct zcomp *comp = hlist_entry(node, struct zcomp, node);
        struct zcomp_strm *zstrm = per_cpu_ptr(comp->stream, cpu);

        mutex_lock(&zstrm->lock);
        zcomp_strm_free(comp, zstrm);
        mutex_unlock(&zstrm->lock);
        return 0;
}
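
A caller of the mutex-based API above would then look roughly like this
(just an illustrative sketch, not code from the series; error handling
omitted):

static int zram_compress_sketch(struct zram *zram, u32 prio,
                                const void *src, unsigned int *comp_len)
{
        /* returned with ->lock (mutex) held, so compression may sleep */
        struct zcomp_strm *zstrm = zcomp_stream_get(zram->comps[prio]);
        int ret = zcomp_compress(zram->comps[prio], zstrm, src, comp_len);

        /* unlocks ->lock, letting cpu-dead tear the stream down if needed */
        zcomp_stream_put(zstrm);
        return ret;
}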

> We had a similar problem recently in zswap and it took me a couple of
> iterations to properly fix it. In short, you need to synchronize the CPU
> hotplug callbacks with the users of the compression stream to make sure
> the stream is not freed under the user.

Agreed.
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
On (25/02/07 11:56), Sergey Senozhatsky wrote:
> struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
> {
>         for (;;) {
>                 struct zcomp_strm *zstrm = raw_cpu_ptr(comp->stream);
> 
>                 /*
>                  * Inspired by zswap
>                  *
>                  * stream is returned with ->mutex locked which prevents
>                  * cpu_dead() from releasing this stream under us, however
>                  * there is still a race window between raw_cpu_ptr() and
>                  * mutex_lock(), during which we could have been migrated
>                  * to a CPU that has already destroyed its stream.  If so
>                  * then unlock and re-try on the current CPU.
>                  */
>                 mutex_lock(&zstrm->lock);
>                 if (likely(zstrm->buffer))
>                         return zstrm;
>                 mutex_unlock(&zstrm->lock);
>         }
> }
> 
> void zcomp_stream_put(struct zcomp_strm *zstrm)
> {
>         mutex_unlock(&zstrm->lock);
> }
> 
> int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node)
> {
>         struct zcomp *comp = hlist_entry(node, struct zcomp, node);
>         struct zcomp_strm *zstrm = per_cpu_ptr(comp->stream, cpu);
> 
>         mutex_lock(&zstrm->lock);
>         zcomp_strm_free(comp, zstrm);
>         mutex_unlock(&zstrm->lock);
>         return 0;
> }

One downside of this is that it adds a mutex to the locking graph and
limits what zram can do.  In particular, we cannot do GFP_NOIO zsmalloc
handle allocations, because NOIO still does reclaim (it doesn't reach
the block layer), which grabs some locks internally, and this looks a
bit problematic:
	zram strm mutex -> zsmalloc GFP_NOIO -> reclaim
vs
	reclaim -> zram strm mutex -> zsmalloc

GFP_NOWAIT allocation has lower success chances.
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Yosry Ahmed 1 year ago
On Fri, Feb 07, 2025 at 03:12:59PM +0900, Sergey Senozhatsky wrote:
> On (25/02/07 11:56), Sergey Senozhatsky wrote:
> > struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
> > {
> >         for (;;) {
> >                 struct zcomp_strm *zstrm = raw_cpu_ptr(comp->stream);
> > 
> >                 /*
> >                  * Inspired by zswap
> >                  *
> >                  * stream is returned with ->mutex locked which prevents
> >                  * cpu_dead() from releasing this stream under us, however
> >                  * there is still a race window between raw_cpu_ptr() and
> >                  * mutex_lock(), during which we could have been migrated
> >                  * to a CPU that has already destroyed its stream.  If so
> >                  * then unlock and re-try on the current CPU.
> >                  */
> >                 mutex_lock(&zstrm->lock);
> >                 if (likely(zstrm->buffer))
> >                         return zstrm;
> >                 mutex_unlock(&zstrm->lock);
> >         }
> > }
> > 
> > void zcomp_stream_put(struct zcomp_strm *zstrm)
> > {
> >         mutex_unlock(&zstrm->lock);
> > }
> > 
> > int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node)
> > {
> >         struct zcomp *comp = hlist_entry(node, struct zcomp, node);
> >         struct zcomp_strm *zstrm = per_cpu_ptr(comp->stream, cpu);
> > 
> >         mutex_lock(&zstrm->lock);
> >         zcomp_strm_free(comp, zstrm);
> >         mutex_unlock(&zstrm->lock);
> >         return 0;
> > }
> 
> One downside of this is that this adds mutex to the locking graph and
> limits what zram can do.  In particular we cannot do GFP_NOIO zsmalloc
> handle allocations, because NOIO still does reclaim (doesn't reach the
> block layer) which grabs some locks internally and this looks a bit
> problematics:
> 	zram strm mutex -> zsmalloc GFP_NOIO -> reclaim
> vs
> 	reclaim -> zram strm mutex -> zsmalloc
> 
> GFP_NOWAIT allocation has lower success chances.

I assume this problem is unique to zram and not zswap because zram can
be used with normal IO (and then recurse through reclaim), while zswap
is only reachable through reclaim (which cannot recurse)?
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
On (25/02/07 21:07), Yosry Ahmed wrote:
> I assume this problem is unique to zram and not zswap because zram can
> be used with normal IO (and then recurse through reclaim), while zswap
> is only reachable thorugh reclaim (which cannot recurse)?

I think I figured it out.  It appears that the problem was in the
lockdep class key.  Both in zram and in zsmalloc I made the keys static:

 static void zram_slot_lock_init(struct zram *zram, u32 index)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
       static struct lock_class_key key;

        lockdep_init_map(&zram->table[index].lockdep_map, "zram-entry->lock",
                        &key, 0);
 #endif
 }

This would put the locks into the same class from lockdep's point of
view.  And that means that chains of locks from zram0 (mounted ext4)
and chains of locks from zram1 (swap device) would interleave, leading
to reports that made no sense: e.g. ext4 writeback, blkdev_read and
handle_mm_fault->do_swap_page() would be parts of the same lock chain *.

So I moved the lockdep class keys to per-zram-device and per-zsmalloc-pool
keys to separate the lockdep chains.  Looks like that did the trick.
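
The per-device variant looks roughly like this (a sketch, not the
actual patch; it assumes a "struct lock_class_key lockdep_key" member
added to struct zram and registered with lockdep_register_key() when
the device is created):

static void zram_slot_lock_init(struct zram *zram, u32 index)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
       /*
        * Hypothetical per-device key instead of a function-local static
        * one: zram0 and zram1 entry locks then land in different lockdep
        * classes, so their chains no longer interleave.
        */
       lockdep_init_map(&zram->table[index].lockdep_map, "zram-entry->lock",
                        &zram->lockdep_key, 0);
#endif
}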


*

[ 1714.787676] [    T172] ======================================================
[ 1714.788905] [    T172] WARNING: possible circular locking dependency detected
[ 1714.790114] [    T172] 6.14.0-rc1-next-20250207+ #936 Not tainted
[ 1714.791150] [    T172] ------------------------------------------------------
[ 1714.792356] [    T172] kworker/u96:4/172 is trying to acquire lock:
[ 1714.793421] [    T172] ffff888114cf0598 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}, at: page_vma_mapped_walk+0x5c0/0x960
[ 1714.795174] [    T172]
                          but task is already holding lock:
[ 1714.796453] [    T172] ffffe8ffff981cf8 (&zstrm->lock){+.+.}-{4:4}, at: zcomp_stream_get+0x20/0x40 [zram]
[ 1714.798098] [    T172]
                          which lock already depends on the new lock.

[ 1714.799901] [    T172] the existing dependency chain (in reverse order) is:
[ 1714.801469] [    T172]
                          -> #3 (&zstrm->lock){+.+.}-{4:4}:
[ 1714.802750] [    T172]        lock_acquire.part.0+0x63/0x1a0
[ 1714.803712] [    T172]        __mutex_lock+0xaa/0xd40
[ 1714.804574] [    T172]        zcomp_stream_get+0x20/0x40 [zram]
[ 1714.805578] [    T172]        zram_read_from_zspool+0x84/0x140 [zram]
[ 1714.806673] [    T172]        zram_bio_read+0x56/0x2c0 [zram]
[ 1714.807641] [    T172]        __submit_bio+0x12d/0x1c0
[ 1714.808511] [    T172]        __submit_bio_noacct+0x7f/0x200
[ 1714.809468] [    T172]        mpage_readahead+0xdd/0x110
[ 1714.810360] [    T172]        read_pages+0x7a/0x1b0
[ 1714.811182] [    T172]        page_cache_ra_unbounded+0x19a/0x210
[ 1714.812215] [    T172]        force_page_cache_ra+0x92/0xb0
[ 1714.813161] [    T172]        filemap_get_pages+0x11f/0x440
[ 1714.814098] [    T172]        filemap_read+0xf6/0x400
[ 1714.814945] [    T172]        blkdev_read_iter+0x66/0x130
[ 1714.815860] [    T172]        vfs_read+0x266/0x370
[ 1714.816674] [    T172]        ksys_read+0x66/0xe0
[ 1714.817477] [    T172]        do_syscall_64+0x64/0x130
[ 1714.818344] [    T172]        entry_SYSCALL_64_after_hwframe+0x4b/0x53
[ 1714.819444] [    T172]
                          -> #2 (zram-entry->lock){+.+.}-{0:0}:
[ 1714.820769] [    T172]        lock_acquire.part.0+0x63/0x1a0
[ 1714.821734] [    T172]        zram_slot_free_notify+0x5c/0x80 [zram]
[ 1714.822811] [    T172]        swap_entry_range_free+0x115/0x1a0
[ 1714.823812] [    T172]        cluster_swap_free_nr+0xb9/0x150
[ 1714.824787] [    T172]        do_swap_page+0x80d/0xea0
[ 1714.825661] [    T172]        __handle_mm_fault+0x538/0x7a0
[ 1714.826592] [    T172]        handle_mm_fault+0xdf/0x240
[ 1714.827485] [    T172]        do_user_addr_fault+0x152/0x700
[ 1714.828432] [    T172]        exc_page_fault+0x66/0x1f0
[ 1714.829317] [    T172]        asm_exc_page_fault+0x22/0x30
[ 1714.830235] [    T172]        do_sys_poll+0x213/0x260
[ 1714.831090] [    T172]        __x64_sys_poll+0x44/0x190
[ 1714.831972] [    T172]        do_syscall_64+0x64/0x130
[ 1714.832846] [    T172]        entry_SYSCALL_64_after_hwframe+0x4b/0x53
[ 1714.833949] [    T172]
                          -> #1 (&cluster_info[i].lock){+.+.}-{3:3}:
[ 1714.835354] [    T172]        lock_acquire.part.0+0x63/0x1a0
[ 1714.836307] [    T172]        _raw_spin_lock+0x2c/0x40
[ 1714.837194] [    T172]        __swap_duplicate+0x5e/0x150
[ 1714.838123] [    T172]        swap_duplicate+0x1c/0x40
[ 1714.838980] [    T172]        try_to_unmap_one+0x6c4/0xd60
[ 1714.839901] [    T172]        rmap_walk_anon+0xe7/0x210
[ 1714.840774] [    T172]        try_to_unmap+0x76/0x80
[ 1714.841613] [    T172]        shrink_folio_list+0x487/0xad0
[ 1714.842546] [    T172]        evict_folios+0x247/0x800
[ 1714.843404] [    T172]        try_to_shrink_lruvec+0x1cd/0x2b0
[ 1714.844382] [    T172]        lru_gen_shrink_node+0xc3/0x190
[ 1714.845335] [    T172]        do_try_to_free_pages+0xee/0x4b0
[ 1714.846292] [    T172]        try_to_free_pages+0xea/0x280
[ 1714.847208] [    T172]        __alloc_pages_slowpath.constprop.0+0x296/0x970
[ 1714.848391] [    T172]        __alloc_frozen_pages_noprof+0x2b3/0x300
[ 1714.849475] [    T172]        __folio_alloc_noprof+0x10/0x30
[ 1714.850422] [    T172]        do_anonymous_page+0x69/0x4b0
[ 1714.851337] [    T172]        __handle_mm_fault+0x557/0x7a0
[ 1714.852265] [    T172]        handle_mm_fault+0xdf/0x240
[ 1714.853153] [    T172]        do_user_addr_fault+0x152/0x700
[ 1714.854099] [    T172]        exc_page_fault+0x66/0x1f0
[ 1714.854976] [    T172]        asm_exc_page_fault+0x22/0x30
[ 1714.855897] [    T172]        rep_movs_alternative+0x3a/0x60
[ 1714.856851] [    T172]        _copy_to_iter+0xe2/0x7a0
[ 1714.857719] [    T172]        get_random_bytes_user+0x95/0x150
[ 1714.858712] [    T172]        vfs_read+0x266/0x370
[ 1714.859512] [    T172]        ksys_read+0x66/0xe0
[ 1714.860301] [    T172]        do_syscall_64+0x64/0x130
[ 1714.861167] [    T172]        entry_SYSCALL_64_after_hwframe+0x4b/0x53
[ 1714.862270] [    T172]
                          -> #0 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}:
[ 1714.863656] [    T172]        check_prev_add+0xeb/0xca0
[ 1714.864532] [    T172]        __lock_acquire+0xf56/0x12c0
[ 1714.865446] [    T172]        lock_acquire.part.0+0x63/0x1a0
[ 1714.866399] [    T172]        _raw_spin_lock+0x2c/0x40
[ 1714.867258] [    T172]        page_vma_mapped_walk+0x5c0/0x960
[ 1714.868235] [    T172]        folio_referenced_one+0xd0/0x4a0
[ 1714.869205] [    T172]        __rmap_walk_file+0xbe/0x1b0
[ 1714.870119] [    T172]        folio_referenced+0x10b/0x140
[ 1714.871039] [    T172]        shrink_folio_list+0x72c/0xad0
[ 1714.871975] [    T172]        evict_folios+0x247/0x800
[ 1714.872851] [    T172]        try_to_shrink_lruvec+0x1cd/0x2b0
[ 1714.873842] [    T172]        lru_gen_shrink_node+0xc3/0x190
[ 1714.874806] [    T172]        do_try_to_free_pages+0xee/0x4b0
[ 1714.875779] [    T172]        try_to_free_pages+0xea/0x280
[ 1714.876699] [    T172]        __alloc_pages_slowpath.constprop.0+0x296/0x970
[ 1714.877897] [    T172]        __alloc_frozen_pages_noprof+0x2b3/0x300
[ 1714.878977] [    T172]        __alloc_pages_noprof+0xa/0x20
[ 1714.879907] [    T172]        alloc_zspage+0xe6/0x2c0 [zsmalloc]
[ 1714.880924] [    T172]        zs_malloc+0xd2/0x2b0 [zsmalloc]
[ 1714.881881] [    T172]        zram_write_page+0xfc/0x300 [zram]
[ 1714.882873] [    T172]        zram_bio_write+0xd1/0x1c0 [zram]
[ 1714.883845] [    T172]        __submit_bio+0x12d/0x1c0
[ 1714.884712] [    T172]        __submit_bio_noacct+0x7f/0x200
[ 1714.885667] [    T172]        ext4_io_submit+0x20/0x40
[ 1714.886532] [    T172]        ext4_do_writepages+0x3e3/0x8b0
[ 1714.887482] [    T172]        ext4_writepages+0xe8/0x280
[ 1714.888377] [    T172]        do_writepages+0xcf/0x260
[ 1714.889247] [    T172]        __writeback_single_inode+0x56/0x350
[ 1714.890273] [    T172]        writeback_sb_inodes+0x227/0x550
[ 1714.891239] [    T172]        __writeback_inodes_wb+0x4c/0xe0
[ 1714.892202] [    T172]        wb_writeback+0x2f2/0x3f0
[ 1714.893071] [    T172]        wb_do_writeback+0x227/0x2a0
[ 1714.893976] [    T172]        wb_workfn+0x56/0x1b0
[ 1714.894777] [    T172]        process_one_work+0x1eb/0x570
[ 1714.895698] [    T172]        worker_thread+0x1d1/0x3b0
[ 1714.896571] [    T172]        kthread+0xf9/0x200
[ 1714.897356] [    T172]        ret_from_fork+0x2d/0x50
[ 1714.898214] [    T172]        ret_from_fork_asm+0x11/0x20
[ 1714.899142] [    T172]
                          other info that might help us debug this:

[ 1714.900906] [    T172] Chain exists of:
                            ptlock_ptr(ptdesc)#2 --> zram-entry->lock --> &zstrm->lock

[ 1714.903183] [    T172]  Possible unsafe locking scenario:

[ 1714.904463] [    T172]        CPU0                    CPU1
[ 1714.905380] [    T172]        ----                    ----
[ 1714.906293] [    T172]   lock(&zstrm->lock);
[ 1714.907006] [    T172]                                lock(zram-entry->lock);
[ 1714.908204] [    T172]                                lock(&zstrm->lock);
[ 1714.909347] [    T172]   lock(ptlock_ptr(ptdesc)#2);
[ 1714.910179] [    T172]
                           *** DEADLOCK ***

[ 1714.911570] [    T172] 7 locks held by kworker/u96:4/172:
[ 1714.912472] [    T172]  #0: ffff88810165d548 ((wq_completion)writeback){+.+.}-{0:0}, at: process_one_work+0x433/0x570
[ 1714.914273] [    T172]  #1: ffffc90000683e40 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}, at: process_one_work+0x1ad/0x570
[ 1714.916339] [    T172]  #2: ffff88810b93d0e0 (&type->s_umount_key#28){++++}-{4:4}, at: super_trylock_shared+0x16/0x50
[ 1714.918141] [    T172]  #3: ffff88810b93ab50 (&sbi->s_writepages_rwsem){.+.+}-{0:0}, at: do_writepages+0xcf/0x260
[ 1714.919877] [    T172]  #4: ffffe8ffff981cf8 (&zstrm->lock){+.+.}-{4:4}, at: zcomp_stream_get+0x20/0x40 [zram]
[ 1714.921573] [    T172]  #5: ffff888106809900 (&mapping->i_mmap_rwsem){++++}-{4:4}, at: __rmap_walk_file+0x161/0x1b0
[ 1714.923347] [    T172]  #6: ffffffff82347d40 (rcu_read_lock){....}-{1:3}, at: ___pte_offset_map+0x26/0x1b0
[ 1714.924981] [    T172]
                          stack backtrace:
[ 1714.925998] [    T172] CPU: 6 UID: 0 PID: 172 Comm: kworker/u96:4 Not tainted 6.14.0-rc1-next-20250207+ #936
[ 1714.926005] [    T172] Workqueue: writeback wb_workfn (flush-251:0)
[ 1714.926009] [    T172] Call Trace:
[ 1714.926013] [    T172]  <TASK>
[ 1714.926015] [    T172]  dump_stack_lvl+0x57/0x80
[ 1714.926018] [    T172]  print_circular_bug.cold+0x38/0x45
[ 1714.926021] [    T172]  check_noncircular+0x12e/0x150
[ 1714.926025] [    T172]  check_prev_add+0xeb/0xca0
[ 1714.926027] [    T172]  ? add_chain_cache+0x10c/0x480
[ 1714.926029] [    T172]  __lock_acquire+0xf56/0x12c0
[ 1714.926032] [    T172]  lock_acquire.part.0+0x63/0x1a0
[ 1714.926035] [    T172]  ? page_vma_mapped_walk+0x5c0/0x960
[ 1714.926036] [    T172]  ? page_vma_mapped_walk+0x5c0/0x960
[ 1714.926037] [    T172]  _raw_spin_lock+0x2c/0x40
[ 1714.926040] [    T172]  ? page_vma_mapped_walk+0x5c0/0x960
[ 1714.926041] [    T172]  page_vma_mapped_walk+0x5c0/0x960
[ 1714.926043] [    T172]  folio_referenced_one+0xd0/0x4a0
[ 1714.926046] [    T172]  __rmap_walk_file+0xbe/0x1b0
[ 1714.926047] [    T172]  folio_referenced+0x10b/0x140
[ 1714.926050] [    T172]  ? page_mkclean_one+0xc0/0xc0
[ 1714.926051] [    T172]  ? folio_get_anon_vma+0x220/0x220
[ 1714.926052] [    T172]  ? __traceiter_remove_migration_pte+0x50/0x50
[ 1714.926054] [    T172]  shrink_folio_list+0x72c/0xad0
[ 1714.926060] [    T172]  evict_folios+0x247/0x800
[ 1714.926064] [    T172]  try_to_shrink_lruvec+0x1cd/0x2b0
[ 1714.926066] [    T172]  lru_gen_shrink_node+0xc3/0x190
[ 1714.926068] [    T172]  ? mark_usage+0x61/0x110
[ 1714.926071] [    T172]  do_try_to_free_pages+0xee/0x4b0
[ 1714.926073] [    T172]  try_to_free_pages+0xea/0x280
[ 1714.926077] [    T172]  __alloc_pages_slowpath.constprop.0+0x296/0x970
[ 1714.926079] [    T172]  ? __lock_acquire+0x3d1/0x12c0
[ 1714.926081] [    T172]  ? get_page_from_freelist+0xd9/0x680
[ 1714.926083] [    T172]  ? match_held_lock+0x30/0xa0
[ 1714.926085] [    T172]  __alloc_frozen_pages_noprof+0x2b3/0x300
[ 1714.926088] [    T172]  __alloc_pages_noprof+0xa/0x20
[ 1714.926090] [    T172]  alloc_zspage+0xe6/0x2c0 [zsmalloc]
[ 1714.926092] [    T172]  ? zs_malloc+0xc5/0x2b0 [zsmalloc]
[ 1714.926094] [    T172]  ? __lock_release.isra.0+0x5e/0x180
[ 1714.926096] [    T172]  zs_malloc+0xd2/0x2b0 [zsmalloc]
[ 1714.926099] [    T172]  zram_write_page+0xfc/0x300 [zram]
[ 1714.926102] [    T172]  zram_bio_write+0xd1/0x1c0 [zram]
[ 1714.926105] [    T172]  __submit_bio+0x12d/0x1c0
[ 1714.926107] [    T172]  ? jbd2_journal_stop+0x145/0x320
[ 1714.926109] [    T172]  ? kmem_cache_free+0xb5/0x3e0
[ 1714.926112] [    T172]  ? lock_release+0x6b/0x130
[ 1714.926115] [    T172]  ? __submit_bio_noacct+0x7f/0x200
[ 1714.926116] [    T172]  __submit_bio_noacct+0x7f/0x200
[ 1714.926118] [    T172]  ext4_io_submit+0x20/0x40
[ 1714.926120] [    T172]  ext4_do_writepages+0x3e3/0x8b0
[ 1714.926122] [    T172]  ? lock_acquire.part.0+0x63/0x1a0
[ 1714.926124] [    T172]  ? do_writepages+0xcf/0x260
[ 1714.926127] [    T172]  ? ext4_writepages+0xe8/0x280
[ 1714.926128] [    T172]  ext4_writepages+0xe8/0x280
[ 1714.926130] [    T172]  do_writepages+0xcf/0x260
[ 1714.926133] [    T172]  ? find_held_lock+0x2b/0x80
[ 1714.926134] [    T172]  ? writeback_sb_inodes+0x1b8/0x550
[ 1714.926136] [    T172]  __writeback_single_inode+0x56/0x350
[ 1714.926138] [    T172]  writeback_sb_inodes+0x227/0x550
[ 1714.926143] [    T172]  __writeback_inodes_wb+0x4c/0xe0
[ 1714.926145] [    T172]  wb_writeback+0x2f2/0x3f0
[ 1714.926147] [    T172]  wb_do_writeback+0x227/0x2a0
[ 1714.926150] [    T172]  wb_workfn+0x56/0x1b0
[ 1714.926151] [    T172]  process_one_work+0x1eb/0x570
[ 1714.926154] [    T172]  worker_thread+0x1d1/0x3b0
[ 1714.926157] [    T172]  ? bh_worker+0x250/0x250
[ 1714.926159] [    T172]  kthread+0xf9/0x200
[ 1714.926161] [    T172]  ? kthread_fetch_affinity.isra.0+0x40/0x40
[ 1714.926163] [    T172]  ret_from_fork+0x2d/0x50
[ 1714.926165] [    T172]  ? kthread_fetch_affinity.isra.0+0x40/0x40
[ 1714.926166] [    T172]  ret_from_fork_asm+0x11/0x20
[ 1714.926170] [    T172]  </TASK>
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
On (25/02/09 01:20), Sergey Senozhatsky wrote:
> So I moved lockdep class keys to per-zram device and per-zsmalloc pool
> to separate the lockdep chains.  Looks like that did the trick.

Also need to indicate "try lock":

drivers/block/zram/zram_drv.c
@@ -86,7 +86,7 @@ static __must_check bool zram_slot_try_lock(struct zram *zram, u32 index)
 
        if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-               mutex_acquire(&zram->table[index].lockdep_map, 0, 0, _RET_IP_);
+               mutex_acquire(&zram->table[index].lockdep_map, 0, 1, _RET_IP_);
 #endif
                return true;
        }

and

mm/zsmalloc.c
@@ -388,7 +388,7 @@ static __must_check bool zspage_try_write_lock(struct zspage *zspage)
        preempt_disable();
        if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-               rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
+               rwsem_acquire(&zspage->lockdep_map, 0, 1, _RET_IP_);
 #endif
                return true;
        }
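
(For context: the third argument to mutex_acquire()/rwsem_acquire() is
lockdep's "trylock" flag.  Roughly, per include/linux/lockdep.h:

	/* mutex_acquire(map, subclass, trylock, ip)
	 * rwsem_acquire(map, subclass, trylock, ip)
	 *
	 * trylock != 0 tells lockdep that this acquisition can fail
	 * instead of blocking, so lockdep does not record a new
	 * blocking dependency edge for it.
	 */

so the two hunks above only change how lockdep models these paths, not
the locking itself.)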
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
On (25/02/09 15:22), Sergey Senozhatsky wrote:
> On (25/02/09 01:20), Sergey Senozhatsky wrote:
> > So I moved lockdep class keys to per-zram device and per-zsmalloc pool
> > to separate the lockdep chains.  Looks like that did the trick.
> 
> Also need to indicate "try lock":
> 
> drivers/block/zram/zram_drv.c
> @@ -86,7 +86,7 @@ static __must_check bool zram_slot_try_lock(struct zram *zram, u32 index)
>  
>         if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) {
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
> -               mutex_acquire(&zram->table[index].lockdep_map, 0, 0, _RET_IP_);
> +               mutex_acquire(&zram->table[index].lockdep_map, 0, 1, _RET_IP_);
>  #endif
>                 return true;
>         }
> 
> and
> 
> mm/zsmalloc.c
> @@ -388,7 +388,7 @@ static __must_check bool zspage_try_write_lock(struct zspage *zspage)
>         preempt_disable();
>         if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
> -               rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
> +               rwsem_acquire(&zspage->lockdep_map, 0, 1, _RET_IP_);
>  #endif
>                 return true;
>         }

I guess this was the point lockdep was making.

lockdep knows about strm->lock -> shrink_folio_list(), which reaches
ptlock via folio_referenced() and the cluster_info lock via
try_to_unmap().  Then lockdep knows about zram-entry->lock ->
strm->lock and, most importantly, lockdep knows about cluster_info
lock -> zram_slot_free_notify() -> zram-entry->lock.  What lockdep
doesn't know is that zram_slot_free_notify() only takes a try-lock,
so we don't re-enter zram unconditionally.
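
To make that concrete, the free-notify path is roughly the following
(helper names approximated, not the literal driver code):

static void zram_slot_free_notify(struct block_device *bdev,
				  unsigned long index)
{
	struct zram *zram = bdev->bd_disk->private_data;

	/* Never block here: only *try* to take the entry lock. */
	if (!zram_slot_try_lock(zram, index))
		return;		/* contended: bail out, do not block */

	zram_free_page(zram, index);
	zram_slot_unlock(zram, index);
}

so the cluster_info lock -> zram-entry->lock edge can never actually
block, which is exactly what the trylock annotation now tells lockdep.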
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
On (25/02/09 01:20), Sergey Senozhatsky wrote:
> So I moved lockdep class keys to per-zram device and per-zsmalloc pool
> to separate the lockdep chains.  Looks like that did the trick.
> 
> 
[..]
> 
> [ 1714.900906] [    T172] Chain exists of:
>                             ptlock_ptr(ptdesc)#2 --> zram-entry->lock --> &zstrm->lock
> 
> [ 1714.903183] [    T172]  Possible unsafe locking scenario:
> 
> [ 1714.904463] [    T172]        CPU0                    CPU1
> [ 1714.905380] [    T172]        ----                    ----
> [ 1714.906293] [    T172]   lock(&zstrm->lock);
> [ 1714.907006] [    T172]                                lock(zram-entry->lock);
> [ 1714.908204] [    T172]                                lock(&zstrm->lock);
> [ 1714.909347] [    T172]   lock(ptlock_ptr(ptdesc)#2);
> [ 1714.910179] [    T172]
>                            *** DEADLOCK ***

Actually, let me look at this more.  Maybe I haven't figured it out yet.
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
On (25/02/06 16:22), Sergey Senozhatsky wrote:
> I didn't know it was possible to use per-CPU data and still have
> preemption enabled at the same time.  So I'm not opposed to the
> idea of still having per-CPU streams and doing what zswap folks did.

Maybe that's actually a preferable option.  On-demand allocation of
streams has the problem that the streams' constructors need to use
proper GFP flags (they still use GFP_KERNEL, which is wrong), and so
on.  Keeping things the way they are (per-CPU) but allowing
preemption is likely a safer and nicer option.
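
(If we did keep on-demand allocation, the constructor would probably
have to run under a NOIO scope, since the write path can be reached
from reclaim/writeback.  A sketch of what I mean, with a made-up
wrapper name:

static struct zcomp_strm *alloc_stream_safe(struct zcomp *comp)
{
	struct zcomp_strm *strm;
	unsigned int noio_flags;

	/* vmalloc/crypto allocations don't take gfp flags directly,
	 * so use the scope API to keep the whole constructor from
	 * recursing back into block I/O. */
	noio_flags = memalloc_noio_save();
	strm = zcomp_strm_alloc(comp);	/* the series' constructor */
	memalloc_noio_restore(noio_flags);

	return strm;
}

just to illustrate the kind of care on-demand allocation would need.)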
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Yosry Ahmed 1 year ago
On Mon, Feb 03, 2025 at 12:49:42PM +0900, Sergey Senozhatsky wrote:
> On (25/02/01 17:21), Kairui Song wrote:
> > This seems will cause a huge regression of performance on multi core
> > systems, this is especially significant as the number of concurrent
> > tasks increases:
> > 
> > Test build linux kernel using ZRAM as SWAP (1G memcg):
> > 
> > Before:
> > + /usr/bin/time make -s -j48
> > 2495.77user 2604.77system 2:12.95elapsed 3836%CPU (0avgtext+0avgdata
> > 863304maxresident)k
> > 
> > After:
> > + /usr/bin/time make -s -j48
> > 2403.60user 6676.09system 3:38.22elapsed 4160%CPU (0avgtext+0avgdata
> > 863276maxresident)k
> 
> How many CPUs do you have?  I assume preemption gets in the way, which
> is sort of expected, to be honest...  Using per-CPU compression streams
> disables preemption and uses the CPU exclusively, at the price of other
> tasks not being able to run.  I do tend to think that I made a mistake
> by switching zram to per-CPU compression streams.

FWIW, I am not familiar at all with the zram code but zswap uses per-CPU
acomp contexts with a mutex instead of a spinlock. So the task uses the
context of the CPU that it started on, but it can be preempted or
migrated and end up running on a different CPU. This means that
contention is still possible, but probably much lower than having a
shared pool of contexts that all CPUs compete on.

Again, this could be irrelevant as I am not very familiar with the zram
code, just thought this may be useful.
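
A minimal sketch of that arrangement (made-up names, not zswap's
actual code), in case it helps:

struct stream {
	struct mutex lock;	/* serializes use of this context */
	void *workmem;
};

static struct stream __percpu *streams;

static struct stream *stream_get(void)
{
	/* Start from the local CPU's context purely as a hint; the
	 * mutex, not preempt-disable, provides exclusion, so the
	 * holder may sleep or get migrated while (de)compressing. */
	struct stream *s = raw_cpu_ptr(streams);

	mutex_lock(&s->lock);
	return s;
}

static void stream_put(struct stream *s)
{
	mutex_unlock(&s->lock);
}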
Re: [PATCHv4 02/17] zram: do not use per-CPU compression streams
Posted by Sergey Senozhatsky 1 year ago
On (25/02/03 21:00), Yosry Ahmed wrote:
> On Mon, Feb 03, 2025 at 12:49:42PM +0900, Sergey Senozhatsky wrote:
> > On (25/02/01 17:21), Kairui Song wrote:
> FWIW, I am not familiar at all with the zram code but zswap uses per-CPU
> acomp contexts with a mutex instead of a spinlock. So the task uses the
> context of the CPU that it started on, but it can be preempted or
> migrated and end up running on a different CPU.

Thank you for the idea.  We couldn't do that before (in zram): in a
number of cases the per-CPU stream was taken from atomic context
(under the zram table entry spinlock/bit-spinlock), but it's possible
now because the entry lock is preemptible.