[v5] zsmalloc/zram: there be preemption

[PATCH v5 01/18] zram: sleepable entry locking

Posted by Sergey Senozhatsky 12 months ago

Concurrent modifications of meta table entries is now handled
by per-entry spin-lock.  This has a number of shortcomings.

First, this imposes atomic requirements on compression backends.
zram can call both zcomp_compress() and zcomp_decompress() under
entry spin-lock, which implies that we can use only compression
algorithms that don't schedule/sleep/wait during compression and
decompression.  This, for instance, makes it impossible to use
some of the ASYNC compression algorithms (H/W compression, etc.)
implementations.

Second, this can potentially trigger watchdogs.  For example,
entry re-compression with secondary algorithms is performed
under entry spin-lock.  Given that we chain secondary
compression algorithms and that some of them can be configured
for best compression ratio (and worst compression speed) zram
can stay under spin-lock for quite some time.

Having a per-entry mutex (or, for instance, a rw-semaphore)
significantly increases sizeof() of each entry and hence the
meta table.  Therefore entry locking returns back to bit
locking, as before, however, this time also preempt-rt friendly,
because if waits-on-bit instead of spinning-on-bit.  Lock owners
are also now permitted to schedule, which is a first step on the
path of making zram non-atomic.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 drivers/block/zram/zram_drv.c | 65 ++++++++++++++++++++++++++++-------
 drivers/block/zram/zram_drv.h | 20 +++++++----
 2 files changed, 67 insertions(+), 18 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9f5020b077c5..3708436f1d1f 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -58,19 +58,57 @@ static void zram_free_page(struct zram *zram, size_t index);
 static int zram_read_from_zspool(struct zram *zram, struct page *page,
 				 u32 index);
 
-static int zram_slot_trylock(struct zram *zram, u32 index)
+static void zram_slot_lock_init(struct zram *zram, u32 index)
 {
-	return spin_trylock(&zram->table[index].lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	lockdep_init_map(&zram->table[index].lockdep_map, "zram-entry->lock",
+			 &zram->table_lockdep_key, 0);
+#endif
+}
+
+/*
+ * entry locking rules:
+ *
+ * 1) Lock is exclusive
+ *
+ * 2) lock() function can sleep waiting for the lock
+ *
+ * 3) Lock owner can sleep
+ *
+ * 4) Use TRY lock variant when in atomic context
+ *    - must check return value and handle locking failers
+ */
+static __must_check bool zram_slot_try_lock(struct zram *zram, u32 index)
+{
+	unsigned long *lock = &zram->table[index].flags;
+
+	if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) {
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+		mutex_acquire(&zram->table[index].lockdep_map, 0, 1, _RET_IP_);
+#endif
+		return true;
+	}
+	return false;
 }
 
 static void zram_slot_lock(struct zram *zram, u32 index)
 {
-	spin_lock(&zram->table[index].lock);
+	unsigned long *lock = &zram->table[index].flags;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	mutex_acquire(&zram->table[index].lockdep_map, 0, 0, _RET_IP_);
+#endif
+	wait_on_bit_lock(lock, ZRAM_ENTRY_LOCK, TASK_UNINTERRUPTIBLE);
 }
 
 static void zram_slot_unlock(struct zram *zram, u32 index)
 {
-	spin_unlock(&zram->table[index].lock);
+	unsigned long *lock = &zram->table[index].flags;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	mutex_release(&zram->table[index].lockdep_map, _RET_IP_);
+#endif
+	clear_and_wake_up_bit(ZRAM_ENTRY_LOCK, lock);
 }
 
 static inline bool init_done(struct zram *zram)
@@ -93,7 +131,6 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
 	zram->table[index].handle = handle;
 }
 
-/* flag operations require table entry bit_spin_lock() being held */
 static bool zram_test_flag(struct zram *zram, u32 index,
 			enum zram_pageflags flag)
 {
@@ -1473,15 +1510,11 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
 		huge_class_size = zs_huge_class_size(zram->mem_pool);
 
 	for (index = 0; index < num_pages; index++)
-		spin_lock_init(&zram->table[index].lock);
+		zram_slot_lock_init(zram, index);
+
 	return true;
 }
 
-/*
- * To protect concurrent access to the same index entry,
- * caller should hold this table index entry's bit_spinlock to
- * indicate this index entry is accessing.
- */
 static void zram_free_page(struct zram *zram, size_t index)
 {
 	unsigned long handle;
@@ -2321,7 +2354,7 @@ static void zram_slot_free_notify(struct block_device *bdev,
 	zram = bdev->bd_disk->private_data;
 
 	atomic64_inc(&zram->stats.notify_free);
-	if (!zram_slot_trylock(zram, index)) {
+	if (!zram_slot_try_lock(zram, index)) {
 		atomic64_inc(&zram->stats.miss_free);
 		return;
 	}
@@ -2625,6 +2658,10 @@ static int zram_add(void)
 	if (ret)
 		goto out_cleanup_disk;
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	lockdep_register_key(&zram->table_lockdep_key);
+#endif
+
 	zram_debugfs_register(zram);
 	pr_info("Added device: %s\n", zram->disk->disk_name);
 	return device_id;
@@ -2681,6 +2718,10 @@ static int zram_remove(struct zram *zram)
 	 */
 	zram_reset_device(zram);
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	lockdep_unregister_key(&zram->table_lockdep_key);
+#endif
+
 	put_disk(zram->disk);
 	kfree(zram);
 	return 0;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index db78d7c01b9a..63b933059cb6 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -28,7 +28,6 @@
 #define ZRAM_SECTOR_PER_LOGICAL_BLOCK	\
 	(1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))
 
-
 /*
  * ZRAM is mainly used for memory efficiency so we want to keep memory
  * footprint small and thus squeeze size and zram pageflags into a flags
@@ -46,6 +45,7 @@
 /* Flags for zram pages (table[page_no].flags) */
 enum zram_pageflags {
 	ZRAM_SAME = ZRAM_FLAG_SHIFT,	/* Page consists the same element */
+	ZRAM_ENTRY_LOCK, /* entry access lock bit */
 	ZRAM_WB,	/* page is stored on backing_device */
 	ZRAM_PP_SLOT,	/* Selected for post-processing */
 	ZRAM_HUGE,	/* Incompressible page */
@@ -58,13 +58,18 @@ enum zram_pageflags {
 	__NR_ZRAM_PAGEFLAGS,
 };
 
-/*-- Data structures */
-
-/* Allocated for each disk page */
+/*
+ * Allocated for each disk page.  We use bit-lock (ZRAM_ENTRY_LOCK bit
+ * of flags) to save memory.  There can be plenty of entries and standard
+ * locking primitives (e.g. mutex) will significantly increase sizeof()
+ * of each entry and hence of the meta table.
+ */
 struct zram_table_entry {
 	unsigned long handle;
-	unsigned int flags;
-	spinlock_t lock;
+	unsigned long flags;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map lockdep_map;
+#endif
 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
 	ktime_t ac_time;
 #endif
@@ -137,5 +142,8 @@ struct zram {
 	struct dentry *debugfs_dir;
 #endif
 	atomic_t pp_in_progress;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key table_lockdep_key;
+#endif
 };
 #endif
-- 
2.48.1.502.g6dc24dfdaf-goog

Re: [PATCH v5 01/18] zram: sleepable entry locking

Posted by Andrew Morton 12 months ago

On Wed, 12 Feb 2025 15:26:59 +0900 Sergey Senozhatsky <senozhatsky@chromium.org> wrote:

> Concurrent modifications of meta table entries is now handled
> by per-entry spin-lock.  This has a number of shortcomings.
> 
> First, this imposes atomic requirements on compression backends.
> zram can call both zcomp_compress() and zcomp_decompress() under
> entry spin-lock, which implies that we can use only compression
> algorithms that don't schedule/sleep/wait during compression and
> decompression.  This, for instance, makes it impossible to use
> some of the ASYNC compression algorithms (H/W compression, etc.)
> implementations.
> 
> Second, this can potentially trigger watchdogs.  For example,
> entry re-compression with secondary algorithms is performed
> under entry spin-lock.  Given that we chain secondary
> compression algorithms and that some of them can be configured
> for best compression ratio (and worst compression speed) zram
> can stay under spin-lock for quite some time.
> 
> Having a per-entry mutex (or, for instance, a rw-semaphore)
> significantly increases sizeof() of each entry and hence the
> meta table.  Therefore entry locking returns back to bit
> locking, as before, however, this time also preempt-rt friendly,
> because if waits-on-bit instead of spinning-on-bit.  Lock owners
> are also now permitted to schedule, which is a first step on the
> path of making zram non-atomic.
> 
> ...
>
> -static int zram_slot_trylock(struct zram *zram, u32 index)
> +static void zram_slot_lock_init(struct zram *zram, u32 index)
>  {
> -	return spin_trylock(&zram->table[index].lock);
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	lockdep_init_map(&zram->table[index].lockdep_map, "zram-entry->lock",
> +			 &zram->table_lockdep_key, 0);
> +#endif
> +}
> +
>  
> ...
>
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	lockdep_register_key(&zram->table_lockdep_key);
> +#endif
> +

Please check whether all the ifdefs are needed - some of these things
have CONFIG_LOCKDEP=n stubs.

Re: [PATCH v5 01/18] zram: sleepable entry locking

Posted by Sergey Senozhatsky 12 months ago

On (25/02/12 16:08), Andrew Morton wrote:
> > Concurrent modifications of meta table entries is now handled
> > by per-entry spin-lock.  This has a number of shortcomings.
> > 
> > First, this imposes atomic requirements on compression backends.
> > zram can call both zcomp_compress() and zcomp_decompress() under
> > entry spin-lock, which implies that we can use only compression
> > algorithms that don't schedule/sleep/wait during compression and
> > decompression.  This, for instance, makes it impossible to use
> > some of the ASYNC compression algorithms (H/W compression, etc.)
> > implementations.
> > 
> > Second, this can potentially trigger watchdogs.  For example,
> > entry re-compression with secondary algorithms is performed
> > under entry spin-lock.  Given that we chain secondary
> > compression algorithms and that some of them can be configured
> > for best compression ratio (and worst compression speed) zram
> > can stay under spin-lock for quite some time.
> > 
> > Having a per-entry mutex (or, for instance, a rw-semaphore)
> > significantly increases sizeof() of each entry and hence the
> > meta table.  Therefore entry locking returns back to bit
> > locking, as before, however, this time also preempt-rt friendly,
> > because if waits-on-bit instead of spinning-on-bit.  Lock owners
> > are also now permitted to schedule, which is a first step on the
> > path of making zram non-atomic.
> > 
> > ...
> >
> > -static int zram_slot_trylock(struct zram *zram, u32 index)
> > +static void zram_slot_lock_init(struct zram *zram, u32 index)
> >  {
> > -	return spin_trylock(&zram->table[index].lock);
> > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > +	lockdep_init_map(&zram->table[index].lockdep_map, "zram-entry->lock",
> > +			 &zram->table_lockdep_key, 0);
> > +#endif
> > +}
> > +
> >  
> > ...
> >
> > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > +	lockdep_register_key(&zram->table_lockdep_key);
> > +#endif
> > +
> 
> Please check whether all the ifdefs are needed - some of these things
> have CONFIG_LOCKDEP=n stubs.

Will do.

Re: [PATCH v5 01/18] zram: sleepable entry locking

Posted by Sergey Senozhatsky 12 months ago

On (25/02/13 09:52), Sergey Senozhatsky wrote:
> > > -static int zram_slot_trylock(struct zram *zram, u32 index)
> > > +static void zram_slot_lock_init(struct zram *zram, u32 index)
> > >  {
> > > -	return spin_trylock(&zram->table[index].lock);
> > > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > +	lockdep_init_map(&zram->table[index].lockdep_map, "zram-entry->lock",
> > > +			 &zram->table_lockdep_key, 0);
> > > +#endif
> > > +}
> > > +
> > >  
> > > ...
> > >
> > > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > +	lockdep_register_key(&zram->table_lockdep_key);
> > > +#endif
> > > +
> > 
> > Please check whether all the ifdefs are needed - some of these things
> > have CONFIG_LOCKDEP=n stubs.

The problem is that while functions have LOCKDEP=n stubs, struct members
don't - we still declare table_lockdep_key and lockdep_map only when
DEBUG_LOCK_ALLOC is enabled.

Re: [PATCH v5 01/18] zram: sleepable entry locking

Posted by Sergey Senozhatsky 12 months ago

On (25/02/13 10:42), Sergey Senozhatsky wrote:
> On (25/02/13 09:52), Sergey Senozhatsky wrote:
> > > > -static int zram_slot_trylock(struct zram *zram, u32 index)
> > > > +static void zram_slot_lock_init(struct zram *zram, u32 index)
> > > >  {
> > > > -	return spin_trylock(&zram->table[index].lock);
> > > > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > > +	lockdep_init_map(&zram->table[index].lockdep_map, "zram-entry->lock",
> > > > +			 &zram->table_lockdep_key, 0);
> > > > +#endif
> > > > +}
> > > > +
> > > >  
> > > > ...
> > > >
> > > > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > > +	lockdep_register_key(&zram->table_lockdep_key);
> > > > +#endif
> > > > +
> > > 
> > > Please check whether all the ifdefs are needed - some of these things
> > > have CONFIG_LOCKDEP=n stubs.
> 
> The problem is that while functions have LOCKDEP=n stubs, struct members
> don't - we still declare table_lockdep_key and lockdep_map only when
> DEBUG_LOCK_ALLOC is enabled.

I rewrote those bits (in zram and in zsmalloc), given that we also
need lock-contended/lock-acquired in various branches, which require
even more ifdef-s.  So I factored out debug-enabled variants.