Switch over from rwlock_t to an atomic_t variable that takes a negative
value when the page is under migration, or a positive value when the
page is used by zsmalloc users (object mapping, etc.) Using a per-zspage
rwsem is a little too memory heavy; a simple atomic_t should suffice.
The zspage lock is a leaf lock for zs_map_object(), where it is read-acquired.
Since this lock now permits preemption, extra care needs to be taken when
it is write-acquired - all writers grab it in atomic context, so they
cannot spin and wait for a (potentially preempted) reader to unlock the
zspage. There are only two writers at the moment - migration and
compaction. In both cases we use write-try-lock and bail out if the zspage
is read-locked. Writers, on the other hand, never get preempted, so readers
can spin waiting for the writer to unlock the zspage.
With this we can implement a preemptible object mapping.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
---
mm/zsmalloc.c | 135 +++++++++++++++++++++++++++++++-------------------
1 file changed, 83 insertions(+), 52 deletions(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 4b4c77bc08f9..f5b5fe732e50 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -292,6 +292,9 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
__free_page(page);
}
+#define ZS_PAGE_UNLOCKED 0
+#define ZS_PAGE_WRLOCKED -1
+
struct zspage {
struct {
unsigned int huge:HUGE_BITS;
@@ -304,7 +307,7 @@ struct zspage {
struct zpdesc *first_zpdesc;
struct list_head list; /* fullness list */
struct zs_pool *pool;
- rwlock_t lock;
+ atomic_t lock;
};
struct mapping_area {
@@ -314,6 +317,59 @@ struct mapping_area {
enum zs_mapmode vm_mm; /* mapping mode */
};
+static void zspage_lock_init(struct zspage *zspage)
+{
+ atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
+}
+
+/*
+ * zspage lock permits preemption on the reader-side (there can be multiple
+ * readers). Writers (exclusive zspage ownership), on the other hand, are
+ * always run in atomic context and cannot spin waiting for a (potentially
+ * preempted) reader to unlock zspage. This, basically, means that writers
+ * can only call write-try-lock and must bail out if it didn't succeed.
+ *
+ * At the same time, writers cannot reschedule under zspage write-lock,
+ * so readers can spin waiting for the writer to unlock zspage.
+ */
+static void zspage_read_lock(struct zspage *zspage)
+{
+ atomic_t *lock = &zspage->lock;
+ int old = atomic_read(lock);
+
+ do {
+ if (old == ZS_PAGE_WRLOCKED) {
+ cpu_relax();
+ old = atomic_read(lock);
+ continue;
+ }
+ } while (!atomic_try_cmpxchg(lock, &old, old + 1));
+}
+
+static void zspage_read_unlock(struct zspage *zspage)
+{
+ atomic_dec(&zspage->lock);
+}
+
+static bool zspage_try_write_lock(struct zspage *zspage)
+{
+ atomic_t *lock = &zspage->lock;
+ int old = ZS_PAGE_UNLOCKED;
+
+ preempt_disable();
+ if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
+ return true;
+
+ preempt_enable();
+ return false;
+}
+
+static void zspage_write_unlock(struct zspage *zspage)
+{
+ atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
+ preempt_enable();
+}
+
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
static void SetZsHugePage(struct zspage *zspage)
{
@@ -325,12 +381,6 @@ static bool ZsHugePage(struct zspage *zspage)
return zspage->huge;
}
-static void lock_init(struct zspage *zspage);
-static void migrate_read_lock(struct zspage *zspage);
-static void migrate_read_unlock(struct zspage *zspage);
-static void migrate_write_lock(struct zspage *zspage);
-static void migrate_write_unlock(struct zspage *zspage);
-
#ifdef CONFIG_COMPACTION
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
@@ -1026,7 +1076,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
return NULL;
zspage->magic = ZSPAGE_MAGIC;
- lock_init(zspage);
+ zspage_lock_init(zspage);
for (i = 0; i < class->pages_per_zspage; i++) {
struct zpdesc *zpdesc;
@@ -1251,7 +1301,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
* zs_unmap_object API so delegate the locking from class to zspage
* which is smaller granularity.
*/
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
pool_read_unlock(pool);
class = zspage_class(pool, zspage);
@@ -1311,7 +1361,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
}
local_unlock(&zs_map_area.lock);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
@@ -1705,18 +1755,18 @@ static void lock_zspage(struct zspage *zspage)
/*
* Pages we haven't locked yet can be migrated off the list while we're
* trying to lock them, so we need to be careful and only attempt to
- * lock each page under migrate_read_lock(). Otherwise, the page we lock
+ * lock each page under zspage_read_lock(). Otherwise, the page we lock
* may no longer belong to the zspage. This means that we may wait for
* the wrong page to unlock, so we must take a reference to the page
- * prior to waiting for it to unlock outside migrate_read_lock().
+ * prior to waiting for it to unlock outside zspage_read_lock().
*/
while (1) {
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
zpdesc = get_first_zpdesc(zspage);
if (zpdesc_trylock(zpdesc))
break;
zpdesc_get(zpdesc);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
zpdesc_wait_locked(zpdesc);
zpdesc_put(zpdesc);
}
@@ -1727,41 +1777,16 @@ static void lock_zspage(struct zspage *zspage)
curr_zpdesc = zpdesc;
} else {
zpdesc_get(zpdesc);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
zpdesc_wait_locked(zpdesc);
zpdesc_put(zpdesc);
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
}
}
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
#endif /* CONFIG_COMPACTION */
-static void lock_init(struct zspage *zspage)
-{
- rwlock_init(&zspage->lock);
-}
-
-static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
-{
- read_lock(&zspage->lock);
-}
-
-static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
-{
- read_unlock(&zspage->lock);
-}
-
-static void migrate_write_lock(struct zspage *zspage)
-{
- write_lock(&zspage->lock);
-}
-
-static void migrate_write_unlock(struct zspage *zspage)
-{
- write_unlock(&zspage->lock);
-}
-
#ifdef CONFIG_COMPACTION
static const struct movable_operations zsmalloc_mops;
@@ -1803,7 +1828,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
}
static int zs_page_migrate(struct page *newpage, struct page *page,
- enum migrate_mode mode)
+ enum migrate_mode mode)
{
struct zs_pool *pool;
struct size_class *class;
@@ -1819,15 +1844,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc));
- /* We're committed, tell the world that this is a Zsmalloc page. */
- __zpdesc_set_zsmalloc(newzpdesc);
-
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(zpdesc);
pool = zspage->pool;
/*
- * The pool lock protects the race between zpage migration
+ * The pool->lock protects the race between zpage migration
* and zs_free.
*/
pool_write_lock(pool);
@@ -1837,8 +1859,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
* the class lock protects zpage alloc/free in the zspage.
*/
size_class_lock(class);
- /* the migrate_write_lock protects zpage access via zs_map_object */
- migrate_write_lock(zspage);
+ /* the zspage write_lock protects zpage access via zs_map_object */
+ if (!zspage_try_write_lock(zspage)) {
+ size_class_unlock(class);
+ pool_write_unlock(pool);
+ return -EINVAL;
+ }
+
+ /* We're committed, tell the world that this is a Zsmalloc page. */
+ __zpdesc_set_zsmalloc(newzpdesc);
offset = get_first_obj_offset(zpdesc);
s_addr = kmap_local_zpdesc(zpdesc);
@@ -1869,7 +1898,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
*/
pool_write_unlock(pool);
size_class_unlock(class);
- migrate_write_unlock(zspage);
+ zspage_write_unlock(zspage);
zpdesc_get(newzpdesc);
if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
@@ -2005,9 +2034,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
if (!src_zspage)
break;
- migrate_write_lock(src_zspage);
+ if (!zspage_try_write_lock(src_zspage))
+ break;
+
migrate_zspage(pool, src_zspage, dst_zspage);
- migrate_write_unlock(src_zspage);
+ zspage_write_unlock(src_zspage);
fg = putback_zspage(class, src_zspage);
if (fg == ZS_INUSE_RATIO_0) {
--
2.48.1.362.g079036d154-goog
On Fri, Jan 31, 2025 at 06:06:13PM +0900, Sergey Senozhatsky wrote:
> Switch over from rwlock_t to a atomic_t variable that takes negative
> value when the page is under migration, or positive values when the
> page is used by zsmalloc users (object map, etc.) Using a rwsem
> per-zspage is a little too memory heavy, a simple atomic_t should
> suffice.
>
> zspage lock is a leaf lock for zs_map_object(), where it's read-acquired.
> Since this lock now permits preemption extra care needs to be taken when
> it is write-acquired - all writers grab it in atomic context, so they
> cannot spin and wait for (potentially preempted) reader to unlock zspage.
> There are only two writers at this moment - migration and compaction. In
> both cases we use write-try-lock and bail out if zspage is read locked.
> Writers, on the other hand, never get preempted, so readers can spin
> waiting for the writer to unlock zspage.
>
> With this we can implement a preemptible object mapping.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
> ---
> mm/zsmalloc.c | 135 +++++++++++++++++++++++++++++++-------------------
> 1 file changed, 83 insertions(+), 52 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 4b4c77bc08f9..f5b5fe732e50 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -292,6 +292,9 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
> __free_page(page);
> }
>
> +#define ZS_PAGE_UNLOCKED 0
> +#define ZS_PAGE_WRLOCKED -1
> +
> struct zspage {
> struct {
> unsigned int huge:HUGE_BITS;
> @@ -304,7 +307,7 @@ struct zspage {
> struct zpdesc *first_zpdesc;
> struct list_head list; /* fullness list */
> struct zs_pool *pool;
> - rwlock_t lock;
> + atomic_t lock;
> };
>
> struct mapping_area {
> @@ -314,6 +317,59 @@ struct mapping_area {
> enum zs_mapmode vm_mm; /* mapping mode */
> };
>
> +static void zspage_lock_init(struct zspage *zspage)
> +{
> + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
> +}
> +
> +/*
> + * zspage lock permits preemption on the reader-side (there can be multiple
> + * readers). Writers (exclusive zspage ownership), on the other hand, are
> + * always run in atomic context and cannot spin waiting for a (potentially
> + * preempted) reader to unlock zspage. This, basically, means that writers
> + * can only call write-try-lock and must bail out if it didn't succeed.
> + *
> + * At the same time, writers cannot reschedule under zspage write-lock,
> + * so readers can spin waiting for the writer to unlock zspage.
> + */
> +static void zspage_read_lock(struct zspage *zspage)
> +{
> + atomic_t *lock = &zspage->lock;
> + int old = atomic_read(lock);
> +
> + do {
> + if (old == ZS_PAGE_WRLOCKED) {
> + cpu_relax();
> + old = atomic_read(lock);
> + continue;
> + }
> + } while (!atomic_try_cmpxchg(lock, &old, old + 1));
> +}
> +
> +static void zspage_read_unlock(struct zspage *zspage)
> +{
> + atomic_dec(&zspage->lock);
> +}
> +
> +static bool zspage_try_write_lock(struct zspage *zspage)
> +{
> + atomic_t *lock = &zspage->lock;
> + int old = ZS_PAGE_UNLOCKED;
> +
> + preempt_disable();
> + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
FWIW, I am usually afraid to manually implement locking like this. For
example, queued_spin_trylock() uses atomic_try_cmpxchg_acquire() not
atomic_try_cmpxchg(), and I am not quite sure what could happen without
ACQUIRE semantics here on some architectures.
We also lose some debugging capabilities as Hilf pointed out in another
patch.
Just my 2c.
> + return true;
> +
> + preempt_enable();
> + return false;
> +}
> +
> +static void zspage_write_unlock(struct zspage *zspage)
> +{
> + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
> + preempt_enable();
> +}
> +
> /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
> static void SetZsHugePage(struct zspage *zspage)
> {
> @@ -325,12 +381,6 @@ static bool ZsHugePage(struct zspage *zspage)
> return zspage->huge;
> }
>
> -static void lock_init(struct zspage *zspage);
> -static void migrate_read_lock(struct zspage *zspage);
> -static void migrate_read_unlock(struct zspage *zspage);
> -static void migrate_write_lock(struct zspage *zspage);
> -static void migrate_write_unlock(struct zspage *zspage);
> -
> #ifdef CONFIG_COMPACTION
> static void kick_deferred_free(struct zs_pool *pool);
> static void init_deferred_free(struct zs_pool *pool);
> @@ -1026,7 +1076,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
> return NULL;
>
> zspage->magic = ZSPAGE_MAGIC;
> - lock_init(zspage);
> + zspage_lock_init(zspage);
>
> for (i = 0; i < class->pages_per_zspage; i++) {
> struct zpdesc *zpdesc;
> @@ -1251,7 +1301,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> * zs_unmap_object API so delegate the locking from class to zspage
> * which is smaller granularity.
> */
> - migrate_read_lock(zspage);
> + zspage_read_lock(zspage);
> pool_read_unlock(pool);
>
> class = zspage_class(pool, zspage);
> @@ -1311,7 +1361,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
> }
> local_unlock(&zs_map_area.lock);
>
> - migrate_read_unlock(zspage);
> + zspage_read_unlock(zspage);
> }
> EXPORT_SYMBOL_GPL(zs_unmap_object);
>
> @@ -1705,18 +1755,18 @@ static void lock_zspage(struct zspage *zspage)
> /*
> * Pages we haven't locked yet can be migrated off the list while we're
> * trying to lock them, so we need to be careful and only attempt to
> - * lock each page under migrate_read_lock(). Otherwise, the page we lock
> + * lock each page under zspage_read_lock(). Otherwise, the page we lock
> * may no longer belong to the zspage. This means that we may wait for
> * the wrong page to unlock, so we must take a reference to the page
> - * prior to waiting for it to unlock outside migrate_read_lock().
> + * prior to waiting for it to unlock outside zspage_read_lock().
> */
> while (1) {
> - migrate_read_lock(zspage);
> + zspage_read_lock(zspage);
> zpdesc = get_first_zpdesc(zspage);
> if (zpdesc_trylock(zpdesc))
> break;
> zpdesc_get(zpdesc);
> - migrate_read_unlock(zspage);
> + zspage_read_unlock(zspage);
> zpdesc_wait_locked(zpdesc);
> zpdesc_put(zpdesc);
> }
> @@ -1727,41 +1777,16 @@ static void lock_zspage(struct zspage *zspage)
> curr_zpdesc = zpdesc;
> } else {
> zpdesc_get(zpdesc);
> - migrate_read_unlock(zspage);
> + zspage_read_unlock(zspage);
> zpdesc_wait_locked(zpdesc);
> zpdesc_put(zpdesc);
> - migrate_read_lock(zspage);
> + zspage_read_lock(zspage);
> }
> }
> - migrate_read_unlock(zspage);
> + zspage_read_unlock(zspage);
> }
> #endif /* CONFIG_COMPACTION */
>
> -static void lock_init(struct zspage *zspage)
> -{
> - rwlock_init(&zspage->lock);
> -}
> -
> -static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
> -{
> - read_lock(&zspage->lock);
> -}
> -
> -static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
> -{
> - read_unlock(&zspage->lock);
> -}
> -
> -static void migrate_write_lock(struct zspage *zspage)
> -{
> - write_lock(&zspage->lock);
> -}
> -
> -static void migrate_write_unlock(struct zspage *zspage)
> -{
> - write_unlock(&zspage->lock);
> -}
> -
> #ifdef CONFIG_COMPACTION
>
> static const struct movable_operations zsmalloc_mops;
> @@ -1803,7 +1828,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
> }
>
> static int zs_page_migrate(struct page *newpage, struct page *page,
> - enum migrate_mode mode)
> + enum migrate_mode mode)
> {
> struct zs_pool *pool;
> struct size_class *class;
> @@ -1819,15 +1844,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
>
> VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc));
>
> - /* We're committed, tell the world that this is a Zsmalloc page. */
> - __zpdesc_set_zsmalloc(newzpdesc);
> -
> /* The page is locked, so this pointer must remain valid */
> zspage = get_zspage(zpdesc);
> pool = zspage->pool;
>
> /*
> - * The pool lock protects the race between zpage migration
> + * The pool->lock protects the race between zpage migration
> * and zs_free.
> */
> pool_write_lock(pool);
> @@ -1837,8 +1859,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
> * the class lock protects zpage alloc/free in the zspage.
> */
> size_class_lock(class);
> - /* the migrate_write_lock protects zpage access via zs_map_object */
> - migrate_write_lock(zspage);
> + /* the zspage write_lock protects zpage access via zs_map_object */
> + if (!zspage_try_write_lock(zspage)) {
> + size_class_unlock(class);
> + pool_write_unlock(pool);
> + return -EINVAL;
> + }
> +
> + /* We're committed, tell the world that this is a Zsmalloc page. */
> + __zpdesc_set_zsmalloc(newzpdesc);
>
> offset = get_first_obj_offset(zpdesc);
> s_addr = kmap_local_zpdesc(zpdesc);
> @@ -1869,7 +1898,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
> */
> pool_write_unlock(pool);
> size_class_unlock(class);
> - migrate_write_unlock(zspage);
> + zspage_write_unlock(zspage);
>
> zpdesc_get(newzpdesc);
> if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
> @@ -2005,9 +2034,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
> if (!src_zspage)
> break;
>
> - migrate_write_lock(src_zspage);
> + if (!zspage_try_write_lock(src_zspage))
> + break;
> +
> migrate_zspage(pool, src_zspage, dst_zspage);
> - migrate_write_unlock(src_zspage);
> + zspage_write_unlock(src_zspage);
>
> fg = putback_zspage(class, src_zspage);
> if (fg == ZS_INUSE_RATIO_0) {
> --
> 2.48.1.362.g079036d154-goog
>
On (25/01/31 15:51), Yosry Ahmed wrote:
> > +static void zspage_read_lock(struct zspage *zspage)
> > +{
> > + atomic_t *lock = &zspage->lock;
> > + int old = atomic_read(lock);
> > +
> > + do {
> > + if (old == ZS_PAGE_WRLOCKED) {
> > + cpu_relax();
> > + old = atomic_read(lock);
> > + continue;
> > + }
> > + } while (!atomic_try_cmpxchg(lock, &old, old + 1));
> > +}
> > +
> > +static void zspage_read_unlock(struct zspage *zspage)
> > +{
> > + atomic_dec(&zspage->lock);
> > +}
> > +
> > +static bool zspage_try_write_lock(struct zspage *zspage)
> > +{
> > + atomic_t *lock = &zspage->lock;
> > + int old = ZS_PAGE_UNLOCKED;
> > +
> > + preempt_disable();
> > + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
>
> FWIW, I am usually afraid to manually implement locking like this. For
> example, queued_spin_trylock() uses atomic_try_cmpxchg_acquire() not
> atomic_try_cmpxchg(), and I am not quite sure what could happen without
> ACQUIRE semantics here on some architectures.
I looked into it a bit and wasn't sure either. Perhaps we can switch
to acquire/release semantics; I'm not an expert on this and would highly
appreciate help.
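For example, something along these lines, using the acquire/release atomic
helpers (a completely untested sketch):

---
static void zspage_read_lock(struct zspage *zspage)
{
	atomic_t *lock = &zspage->lock;
	int old = atomic_read_acquire(lock);

	do {
		if (old == ZS_PAGE_WRLOCKED) {
			cpu_relax();
			old = atomic_read_acquire(lock);
			continue;
		}
	} while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
}

static void zspage_read_unlock(struct zspage *zspage)
{
	atomic_dec_return_release(&zspage->lock);
}

static bool zspage_try_write_lock(struct zspage *zspage)
{
	atomic_t *lock = &zspage->lock;
	int old = ZS_PAGE_UNLOCKED;

	preempt_disable();
	/* acquire on success pairs with the release in zspage_write_unlock() */
	if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED))
		return true;

	preempt_enable();
	return false;
}

static void zspage_write_unlock(struct zspage *zspage)
{
	atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
	preempt_enable();
}
---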
> We also lose some debugging capabilities as Hilf pointed out in another
> patch.
So that zspage lock, I think, should not have been a lock to begin with -
it's a ref-counter and it's being used as one:
map()
{
	page->users++;
}

unmap()
{
	page->users--;
}

migrate()
{
	if (!page->users)
		migrate_page();
}
> Just my 2c.
Perhaps we can sprinkle some lockdep on it. For instance:
---
mm/zsmalloc.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 956445f4d554..06b1d8ca9e89 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -308,6 +308,10 @@ struct zspage {
struct list_head list; /* fullness list */
struct zs_pool *pool;
atomic_t lock;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map lockdep_map;
+#endif
};
struct mapping_area {
@@ -319,6 +323,12 @@ struct mapping_area {
static void zspage_lock_init(struct zspage *zspage)
{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key key;
+
+ lockdep_init_map(&zspage->lockdep_map, "zsmalloc-page", &key, 0);
+#endif
+
atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
}
@@ -344,11 +354,19 @@ static void zspage_read_lock(struct zspage *zspage)
continue;
}
} while (!atomic_try_cmpxchg(lock, &old, old + 1));
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
+#endif
}
static void zspage_read_unlock(struct zspage *zspage)
{
atomic_dec(&zspage->lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_release(&zspage->lockdep_map, _RET_IP_);
+#endif
}
static bool zspage_try_write_lock(struct zspage *zspage)
@@ -357,8 +375,12 @@ static bool zspage_try_write_lock(struct zspage *zspage)
int old = ZS_PAGE_UNLOCKED;
preempt_disable();
- if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
+ if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) {
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
+#endif
return true;
+ }
preempt_enable();
return false;
--
2.48.1.362.g079036d154-goog
On Mon, Feb 03, 2025 at 12:13:49PM +0900, Sergey Senozhatsky wrote:
> On (25/01/31 15:51), Yosry Ahmed wrote:
> > > +static void zspage_read_lock(struct zspage *zspage)
> > > +{
> > > + atomic_t *lock = &zspage->lock;
> > > + int old = atomic_read(lock);
> > > +
> > > + do {
> > > + if (old == ZS_PAGE_WRLOCKED) {
> > > + cpu_relax();
> > > + old = atomic_read(lock);
> > > + continue;
> > > + }
> > > + } while (!atomic_try_cmpxchg(lock, &old, old + 1));
> > > +}
> > > +
> > > +static void zspage_read_unlock(struct zspage *zspage)
> > > +{
> > > + atomic_dec(&zspage->lock);
> > > +}
> > > +
> > > +static bool zspage_try_write_lock(struct zspage *zspage)
> > > +{
> > > + atomic_t *lock = &zspage->lock;
> > > + int old = ZS_PAGE_UNLOCKED;
> > > +
> > > + preempt_disable();
> > > + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
> >
> > FWIW, I am usually afraid to manually implement locking like this. For
> > example, queued_spin_trylock() uses atomic_try_cmpxchg_acquire() not
> > atomic_try_cmpxchg(), and I am not quite sure what could happen without
> > ACQUIRE semantics here on some architectures.
>
> I looked into it a bit, wasn't sure either. Perhaps we can switch
> to acquire/release semantics, I'm not an expert on this, would highly
> appreciate help.
>
> > We also lose some debugging capabilities as Hilf pointed out in another
> > patch.
>
> So that zspage lock should have not been a lock, I think, it's a ref-counter
> and it's being used as one
>
> map()
> {
> page->users++;
> }
>
> unmap()
> {
> page->users--;
> }
>
> migrate()
> {
> if (!page->users)
> migrate_page();
> }
Hmm, but in this case we want migration to block new map/unmap
operations. So a vanilla refcount won't work.
>
> > Just my 2c.
>
> Perhaps we can sprinkle some lockdep on it. For instance:
Honestly this looks like more reason to use existing lock primitives to
me. What are the candidates? I assume rw_semaphore, anything else?
I guess the main reason you didn't use a rw_semaphore is the extra
memory usage. Seems like it uses ~32 bytes more than rwlock_t on x86_64.
That's per zspage. Depending on how many compressed pages we have
per-zspage this may not be too bad.
For example, if a zspage has a chain length of 4, and the average
compression ratio of 1/3, that's 12 compressed pages so the extra
overhead is <3 bytes per compressed page.
Given that the chain length is a function of the class size, I think we
can calculate the exact extra memory overhead per-compressed page for
each class and get a mean/median over all classes.
If the memory overhead is insignificant I'd rather use existing lock
primitives tbh.
On (25/02/03 21:11), Yosry Ahmed wrote:
> > > We also lose some debugging capabilities as Hilf pointed out in another
> > > patch.
> >
> > So that zspage lock should have not been a lock, I think, it's a ref-counter
> > and it's being used as one
> >
> > map()
> > {
> > page->users++;
> > }
> >
> > unmap()
> > {
> > page->users--;
> > }
> >
> > migrate()
> > {
> > if (!page->users)
> > migrate_page();
> > }
>
> Hmm, but in this case we want migration to block new map/unmap
> operations. So a vanilla refcount won't work.
Yeah, correct - migration needs negative values so that map would
wait until it's positive (or zero).
> > > Just my 2c.
> >
> > Perhaps we can sprinkle some lockdep on it. For instance:
>
> Honestly this looks like more reason to use existing lock primitives to
> me. What are the candidates? I assume rw_semaphore, anything else?
Right, rwsem "was" the first choice.
> I guess the main reason you didn't use a rw_semaphore is the extra
> memory usage.
sizeof(struct zs_page) change is one thing. Another thing is that
zspage->lock is taken from atomic sections, pretty much everywhere.
compaction/migration write-lock it under pool rwlock and class spinlock,
but both compaction and migration now EAGAIN if the lock is locked
already, so that is sorted out.
The remaining problem is map(), which takes zspage read-lock under pool
rwlock. RFC series (which you hated with passion :P) converted all zsmalloc
into preemptible ones because of this - zspage->lock is a nested leaf-lock,
so it cannot schedule unless locks it's nested under permit it (needless to
say neither rwlock nor spinlock permit it).
> Seems like it uses ~32 bytes more than rwlock_t on x86_64.
> That's per zspage. Depending on how many compressed pages we have
> per-zspage this may not be too bad.
So on a 16GB laptop our memory pressure test at peak used approx 1M zspages.
That is 32 bytes * 1M ~ 32MB of extra memory use. Not alarmingly a lot,
less than what a single browser tab needs nowadays. I suppose on 4GB/8GB
that will be even smaller (because those devices generate fewer zspages).
Numbers are not the main issue, however.
On Tue, Feb 04, 2025 at 03:59:42PM +0900, Sergey Senozhatsky wrote:
> On (25/02/03 21:11), Yosry Ahmed wrote:
> > > > We also lose some debugging capabilities as Hilf pointed out in another
> > > > patch.
> > >
> > > So that zspage lock should have not been a lock, I think, it's a ref-counter
> > > and it's being used as one
> > >
> > > map()
> > > {
> > > page->users++;
> > > }
> > >
> > > unmap()
> > > {
> > > page->users--;
> > > }
> > >
> > > migrate()
> > > {
> > > if (!page->users)
> > > migrate_page();
> > > }
> >
> > Hmm, but in this case we want migration to block new map/unmap
> > operations. So a vanilla refcount won't work.
>
> Yeah, correct - migration needs negative values so that map would
> wait until it's positive (or zero).
>
> > > > Just my 2c.
> > >
> > > Perhaps we can sprinkle some lockdep on it. For instance:
> >
> > Honestly this looks like more reason to use existing lock primitives to
> > me. What are the candidates? I assume rw_semaphore, anything else?
>
> Right, rwsem "was" the first choice.
>
> > I guess the main reason you didn't use a rw_semaphore is the extra
> > memory usage.
>
> sizeof(struct zs_page) change is one thing. Another thing is that
> zspage->lock is taken from atomic sections, pretty much everywhere.
> compaction/migration write-lock it under pool rwlock and class spinlock,
> but both compaction and migration now EAGAIN if the lock is locked
> already, so that is sorted out.
>
> The remaining problem is map(), which takes zspage read-lock under pool
> rwlock. RFC series (which you hated with passion :P) converted all zsmalloc
> into preemptible ones because of this - zspage->lock is a nested leaf-lock,
> so it cannot schedule unless locks it's nested under permit it (needless to
> say neither rwlock nor spinlock permit it).
Hmm, so we want the lock to be preemptible, but we don't want to use an
existing preemptible lock because it may be held from atomic context.
I think one problem here is that the lock you are introducing is a
spinning lock but the lock holder can be preempted. This is why spinning
locks do not allow preemption. Others waiting for the lock can spin
waiting for a process that is scheduled out.
For example, the compaction/migration code could be sleeping holding the
write lock, and a map() call would spin waiting for that sleeping task.
I wonder if there's a way to rework the locking instead to avoid the
nesting. It seems like sometimes we lock the zspage with the pool lock
held, sometimes with the class lock held, and sometimes with no lock
held.
What are the rules here for acquiring the zspage lock? Do we need to
hold another lock just to make sure the zspage does not go away from
under us? Can we use RCU or something similar to do that instead?
>
> > Seems like it uses ~32 bytes more than rwlock_t on x86_64.
> > That's per zspage. Depending on how many compressed pages we have
> > per-zspage this may not be too bad.
>
> So on a 16GB laptop our memory pressure test at peak used approx 1M zspages.
> That is 32 bytes * 1M ~ 32MB of extra memory use. Not alarmingly a lot,
> less than what a single browser tab needs nowadays. I suppose on 4GB/8GB
> that will be even smaller (because those device generate less zspages).
> Numbers are not the main issue, however.
>
On (25/02/04 17:19), Yosry Ahmed wrote:
> > sizeof(struct zs_page) change is one thing. Another thing is that
> > zspage->lock is taken from atomic sections, pretty much everywhere.
> > compaction/migration write-lock it under pool rwlock and class spinlock,
> > but both compaction and migration now EAGAIN if the lock is locked
> > already, so that is sorted out.
> >
> > The remaining problem is map(), which takes zspage read-lock under pool
> > rwlock. RFC series (which you hated with passion :P) converted all zsmalloc
> > into preemptible ones because of this - zspage->lock is a nested leaf-lock,
> > so it cannot schedule unless locks it's nested under permit it (needless to
> > say neither rwlock nor spinlock permit it).
>
> Hmm, so we want the lock to be preemptible, but we don't want to use an
> existing preemptible lock because it may be held from atomic context.
>
> I think one problem here is that the lock you are introducing is a
> spinning lock but the lock holder can be preempted. This is why spinning
> locks do not allow preemption. Others waiting for the lock can spin
> waiting for a process that is scheduled out.
>
> For example, the compaction/migration code could be sleeping holding the
> write lock, and a map() call would spin waiting for that sleeping task.

write-lock holders cannot sleep, that's the key part.

So the rules are:

1) writer cannot sleep
   - migration/compaction runs in atomic context and grabs
     write-lock only from atomic context
   - write-locking function disables preemption before lock(), just to be
     safe, and enables it after unlock()

2) writer does not spin waiting
   - that's why there is only write_try_lock function
   - compaction and migration bail out when they cannot lock the
     zspage

3) readers can sleep and can spin waiting for a lock
   - other (even preempted) readers don't block new readers
   - writers don't sleep, they always unlock

> I wonder if there's a way to rework the locking instead to avoid the
> nesting. It seems like sometimes we lock the zspage with the pool lock
> held, sometimes with the class lock held, and sometimes with no lock
> held.
>
> What are the rules here for acquiring the zspage lock?

Most of that code is not written by me, but I think the rule is to disable
"migration" be it via pool lock or class lock.

> Do we need to hold another lock just to make sure the zspage does not go
> away from under us?

Yes, the page cannot go away via "normal" path:
	zs_free(last object) -> zspage becomes empty -> free zspage

so when we have active mapping() it's only migration and compaction
that can free zspage (its content is migrated and so it becomes empty).

> Can we use RCU or something similar to do that instead?

Hmm, I don't know... zsmalloc is not "read-mostly", it's whatever data
patterns the clients have. I suspect we'd need to synchronize RCU every
time a zspage is freed: zs_free() [this one is complicated], or migration,
or compaction? Sounds like anti-pattern for RCU?
On Wed, Feb 05, 2025 at 11:43:16AM +0900, Sergey Senozhatsky wrote:
> On (25/02/04 17:19), Yosry Ahmed wrote:
> > > sizeof(struct zs_page) change is one thing. Another thing is that
> > > zspage->lock is taken from atomic sections, pretty much everywhere.
> > > compaction/migration write-lock it under pool rwlock and class spinlock,
> > > but both compaction and migration now EAGAIN if the lock is locked
> > > already, so that is sorted out.
> > >
> > > The remaining problem is map(), which takes zspage read-lock under pool
> > > rwlock. RFC series (which you hated with passion :P) converted all zsmalloc
> > > into preemptible ones because of this - zspage->lock is a nested leaf-lock,
> > > so it cannot schedule unless locks it's nested under permit it (needless to
> > > say neither rwlock nor spinlock permit it).
> >
> > Hmm, so we want the lock to be preemptible, but we don't want to use an
> > existing preemptible lock because it may be held from atomic context.
> >
> > I think one problem here is that the lock you are introducing is a
> > spinning lock but the lock holder can be preempted. This is why spinning
> > locks do not allow preemption. Others waiting for the lock can spin
> > waiting for a process that is scheduled out.
> >
> > For example, the compaction/migration code could be sleeping holding the
> > write lock, and a map() call would spin waiting for that sleeping task.
>
> write-lock holders cannot sleep, that's the key part.
>
> So the rules are:
>
> 1) writer cannot sleep
>    - migration/compaction runs in atomic context and grabs
>      write-lock only from atomic context
>    - write-locking function disables preemption before lock(), just to be
>      safe, and enables it after unlock()
>
> 2) writer does not spin waiting
>    - that's why there is only write_try_lock function
>    - compaction and migration bail out when they cannot lock the
>      zspage
>
> 3) readers can sleep and can spin waiting for a lock
>    - other (even preempted) readers don't block new readers
>    - writers don't sleep, they always unlock

That's useful, thanks. If we go with custom locking we need to document
this clearly and add debug checks where possible.

> > I wonder if there's a way to rework the locking instead to avoid the
> > nesting. It seems like sometimes we lock the zspage with the pool lock
> > held, sometimes with the class lock held, and sometimes with no lock
> > held.
> >
> > What are the rules here for acquiring the zspage lock?
>
> Most of that code is not written by me, but I think the rule is to disable
> "migration" be it via pool lock or class lock.

It seems like we're not holding either of these locks in
async_free_zspage() when we call lock_zspage(). Is it safe for a
different reason?

> > Do we need to hold another lock just to make sure the zspage does not go
> > away from under us?
>
> Yes, the page cannot go away via "normal" path:
> 	zs_free(last object) -> zspage becomes empty -> free zspage
>
> so when we have active mapping() it's only migration and compaction
> that can free zspage (its content is migrated and so it becomes empty).
>
> > Can we use RCU or something similar to do that instead?
>
> Hmm, I don't know... zsmalloc is not "read-mostly", it's whatever data
> patterns the clients have. I suspect we'd need to synchronize RCU every
> time a zspage is freed: zs_free() [this one is complicated], or migration,
> or compaction? Sounds like anti-pattern for RCU?

Can't we use kfree_rcu() instead of synchronizing? Not sure if this
would still be an antipattern tbh.
It just seems like the current locking scheme is really complicated :/
On (25/02/05 19:06), Yosry Ahmed wrote:
> > > For example, the compaction/migration code could be sleeping holding the
> > > write lock, and a map() call would spin waiting for that sleeping task.
> >
> > write-lock holders cannot sleep, that's the key part.
> >
> > So the rules are:
> >
> > 1) writer cannot sleep
> > - migration/compaction runs in atomic context and grabs
> > write-lock only from atomic context
> > - write-locking function disables preemption before lock(), just to be
> > safe, and enables it after unlock()
> >
> > 2) writer does not spin waiting
> > - that's why there is only write_try_lock function
> > - compaction and migration bail out when they cannot lock the
> > zspage
> >
> > 3) readers can sleep and can spin waiting for a lock
> > - other (even preempted) readers don't block new readers
> > - writers don't sleep, they always unlock
>
> That's useful, thanks. If we go with custom locking we need to document
> this clearly and add debug checks where possible.
Sure. That's what it currently looks like (can always improve)
---
/*
* zspage lock permits preemption on the reader-side (there can be multiple
* readers). Writers (exclusive zspage ownership), on the other hand, are
* always run in atomic context and cannot spin waiting for a (potentially
* preempted) reader to unlock zspage. This, basically, means that writers
* can only call write-try-lock and must bail out if it didn't succeed.
*
* At the same time, writers cannot reschedule under zspage write-lock,
* so readers can spin waiting for the writer to unlock zspage.
*/
static void zspage_read_lock(struct zspage *zspage)
{
atomic_t *lock = &zspage->lock;
int old = atomic_read_acquire(lock);
do {
if (old == ZS_PAGE_WRLOCKED) {
cpu_relax();
old = atomic_read_acquire(lock);
continue;
}
} while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
#ifdef CONFIG_DEBUG_LOCK_ALLOC
rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
#endif
}
static void zspage_read_unlock(struct zspage *zspage)
{
atomic_dec_return_release(&zspage->lock);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
rwsem_release(&zspage->lockdep_map, _RET_IP_);
#endif
}
static bool zspage_try_write_lock(struct zspage *zspage)
{
atomic_t *lock = &zspage->lock;
int old = ZS_PAGE_UNLOCKED;
preempt_disable();
if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
#endif
return true;
}
preempt_enable();
return false;
}
static void zspage_write_unlock(struct zspage *zspage)
{
atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
rwsem_release(&zspage->lockdep_map, _RET_IP_);
#endif
preempt_enable();
}
---
Maybe I'll just copy-paste the locking rules list, a list is always cleaner.
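Something along these lines, perhaps (just the rules above folded into a
comment next to the lock helpers):

---
/*
 * zspage locking rules:
 *
 * 1) writer (migration/compaction) cannot sleep
 *    - it runs in atomic context and grabs the write-lock only from
 *      atomic context
 *    - the write-locking function disables preemption before lock()
 *      and enables it after unlock()
 *
 * 2) writer does not spin waiting
 *    - there is only a write-try-lock function; migration and
 *      compaction bail out when they cannot lock the zspage
 *
 * 3) readers can sleep and can spin waiting for the lock
 *    - other (even preempted) readers don't block new readers
 *    - writers don't sleep, they always unlock
 */
---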
> > > I wonder if there's a way to rework the locking instead to avoid the
> > > nesting. It seems like sometimes we lock the zspage with the pool lock
> > > held, sometimes with the class lock held, and sometimes with no lock
> > > held.
> > >
> > > What are the rules here for acquiring the zspage lock?
> >
> > Most of that code is not written by me, but I think the rule is to disable
> > "migration" be it via pool lock or class lock.
>
> It seems like we're not holding either of these locks in
> async_free_zspage() when we call lock_zspage(). Is it safe for a
> different reason?
I think we hold size class lock there. async-free is only for pages that
reached 0 usage ratio (empty fullness group), so they don't hold any
objects any more and from here such zspages either get freed or
find_get_zspage() recovers them from fullness 0 and allocates an object.
Both are synchronized by size class lock.
> > Hmm, I don't know... zsmalloc is not "read-mostly", it's whatever data
> > patterns the clients have. I suspect we'd need to synchronize RCU every
> > time a zspage is freed: zs_free() [this one is complicated], or migration,
> > or compaction? Sounds like anti-pattern for RCU?
>
> Can't we use kfree_rcu() instead of synchronizing? Not sure if this
> would still be an antipattern tbh.
Yeah, I don't know. The last time I wrongly used kfree_rcu() it caused a
27% performance drop (some internal code). This zspage thingy maybe will
be better, but still has a potential to generate high numbers of RCU calls,
depends on the clients. Probably the chances are too high. Apart from
that, kvfree_rcu() can sleep, as far as I understand, so zram might have
some extra things to deal with, namely slot-free notifications which can
be called from softirq, and always called under spinlock:
mm slot-free -> zram slot-free -> zs_free -> empty zspage -> kfree_rcu
> It just seems like the current locking scheme is really complicated :/
That's very true.
On Thu, Feb 06, 2025 at 12:05:55PM +0900, Sergey Senozhatsky wrote:
> On (25/02/05 19:06), Yosry Ahmed wrote:
> > > > For example, the compaction/migration code could be sleeping holding the
> > > > write lock, and a map() call would spin waiting for that sleeping task.
> > >
> > > write-lock holders cannot sleep, that's the key part.
> > >
> > > So the rules are:
> > >
> > > 1) writer cannot sleep
> > > - migration/compaction runs in atomic context and grabs
> > > write-lock only from atomic context
> > > - write-locking function disables preemption before lock(), just to be
> > > safe, and enables it after unlock()
> > >
> > > 2) writer does not spin waiting
> > > - that's why there is only write_try_lock function
> > > - compaction and migration bail out when they cannot lock the
> > > zspage
> > >
> > > 3) readers can sleep and can spin waiting for a lock
> > > - other (even preempted) readers don't block new readers
> > > - writers don't sleep, they always unlock
> >
> > That's useful, thanks. If we go with custom locking we need to document
> > this clearly and add debug checks where possible.
>
> Sure. That's what it currently looks like (can always improve)
>
> ---
> /*
> * zspage lock permits preemption on the reader-side (there can be multiple
> * readers). Writers (exclusive zspage ownership), on the other hand, are
> * always run in atomic context and cannot spin waiting for a (potentially
> * preempted) reader to unlock zspage. This, basically, means that writers
> * can only call write-try-lock and must bail out if it didn't succeed.
> *
> * At the same time, writers cannot reschedule under zspage write-lock,
> * so readers can spin waiting for the writer to unlock zspage.
> */
> static void zspage_read_lock(struct zspage *zspage)
> {
> atomic_t *lock = &zspage->lock;
> int old = atomic_read_acquire(lock);
>
> do {
> if (old == ZS_PAGE_WRLOCKED) {
> cpu_relax();
> old = atomic_read_acquire(lock);
> continue;
> }
> } while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
>
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
> #endif
> }
>
> static void zspage_read_unlock(struct zspage *zspage)
> {
> atomic_dec_return_release(&zspage->lock);
>
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> rwsem_release(&zspage->lockdep_map, _RET_IP_);
> #endif
> }
>
> static bool zspage_try_write_lock(struct zspage *zspage)
> {
> atomic_t *lock = &zspage->lock;
> int old = ZS_PAGE_UNLOCKED;
>
> preempt_disable();
> if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
> #endif
> return true;
> }
>
> preempt_enable();
> return false;
> }
>
> static void zspage_write_unlock(struct zspage *zspage)
> {
> atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> rwsem_release(&zspage->lockdep_map, _RET_IP_);
> #endif
> preempt_enable();
> }
> ---
>
> Maybe I'll just copy-paste the locking rules list, a list is always cleaner.
Thanks. I think it would be nice if we could also get someone with
locking expertise to take a look at this.
>
> > > > I wonder if there's a way to rework the locking instead to avoid the
> > > > nesting. It seems like sometimes we lock the zspage with the pool lock
> > > > held, sometimes with the class lock held, and sometimes with no lock
> > > > held.
> > > >
> > > > What are the rules here for acquiring the zspage lock?
> > >
> > > Most of that code is not written by me, but I think the rule is to disable
> > > "migration" be it via pool lock or class lock.
> >
> > It seems like we're not holding either of these locks in
> > async_free_zspage() when we call lock_zspage(). Is it safe for a
> > different reason?
>
> I think we hold size class lock there. async-free is only for pages that
> reached 0 usage ratio (empty fullness group), so they don't hold any
> objects any more and from her such zspages either get freed or
> find_get_zspage() recovers them from fullness 0 and allocates an object.
> Both are synchronized by size class lock.
>
> > > Hmm, I don't know... zsmalloc is not "read-mostly", it's whatever data
> > > patterns the clients have. I suspect we'd need to synchronize RCU every
> > > time a zspage is freed: zs_free() [this one is complicated], or migration,
> > > or compaction? Sounds like anti-pattern for RCU?
> >
> > Can't we use kfree_rcu() instead of synchronizing? Not sure if this
> > would still be an antipattern tbh.
>
> Yeah, I don't know. The last time I wrongly used kfree_rcu() it caused a
> 27% performance drop (some internal code). This zspage thingy maybe will
> be better, but still has a potential to generate high numbers of RCU calls,
> depends on the clients. Probably the chances are too high. Apart from
> that, kvfree_rcu() can sleep, as far as I understand, so zram might have
> some extra things to deal with, namely slot-free notifications which can
> be called from softirq, and always called under spinlock:
>
> mm slot-free -> zram slot-free -> zs_free -> empty zspage -> kfree_rcu
>
> > It just seems like the current locking scheme is really complicated :/
>
> That's very true.
Seems like we have to compromise either way, custom locking or we enter
into a new complexity realm with RCU freeing.
On (25/02/06 16:19), Yosry Ahmed wrote:
> > static void zspage_read_lock(struct zspage *zspage)
> > {
> > atomic_t *lock = &zspage->lock;
> > int old = atomic_read_acquire(lock);
> >
> > do {
> > if (old == ZS_PAGE_WRLOCKED) {
> > cpu_relax();
> > old = atomic_read_acquire(lock);
> > continue;
> > }
> > } while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
> >
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
> > #endif
> > }
> >
> > static void zspage_read_unlock(struct zspage *zspage)
> > {
> > atomic_dec_return_release(&zspage->lock);
> >
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > rwsem_release(&zspage->lockdep_map, _RET_IP_);
> > #endif
> > }
> >
> > static bool zspage_try_write_lock(struct zspage *zspage)
> > {
> > atomic_t *lock = &zspage->lock;
> > int old = ZS_PAGE_UNLOCKED;
> >
> > preempt_disable();
> > if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
> > #endif
> > return true;
> > }
> >
> > preempt_enable();
> > return false;
> > }
> >
> > static void zspage_write_unlock(struct zspage *zspage)
> > {
> > atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > rwsem_release(&zspage->lockdep_map, _RET_IP_);
> > #endif
> > preempt_enable();
> > }
> > ---
> >
> > Maybe I'll just copy-paste the locking rules list, a list is always cleaner.
>
> Thanks. I think it would be nice if we could also get someone with
> locking expertise to take a look at this.
Sure.
I moved the lockdep acquire/release before atomic ops (except for try),
as was suggested by Sebastian in zram sub-thread.
[..]
> Seems like we have to compromise either way, custom locking or we enter
> into a new complexity realm with RCU freeing.
Let's take the blue pill? :)
On Fri, Feb 07, 2025 at 11:48:55AM +0900, Sergey Senozhatsky wrote:
> On (25/02/06 16:19), Yosry Ahmed wrote:
> > > static void zspage_read_lock(struct zspage *zspage)
> > > {
> > > atomic_t *lock = &zspage->lock;
> > > int old = atomic_read_acquire(lock);
> > >
> > > do {
> > > if (old == ZS_PAGE_WRLOCKED) {
> > > cpu_relax();
> > > old = atomic_read_acquire(lock);
> > > continue;
> > > }
> > > } while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
> > >
> > > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
> > > #endif
> > > }
> > >
> > > static void zspage_read_unlock(struct zspage *zspage)
> > > {
> > > atomic_dec_return_release(&zspage->lock);
> > >
> > > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > rwsem_release(&zspage->lockdep_map, _RET_IP_);
> > > #endif
> > > }
> > >
> > > static bool zspage_try_write_lock(struct zspage *zspage)
> > > {
> > > atomic_t *lock = &zspage->lock;
> > > int old = ZS_PAGE_UNLOCKED;
> > >
> > > preempt_disable();
> > > if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
> > > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
> > > #endif
> > > return true;
> > > }
> > >
> > > preempt_enable();
> > > return false;
> > > }
> > >
> > > static void zspage_write_unlock(struct zspage *zspage)
> > > {
> > > atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
> > > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > rwsem_release(&zspage->lockdep_map, _RET_IP_);
> > > #endif
> > > preempt_enable();
> > > }
> > > ---
> > >
> > > Maybe I'll just copy-paste the locking rules list, a list is always cleaner.
> >
> > Thanks. I think it would be nice if we could also get someone with
> > locking expertise to take a look at this.
>
> Sure.
>
> I moved the lockdep acquire/release before atomic ops (except for try),
> as was suggested by Sebastian in zram sub-thread.
>
> [..]
> > Seems like we have to compromise either way, custom locking or we enter
> > into a new complexity realm with RCU freeing.
>
> Let's take the blue pill? :)
Can we do some perf testing to make sure this custom locking is not
regressing performance (selfishly I'd like some zswap testing too)?
Perhaps Kairui can help with that since he was already testing this
series.
On (25/02/07 21:09), Yosry Ahmed wrote:
> Can we do some perf testing to make sure this custom locking is not
> regressing performance (selfishly I'd like some zswap testing too)?
So for zsmalloc I (usually) write some simple testing code which is
triggered via sysfs (device attr) and that is completely reproducible,
so that I compare apples to apples. In this particular case I just
have a loop that creates objects (we don't need to compress or decompress
anything, zsmalloc doesn't really care)
- echo 1 > /sys/ ... / test_prepare
for (sz = 32; sz < PAGE_SIZE; sz += 64) {
for (i = 0; i < 4096; i++) {
ent->handle = zs_malloc(zram->mem_pool, sz)
list_add(ent)
}
}
And now I just `perf stat` writes:
- perf stat echo 1 > /sys/ ... / test_exec_old
list_for_each_entry
zs_map_object(ent->handle, ZS_MM_RO);
zs_unmap_object(ent->handle)
list_for_each_entry
dst = zs_map_object(ent->handle, ZS_MM_WO);
memcpy(dst, tmpbuf, ent->sz)
zs_unmap_object(ent->handle)
- perf stat echo 1 > /sys/ ... / test_exec_new
list_for_each_entry
dst = zs_obj_read_begin(ent->handle, loc);
zs_obj_read_end(ent->handle, dst);
list_for_each_entry
zs_obj_write(ent->handle, tmpbuf, ent->sz);
- echo 1 > /sys/ ... / test_finish
free all handles and ent-s
The nice part is that we don't depend on any of the upper layers, we
don't even need to compress/decompress anything; we allocate objects
of required sizes and memcpy static data there (zsmalloc doesn't have
any opinion on that) and that's pretty much it.
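For reference, the test functions are roughly as follows (a simplified
sketch: the entry struct, GFP flags and error handling are illustrative,
the sysfs/device-attr plumbing is omitted, and zs_obj_read_begin()/
zs_obj_read_end()/zs_obj_write() are the calls this series adds):

---
struct test_ent {
	struct list_head entry;
	unsigned long handle;
	size_t sz;
};

static LIST_HEAD(test_list);

static void test_prepare(struct zram *zram)
{
	size_t sz;
	int i;

	for (sz = 32; sz < PAGE_SIZE; sz += 64) {
		for (i = 0; i < 4096; i++) {
			struct test_ent *ent;

			ent = kmalloc(sizeof(*ent), GFP_KERNEL);
			ent->sz = sz;
			ent->handle = zs_malloc(zram->mem_pool, sz, GFP_KERNEL);
			list_add(&ent->entry, &test_list);
		}
	}
}

/* old API: map, (optionally) memcpy, unmap */
static void test_exec_old(struct zram *zram, void *tmpbuf)
{
	struct test_ent *ent;
	void *dst;

	list_for_each_entry(ent, &test_list, entry) {
		/* RO mapping, the data itself is not used */
		zs_map_object(zram->mem_pool, ent->handle, ZS_MM_RO);
		zs_unmap_object(zram->mem_pool, ent->handle);
	}

	list_for_each_entry(ent, &test_list, entry) {
		dst = zs_map_object(zram->mem_pool, ent->handle, ZS_MM_WO);
		memcpy(dst, tmpbuf, ent->sz);
		zs_unmap_object(zram->mem_pool, ent->handle);
	}
}

/*
 * test_exec_new runs the same two loops via zs_obj_read_begin()/
 * zs_obj_read_end() and zs_obj_write(), and test_finish frees all
 * handles and entries.
 */
---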
OLD API
=======
10 runs
369,205,778 instructions # 0.80 insn per cycle
40,467,926 branches # 113.732 M/sec
369,002,122 instructions # 0.62 insn per cycle
40,426,145 branches # 189.361 M/sec
369,051,170 instructions # 0.45 insn per cycle
40,434,677 branches # 157.574 M/sec
369,014,522 instructions # 0.63 insn per cycle
40,427,754 branches # 201.464 M/sec
369,019,179 instructions # 0.64 insn per cycle
40,429,327 branches # 198.321 M/sec
368,973,095 instructions # 0.64 insn per cycle
40,419,245 branches # 234.210 M/sec
368,950,705 instructions # 0.64 insn per cycle
40,414,305 branches # 231.460 M/sec
369,041,288 instructions # 0.46 insn per cycle
40,432,599 branches # 155.576 M/sec
368,964,080 instructions # 0.67 insn per cycle
40,417,025 branches # 245.665 M/sec
369,036,706 instructions # 0.63 insn per cycle
40,430,860 branches # 204.105 M/sec
NEW API
=======
10 runs
265,799,293 instructions # 0.51 insn per cycle
29,834,567 branches # 170.281 M/sec
265,765,970 instructions # 0.55 insn per cycle
29,829,019 branches # 161.602 M/sec
265,764,702 instructions # 0.51 insn per cycle
29,828,015 branches # 189.677 M/sec
265,836,506 instructions # 0.38 insn per cycle
29,840,650 branches # 124.237 M/sec
265,836,061 instructions # 0.36 insn per cycle
29,842,285 branches # 137.670 M/sec
265,887,080 instructions # 0.37 insn per cycle
29,852,881 branches # 126.060 M/sec
265,769,869 instructions # 0.57 insn per cycle
29,829,873 branches # 210.157 M/sec
265,803,732 instructions # 0.58 insn per cycle
29,835,391 branches # 186.940 M/sec
265,766,624 instructions # 0.58 insn per cycle
29,827,537 branches # 212.609 M/sec
265,843,597 instructions # 0.57 insn per cycle
29,843,650 branches # 171.877 M/sec
x old-api-insn
+ new-api-insn
+-------------------------------------------------------------------------------------+
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|A A|
+-------------------------------------------------------------------------------------+
N Min Max Median Avg Stddev
x 10 3.689507e+08 3.6920578e+08 3.6901918e+08 3.6902586e+08 71765.519
+ 10 2.657647e+08 2.6588708e+08 2.6580373e+08 2.6580734e+08 42187.024
Difference at 95.0% confidence
-1.03219e+08 +/- 55308.7
-27.9705% +/- 0.0149878%
(Student's t, pooled s = 58864.4)
> Perhaps Kairui can help with that since he was already testing this
> series.
Yeah, would be great.
On Wed, Feb 12, 2025 at 02:00:26PM +0900, Sergey Senozhatsky wrote:
> On (25/02/07 21:09), Yosry Ahmed wrote:
> > Can we do some perf testing to make sure this custom locking is not
> > regressing performance (selfishly I'd like some zswap testing too)?
>
> So for zsmalloc I (usually) write some simple testing code which is
> triggered via sysfs (device attr) and that is completely reproducible,
> so that I compares apples to apples. In this particular case I just
> have a loop that creates objects (we don't need to compress or decompress
> anything, zsmalloc doesn't really care)
>
> - echo 1 > /sys/ ... / test_prepare
>
> for (sz = 32; sz < PAGE_SIZE; sz += 64) {
> for (i = 0; i < 4096; i++) {
> ent->handle = zs_malloc(zram->mem_pool, sz)
> list_add(ent)
> }
> }
>
>
> And now I just `perf stat` writes:
>
> - perf stat echo 1 > /sys/ ... / test_exec_old
>
> list_for_each_entry
> zs_map_object(ent->handle, ZS_MM_RO);
> zs_unmap_object(ent->handle)
>
> list_for_each_entry
> dst = zs_map_object(ent->handle, ZS_MM_WO);
> memcpy(dst, tmpbuf, ent->sz)
> zs_unmap_object(ent->handle)
>
>
>
> - perf stat echo 1 > /sys/ ... / test_exec_new
>
> list_for_each_entry
> dst = zs_obj_read_begin(ent->handle, loc);
> zs_obj_read_end(ent->handle, dst);
>
> list_for_each_entry
> zs_obj_write(ent->handle, tmpbuf, ent->sz);
>
>
> - echo 1 > /sys/ ... / test_finish
>
> free all handles and ent-s
>
>
> The nice part is that we don't depend on any of the upper layers, we
> don't even need to compress/decompress anything; we allocate objects
> of required sizes and memcpy static data there (zsmalloc doesn't have
> any opinion on that) and that's pretty much it.
>
>
> OLD API
> =======
>
> 10 runs
>
> 369,205,778 instructions # 0.80 insn per cycle
> 40,467,926 branches # 113.732 M/sec
>
> 369,002,122 instructions # 0.62 insn per cycle
> 40,426,145 branches # 189.361 M/sec
>
> 369,051,170 instructions # 0.45 insn per cycle
> 40,434,677 branches # 157.574 M/sec
>
> 369,014,522 instructions # 0.63 insn per cycle
> 40,427,754 branches # 201.464 M/sec
>
> 369,019,179 instructions # 0.64 insn per cycle
> 40,429,327 branches # 198.321 M/sec
>
> 368,973,095 instructions # 0.64 insn per cycle
> 40,419,245 branches # 234.210 M/sec
>
> 368,950,705 instructions # 0.64 insn per cycle
> 40,414,305 branches # 231.460 M/sec
>
> 369,041,288 instructions # 0.46 insn per cycle
> 40,432,599 branches # 155.576 M/sec
>
> 368,964,080 instructions # 0.67 insn per cycle
> 40,417,025 branches # 245.665 M/sec
>
> 369,036,706 instructions # 0.63 insn per cycle
> 40,430,860 branches # 204.105 M/sec
>
>
> NEW API
> =======
>
> 10 runs
>
> 265,799,293 instructions # 0.51 insn per cycle
> 29,834,567 branches # 170.281 M/sec
>
> 265,765,970 instructions # 0.55 insn per cycle
> 29,829,019 branches # 161.602 M/sec
>
> 265,764,702 instructions # 0.51 insn per cycle
> 29,828,015 branches # 189.677 M/sec
>
> 265,836,506 instructions # 0.38 insn per cycle
> 29,840,650 branches # 124.237 M/sec
>
> 265,836,061 instructions # 0.36 insn per cycle
> 29,842,285 branches # 137.670 M/sec
>
> 265,887,080 instructions # 0.37 insn per cycle
> 29,852,881 branches # 126.060 M/sec
>
> 265,769,869 instructions # 0.57 insn per cycle
> 29,829,873 branches # 210.157 M/sec
>
> 265,803,732 instructions # 0.58 insn per cycle
> 29,835,391 branches # 186.940 M/sec
>
> 265,766,624 instructions # 0.58 insn per cycle
> 29,827,537 branches # 212.609 M/sec
>
> 265,843,597 instructions # 0.57 insn per cycle
> 29,843,650 branches # 171.877 M/sec
>
>
> x old-api-insn
> + new-api-insn
> +-------------------------------------------------------------------------------------+
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |A A|
> +-------------------------------------------------------------------------------------+
> N Min Max Median Avg Stddev
> x 10 3.689507e+08 3.6920578e+08 3.6901918e+08 3.6902586e+08 71765.519
> + 10 2.657647e+08 2.6588708e+08 2.6580373e+08 2.6580734e+08 42187.024
> Difference at 95.0% confidence
> -1.03219e+08 +/- 55308.7
> -27.9705% +/- 0.0149878%
> (Student's t, pooled s = 58864.4)
Thanks for sharing these results, but I wonder if this will capture
regressions from locking changes (e.g. a lock being preemptible)? IIUC
this is counting the instructions executed in these paths, and that
won't change if the task gets preempted. Lock contention may be captured
as extra instructions, but I am not sure we'll directly see its effect
in terms of serialization and delays.
I think we also need some high level testing (e.g. concurrent
swapins/swapouts) to find that out. I think that's what Kairui's testing
covers.
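For reference, the old-API and new-API loops quoted above map onto roughly
the sketch below. This is only an illustrative reconstruction, not the actual
test module: struct test_ent, test_list, tmpbuf and local_copy are made-up
names, and the zs_obj_read_begin()/zs_obj_read_end()/zs_obj_write()
prototypes (including the pool argument) are assumed to follow the rest of
this series.

#include <linux/list.h>
#include <linux/string.h>
#include <linux/zsmalloc.h>

/* Hypothetical bookkeeping entry; only the zsmalloc calls come from the thread. */
struct test_ent {
        struct list_head node;
        unsigned long handle;
        size_t sz;
};

static LIST_HEAD(test_list);

/* test_exec_old: the zs_map_object()/zs_unmap_object() loops */
static void test_exec_old(struct zs_pool *pool, void *tmpbuf)
{
        struct test_ent *ent;
        void *dst;

        list_for_each_entry(ent, &test_list, node) {
                /* RO mapping: only the map/unmap cost is of interest */
                zs_map_object(pool, ent->handle, ZS_MM_RO);
                zs_unmap_object(pool, ent->handle);
        }

        list_for_each_entry(ent, &test_list, node) {
                /* WO mapping: copy static payload into the object */
                dst = zs_map_object(pool, ent->handle, ZS_MM_WO);
                memcpy(dst, tmpbuf, ent->sz);
                zs_unmap_object(pool, ent->handle);
        }
}

/* test_exec_new: the zs_obj_read_begin()/read_end() and zs_obj_write() loops */
static void test_exec_new(struct zs_pool *pool, void *tmpbuf, void *local_copy)
{
        struct test_ent *ent;
        void *src;

        list_for_each_entry(ent, &test_list, node) {
                src = zs_obj_read_begin(pool, ent->handle, local_copy);
                zs_obj_read_end(pool, ent->handle, src);
        }

        list_for_each_entry(ent, &test_list, node)
                zs_obj_write(pool, ent->handle, tmpbuf, ent->sz);
}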
On (25/02/12 15:35), Yosry Ahmed wrote:
> > Difference at 95.0% confidence
> > -1.03219e+08 +/- 55308.7
> > -27.9705% +/- 0.0149878%
> > (Student's t, pooled s = 58864.4)
>
> Thanks for sharing these results, but I wonder if this will capture
> regressions from locking changes (e.g. a lock being preemptible)? IIUC
> this is counting the instructions executed in these paths, and that
> won't change if the task gets preempted. Lock contention may be captured
> as extra instructions, but I am not sure we'll directly see its effect
> in terms of serialization and delays.

Yeah..

> I think we also need some high level testing (e.g. concurrent
> swapins/swapouts) to find that out. I think that's what Kairui's testing
> covers.

I do a fair amount of high-level testing: heavy parallel (make -j36 and
parallel dd) workloads (multiple zram devices configuration - zram0 ext4,
zram1 writeback device, zram2 swap) w/ and w/o lockdep. In addition I also
run these workloads under heavy memory pressure (a 4GB VM), when oom-killer
starts to run around with a pair of scissors. But it's mostly regression
testing.
February 12, 2025 at 6:18 PM, "Sergey Senozhatsky" <senozhatsky@chromium.org> wrote:
> On (25/02/12 15:35), Yosry Ahmed wrote:
> > > Difference at 95.0% confidence
> > > -1.03219e+08 +/- 55308.7
> > > -27.9705% +/- 0.0149878%
> > > (Student's t, pooled s = 58864.4)
> >
> > Thanks for sharing these results, but I wonder if this will capture
> > regressions from locking changes (e.g. a lock being preemptible)? IIUC
> > this is counting the instructions executed in these paths, and that
> > won't change if the task gets preempted. Lock contention may be captured
> > as extra instructions, but I am not sure we'll directly see its effect
> > in terms of serialization and delays.
>
> Yeah..
>
> > I think we also need some high level testing (e.g. concurrent
> > swapins/swapouts) to find that out. I think that's what Kairui's testing
> > covers.
>
> I do a fair amount of high-level testing: heavy parallel (make -j36 and
> parallel dd) workloads (multiple zram devices configuration - zram0 ext4,
> zram1 writeback device, zram2 swap) w/ and w/o lockdep. In addition I also
> run these workloads under heavy memory pressure (a 4GB VM), when oom-killer
> starts to run around with a pair of scissors. But it's mostly regression
> testing.

If we can get some numbers from these parallel workloads that would be
better than the perf stats imo.
On (25/02/13 02:57), Yosry Ahmed wrote:
> > > I think we also need some high level testing (e.g. concurrent
> > >
> > > swapins/swapouts) to find that out. I think that's what Kairui's testing
> > >
> > > covers.
> > >
> >
> > I do a fair amount of high-level testing: heavy parallel (make -j36 and
> >
> > parallel dd) workloads (multiple zram devices configuration - zram0 ext4,
> >
> > zram1 writeback device, zram2 swap) w/ and w/o lockdep. In addition I also
> >
> > run these workloads under heavy memory pressure (a 4GB VM), when oom-killer
> >
> > starts to run around with a pair of scissors. But it's mostly regression
> >
> > testing.
> >
// JFI it seems your email client/service for some reason injects a lot
// of empty lines
> If we can get some numbers from these parallel workloads that would be better than the perf stats imo.
make -j24 CONFIG_PREEMPT
BASE
====
1363.64user 157.08system 1:30.89elapsed 1673%CPU (0avgtext+0avgdata 825692maxresident)k
lock stats
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 10001 702081 0.14 104.74 125571.64 0.18
&class->lock: 1 1 0.25 0.25 0.25 0.25 6320 840542 0.06 809.72 191214.87 0.23
&zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 6452 664129 0.12 660.24 201888.61 0.30
&zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1716362 3096466 0.07 811.10 365551.24 0.12
&zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664129 1.68 1004.80 14853571.32 22.37
PATCHED
=======
1366.50user 154.89system 1:30.33elapsed 1684%CPU (0avgtext+0avgdata 825692maxresident)k
lock stats
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 3648 701979 0.12 44.09 107333.02 0.15
&class->lock: 0 0 0.00 0.00 0.00 0.00 5038 840434 0.06 1245.90 211814.60 0.25
zsmalloc-page-R: 0 0 0.00 0.00 0.00 0.00 0 664078 0.05 699.35 236641.75 0.36
zram-entry->lock: 0 0 0.00 0.00 0.00 0.00 0 3098328 0.06 2987.02 313339.11 0.10
&per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 664078 1.77 7071.30 14838397.61 22.34
On (25/02/13 16:21), Sergey Senozhatsky wrote:
> BASE
> ====
>
> 1363.64user 157.08system 1:30.89elapsed 1673%CPU (0avgtext+0avgdata 825692maxresident)k
>
> lock stats
>
> class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> &pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 10001 702081 0.14 104.74 125571.64 0.18
> &class->lock: 1 1 0.25 0.25 0.25 0.25 6320 840542 0.06 809.72 191214.87 0.23
> &zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 6452 664129 0.12 660.24 201888.61 0.30
> &zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1716362 3096466 0.07 811.10 365551.24 0.12
> &zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664129 1.68 1004.80 14853571.32 22.37
>
> PATCHED
> =======
>
> 1366.50user 154.89system 1:30.33elapsed 1684%CPU (0avgtext+0avgdata 825692maxresident)k
>
> lock stats
>
> class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> &pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 3648 701979 0.12 44.09 107333.02 0.15
> &class->lock: 0 0 0.00 0.00 0.00 0.00 5038 840434 0.06 1245.90 211814.60 0.25
> zsmalloc-page-R: 0 0 0.00 0.00 0.00 0.00 0 664078 0.05 699.35 236641.75 0.36
> zram-entry->lock: 0 0 0.00 0.00 0.00 0.00 0 3098328 0.06 2987.02 313339.11 0.10
> &per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 664078 1.77 7071.30 14838397.61 22.34
So...
I added lock-stat handling to zspage->lock and to zram (in zram it's only
trylock that we can track, but it doesn't really bother me). I also
renamed zsmalloc-page-R to old zspage->lock-R and zram-entry->lock to
old zram->table[index].lock, just in case anyone cares.
Now the bounce stats for zspage->lock and zram->table[index].lock look
pretty much like in the BASE case.
PATCHED
=======
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 2702 703841 0.22 873.90 197110.49 0.28
&class->lock: 0 0 0.00 0.00 0.00 0.00 4590 842336 0.10 3329.63 256595.70 0.30
zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 4750 665011 0.08 3360.60 258402.21 0.39
zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1722291 3099346 0.12 6943.09 721282.34 0.23
&per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 665011 2.84 7062.18 14896206.16 22.40
On Thu, Feb 13, 2025 at 05:22:20PM +0900, Sergey Senozhatsky wrote:
> On (25/02/13 16:21), Sergey Senozhatsky wrote:
> > BASE
> > ====
> >
> > 1363.64user 157.08system 1:30.89elapsed 1673%CPU (0avgtext+0avgdata 825692maxresident)k
> >
> > lock stats
> >
> > class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> > &pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 10001 702081 0.14 104.74 125571.64 0.18
> > &class->lock: 1 1 0.25 0.25 0.25 0.25 6320 840542 0.06 809.72 191214.87 0.23
> > &zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 6452 664129 0.12 660.24 201888.61 0.30
> > &zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1716362 3096466 0.07 811.10 365551.24 0.12
> > &zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664129 1.68 1004.80 14853571.32 22.37
> >
> > PATCHED
> > =======
> >
> > 1366.50user 154.89system 1:30.33elapsed 1684%CPU (0avgtext+0avgdata 825692maxresident)k
> >
> > lock stats
> >
> > class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> > &pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 3648 701979 0.12 44.09 107333.02 0.15
> > &class->lock: 0 0 0.00 0.00 0.00 0.00 5038 840434 0.06 1245.90 211814.60 0.25
> > zsmalloc-page-R: 0 0 0.00 0.00 0.00 0.00 0 664078 0.05 699.35 236641.75 0.36
> > zram-entry->lock: 0 0 0.00 0.00 0.00 0.00 0 3098328 0.06 2987.02 313339.11 0.10
> > &per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 664078 1.77 7071.30 14838397.61 22.34
>
> So...
>
> I added lock-stat handling to zspage->lock and to zram (in zram it's only
> trylock that we can track, but it doesn't really bother me). I also
> renamed zsmalloc-page-R to old zspage->lock-R and zram-entry->lock to
> old zram->table[index].lock, just in case anyone cares.
>
> Now the bounce stats for zspage->lock and zram->table[index].lock look
> pretty much like in the BASE case.
>
> PATCHED
> =======
>
> class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> &pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 2702 703841 0.22 873.90 197110.49 0.28
> &class->lock: 0 0 0.00 0.00 0.00 0.00 4590 842336 0.10 3329.63 256595.70 0.30
> zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 4750 665011 0.08 3360.60 258402.21 0.39
> zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1722291 3099346 0.12 6943.09 721282.34 0.23
> &per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 665011 2.84 7062.18 14896206.16 22.40

holdtime-max and holdtime-total are higher in the patched kernel. Not sure
if this is just an artifact of lock holders being preemptible.
On (25/02/13 15:25), Yosry Ahmed wrote:
> On Thu, Feb 13, 2025 at 05:22:20PM +0900, Sergey Senozhatsky wrote:
> > On (25/02/13 16:21), Sergey Senozhatsky wrote:
> > > BASE
> > > ====
> > >
> > > 1363.64user 157.08system 1:30.89elapsed 1673%CPU (0avgtext+0avgdata 825692maxresident)k
> > >
> > > lock stats
> > >
> > > class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> > > &pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 10001 702081 0.14 104.74 125571.64 0.18
> > > &class->lock: 1 1 0.25 0.25 0.25 0.25 6320 840542 0.06 809.72 191214.87 0.23
> > > &zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 6452 664129 0.12 660.24 201888.61 0.30
> > > &zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1716362 3096466 0.07 811.10 365551.24 0.12
> > > &zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664129 1.68 1004.80 14853571.32 22.37
> > >
> > > PATCHED
> > > =======
> > >
> > > 1366.50user 154.89system 1:30.33elapsed 1684%CPU (0avgtext+0avgdata 825692maxresident)k
> > >
> > > lock stats
> > >
> > > class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> > > &pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 3648 701979 0.12 44.09 107333.02 0.15
> > > &class->lock: 0 0 0.00 0.00 0.00 0.00 5038 840434 0.06 1245.90 211814.60 0.25
> > > zsmalloc-page-R: 0 0 0.00 0.00 0.00 0.00 0 664078 0.05 699.35 236641.75 0.36
> > > zram-entry->lock: 0 0 0.00 0.00 0.00 0.00 0 3098328 0.06 2987.02 313339.11 0.10
> > > &per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 664078 1.77 7071.30 14838397.61 22.34
> >
> > So...
> >
> > I added lock-stat handling to zspage->lock and to zram (in zram it's only
> > trylock that we can track, but it doesn't really bother me). I also
> > renamed zsmalloc-page-R to old zspage->lock-R and zram-entry->lock to
> > old zram->table[index].lock, just in case anyone cares.
> >
> > Now the bounce stats for zspage->lock and zram->table[index].lock look
> > pretty much like in the BASE case.
> >
> > PATCHED
> > =======
> >
> > class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> > &pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 2702 703841 0.22 873.90 197110.49 0.28
> > &class->lock: 0 0 0.00 0.00 0.00 0.00 4590 842336 0.10 3329.63 256595.70 0.30
> > zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 4750 665011 0.08 3360.60 258402.21 0.39
> > zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1722291 3099346 0.12 6943.09 721282.34 0.23
> > &per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 665011 2.84 7062.18 14896206.16 22.40
> >
>
> holdtime-max and holdtime-total are higher in the patched kernel. Not
> sure if this is just an artifact of lock holders being preemptible.
Hmm, pool->lock shouldn't be affected at all, yet BASE holds it much
longer than PATCHED:

              holdtime-max    holdtime-total
BASE          104.74          125571.64
PATCHED       44.09           107333.02

That doesn't make sense. I can understand zspage->lock and
zram->table[index].lock changing, but for zram->table[index].lock the
numbers look strange as well (comparing run #1 and #2):

              holdtime-total
BASE          365551.24
PATCHED       313339.11

And run #3 is in its own league.

Very likely this is just a very, very bad way to test things.
Re-based on 6.14.0-rc2-next-20250213.
BASE
====
PREEMPT_NONE
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 3624 702276 0.15 35.96 126562.90 0.18
&class->lock: 0 0 0.00 0.00 0.00 0.00 5084 840733 0.06 795.26 183238.22 0.22
&zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 5358 664228 0.12 43.71 192732.71 0.29
&zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1528645 3095862 0.07 764.76 370881.23 0.12
&zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664228 2.52 2033.81 14605911.45 21.99
PREEMPT_VOLUNTARY
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 3039 699556 0.14 50.78 125553.59 0.18
&class->lock: 0 0 0.00 0.00 0.00 0.00 5259 838005 0.06 943.43 177108.05 0.21
&zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 5581 664096 0.12 81.56 190235.48 0.29
&zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1731706 3098570 0.07 796.87 366934.54 0.12
&zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664096 3.38 5074.72 14472697.91 21.79
PREEMPT
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 2545 701827 0.14 773.56 125463.37 0.18
&class->lock: 0 0 0.00 0.00 0.00 0.00 4697 840281 0.06 1701.18 231657.38 0.28
&zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 4778 664002 0.12 755.62 181215.17 0.27
&zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1731737 3096937 0.07 1703.92 384633.29 0.12
&zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664002 2.85 3603.20 14586900.58 21.97
So somehow holdtime-max for the per-CPU stream lock is 2.5x higher for
PREEMPT_VOLUNTARY than for PREEMPT_NONE. And class->lock holdtime-total is
much higher for PREEMPT than for any other preemption model. And that's the
BASE kernel, which runs fully atomic zsmalloc and zram. I call this rubbish.
On (25/02/06 12:05), Sergey Senozhatsky wrote:
>
> Sure. That's what it currently looks like (can always improve)
>
- added __must_check
- added preemptible() check // just in case
- added locking rules list
Oh, and also switched to acquire/release semantics, like you suggested
a couple of days ago.
---
/*
 * zspage locking rules:
 *
 * 1) writer-lock is exclusive
 *
 * 2) writer-lock owner cannot sleep
 *
 * 3) writer-lock owner cannot spin waiting for the lock
 *    - caller (e.g. compaction and migration) must check return value and
 *      handle locking failures
 *    - there is only TRY variant of writer-lock function
 *
 * 4) reader-lock owners (multiple) can sleep
 *
 * 5) reader-lock owners can spin waiting for the lock, in any context
 *    - existing readers (even preempted ones) don't block new readers
 *    - writer-lock owners never sleep, always unlock at some point
 */
static void zspage_read_lock(struct zspage *zspage)
{
        atomic_t *lock = &zspage->lock;
        int old = atomic_read_acquire(lock);

        do {
                if (old == ZS_PAGE_WRLOCKED) {
                        cpu_relax();
                        old = atomic_read_acquire(lock);
                        continue;
                }
        } while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
#endif
}

static void zspage_read_unlock(struct zspage *zspage)
{
        atomic_dec_return_release(&zspage->lock);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        rwsem_release(&zspage->lockdep_map, _RET_IP_);
#endif
}

static __must_check bool zspage_try_write_lock(struct zspage *zspage)
{
        atomic_t *lock = &zspage->lock;
        int old = ZS_PAGE_UNLOCKED;

        WARN_ON_ONCE(preemptible());

        preempt_disable();
        if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
                rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
#endif
                return true;
        }

        preempt_enable();
        return false;
}

static void zspage_write_unlock(struct zspage *zspage)
{
        atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        rwsem_release(&zspage->lockdep_map, _RET_IP_);
#endif
        preempt_enable();
}
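To make rules 1)-5) above concrete, here is a minimal caller sketch.
reader_path() and writer_path() are hypothetical names; in the series the
actual readers are the object map/read paths and the only writers are
migration and compaction.

/*
 * Reader side: may be called from preemptible context and may even be
 * preempted while holding the read-lock; existing readers never block
 * new readers.
 */
static void reader_path(struct zspage *zspage)
{
        zspage_read_lock(zspage);
        /* access objects backed by this zspage */
        zspage_read_unlock(zspage);
}

/*
 * Writer side: must already run in atomic context (preemption disabled),
 * so it cannot spin on a possibly preempted reader - it try-locks and
 * bails out on failure.
 */
static bool writer_path(struct zspage *zspage)
{
        if (!zspage_try_write_lock(zspage))
                return false;   /* zspage is busy, caller backs off */

        /* migrate/compact objects of this zspage */

        zspage_write_unlock(zspage);
        return true;
}

The asymmetry is intentional: because readers can be preempted under the
lock, writers must never wait for them; because writers never schedule
under the lock, readers can safely spin on ZS_PAGE_WRLOCKED.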
On (25/02/03 12:13), Sergey Senozhatsky wrote:
> > Just my 2c.
>
> Perhaps we can sprinkle some lockdep on it.
I forgot to call rwsem_release() in zspage_write_unlock() and that
triggered lockdep :)
---
mm/zsmalloc.c | 27 ++++++++++++++++++++++++++-
1 file changed, 26 insertions(+), 1 deletion(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 956445f4d554..1d4700e457d4 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -308,6 +308,10 @@ struct zspage {
struct list_head list; /* fullness list */
struct zs_pool *pool;
atomic_t lock;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map lockdep_map;
+#endif
};
struct mapping_area {
@@ -319,6 +323,12 @@ struct mapping_area {
static void zspage_lock_init(struct zspage *zspage)
{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key key;
+
+ lockdep_init_map(&zspage->lockdep_map, "zsmalloc-page", &key, 0);
+#endif
+
atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
}
@@ -344,11 +354,19 @@ static void zspage_read_lock(struct zspage *zspage)
continue;
}
} while (!atomic_try_cmpxchg(lock, &old, old + 1));
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
+#endif
}
static void zspage_read_unlock(struct zspage *zspage)
{
atomic_dec(&zspage->lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_release(&zspage->lockdep_map, _RET_IP_);
+#endif
}
static bool zspage_try_write_lock(struct zspage *zspage)
@@ -357,8 +375,12 @@ static bool zspage_try_write_lock(struct zspage *zspage)
int old = ZS_PAGE_UNLOCKED;
preempt_disable();
- if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
+ if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) {
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
+#endif
return true;
+ }
preempt_enable();
return false;
@@ -367,6 +389,9 @@ static bool zspage_try_write_lock(struct zspage *zspage)
static void zspage_write_unlock(struct zspage *zspage)
{
atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_release(&zspage->lockdep_map, _RET_IP_);
+#endif
preempt_enable();
}
--
2.48.1.362.g079036d154-goog