Switch over from rwlock_t to an atomic_t variable that takes a negative
value when the page is under migration, or a positive value when the
page is used by zsmalloc users (object mapping, etc.) Using a per-zspage
rwsem is a little too memory heavy; a simple atomic_t should suffice.
The zspage lock is a leaf lock for zs_map_object(), where it is read-acquired.
Since this lock now permits preemption, extra care needs to be taken when
it is write-acquired - all writers grab it in atomic context, so they
cannot spin and wait for a (potentially preempted) reader to unlock the
zspage. There are only two writers at the moment - migration and
compaction. In both cases we use write-try-lock and bail out if the zspage
is read-locked. Writers, on the other hand, never get preempted, so readers
can spin waiting for the writer to unlock the zspage.
With this we can implement a preemptible object mapping.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
---
mm/zsmalloc.c | 135 +++++++++++++++++++++++++++++++-------------------
1 file changed, 83 insertions(+), 52 deletions(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 4b4c77bc08f9..f5b5fe732e50 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -292,6 +292,9 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
__free_page(page);
}
+#define ZS_PAGE_UNLOCKED 0
+#define ZS_PAGE_WRLOCKED -1
+
struct zspage {
struct {
unsigned int huge:HUGE_BITS;
@@ -304,7 +307,7 @@ struct zspage {
struct zpdesc *first_zpdesc;
struct list_head list; /* fullness list */
struct zs_pool *pool;
- rwlock_t lock;
+ atomic_t lock;
};
struct mapping_area {
@@ -314,6 +317,59 @@ struct mapping_area {
enum zs_mapmode vm_mm; /* mapping mode */
};
+static void zspage_lock_init(struct zspage *zspage)
+{
+ atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
+}
+
+/*
+ * zspage lock permits preemption on the reader-side (there can be multiple
+ * readers). Writers (exclusive zspage ownership), on the other hand, are
+ * always run in atomic context and cannot spin waiting for a (potentially
+ * preempted) reader to unlock zspage. This, basically, means that writers
+ * can only call write-try-lock and must bail out if it didn't succeed.
+ *
+ * At the same time, writers cannot reschedule under zspage write-lock,
+ * so readers can spin waiting for the writer to unlock zspage.
+ */
+static void zspage_read_lock(struct zspage *zspage)
+{
+ atomic_t *lock = &zspage->lock;
+ int old = atomic_read(lock);
+
+ do {
+ if (old == ZS_PAGE_WRLOCKED) {
+ cpu_relax();
+ old = atomic_read(lock);
+ continue;
+ }
+ } while (!atomic_try_cmpxchg(lock, &old, old + 1));
+}
+
+static void zspage_read_unlock(struct zspage *zspage)
+{
+ atomic_dec(&zspage->lock);
+}
+
+static bool zspage_try_write_lock(struct zspage *zspage)
+{
+ atomic_t *lock = &zspage->lock;
+ int old = ZS_PAGE_UNLOCKED;
+
+ preempt_disable();
+ if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
+ return true;
+
+ preempt_enable();
+ return false;
+}
+
+static void zspage_write_unlock(struct zspage *zspage)
+{
+ atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
+ preempt_enable();
+}
+
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
static void SetZsHugePage(struct zspage *zspage)
{
@@ -325,12 +381,6 @@ static bool ZsHugePage(struct zspage *zspage)
return zspage->huge;
}
-static void lock_init(struct zspage *zspage);
-static void migrate_read_lock(struct zspage *zspage);
-static void migrate_read_unlock(struct zspage *zspage);
-static void migrate_write_lock(struct zspage *zspage);
-static void migrate_write_unlock(struct zspage *zspage);
-
#ifdef CONFIG_COMPACTION
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
@@ -1026,7 +1076,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
return NULL;
zspage->magic = ZSPAGE_MAGIC;
- lock_init(zspage);
+ zspage_lock_init(zspage);
for (i = 0; i < class->pages_per_zspage; i++) {
struct zpdesc *zpdesc;
@@ -1251,7 +1301,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
* zs_unmap_object API so delegate the locking from class to zspage
* which is smaller granularity.
*/
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
pool_read_unlock(pool);
class = zspage_class(pool, zspage);
@@ -1311,7 +1361,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
}
local_unlock(&zs_map_area.lock);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
@@ -1705,18 +1755,18 @@ static void lock_zspage(struct zspage *zspage)
/*
* Pages we haven't locked yet can be migrated off the list while we're
* trying to lock them, so we need to be careful and only attempt to
- * lock each page under migrate_read_lock(). Otherwise, the page we lock
+ * lock each page under zspage_read_lock(). Otherwise, the page we lock
* may no longer belong to the zspage. This means that we may wait for
* the wrong page to unlock, so we must take a reference to the page
- * prior to waiting for it to unlock outside migrate_read_lock().
+ * prior to waiting for it to unlock outside zspage_read_lock().
*/
while (1) {
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
zpdesc = get_first_zpdesc(zspage);
if (zpdesc_trylock(zpdesc))
break;
zpdesc_get(zpdesc);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
zpdesc_wait_locked(zpdesc);
zpdesc_put(zpdesc);
}
@@ -1727,41 +1777,16 @@ static void lock_zspage(struct zspage *zspage)
curr_zpdesc = zpdesc;
} else {
zpdesc_get(zpdesc);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
zpdesc_wait_locked(zpdesc);
zpdesc_put(zpdesc);
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
}
}
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
#endif /* CONFIG_COMPACTION */
-static void lock_init(struct zspage *zspage)
-{
- rwlock_init(&zspage->lock);
-}
-
-static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
-{
- read_lock(&zspage->lock);
-}
-
-static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
-{
- read_unlock(&zspage->lock);
-}
-
-static void migrate_write_lock(struct zspage *zspage)
-{
- write_lock(&zspage->lock);
-}
-
-static void migrate_write_unlock(struct zspage *zspage)
-{
- write_unlock(&zspage->lock);
-}
-
#ifdef CONFIG_COMPACTION
static const struct movable_operations zsmalloc_mops;
@@ -1803,7 +1828,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
}
static int zs_page_migrate(struct page *newpage, struct page *page,
- enum migrate_mode mode)
+ enum migrate_mode mode)
{
struct zs_pool *pool;
struct size_class *class;
@@ -1819,15 +1844,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc));
- /* We're committed, tell the world that this is a Zsmalloc page. */
- __zpdesc_set_zsmalloc(newzpdesc);
-
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(zpdesc);
pool = zspage->pool;
/*
- * The pool lock protects the race between zpage migration
+ * The pool->lock protects the race between zpage migration
* and zs_free.
*/
pool_write_lock(pool);
@@ -1837,8 +1859,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
* the class lock protects zpage alloc/free in the zspage.
*/
size_class_lock(class);
- /* the migrate_write_lock protects zpage access via zs_map_object */
- migrate_write_lock(zspage);
+ /* the zspage write_lock protects zpage access via zs_map_object */
+ if (!zspage_try_write_lock(zspage)) {
+ size_class_unlock(class);
+ pool_write_unlock(pool);
+ return -EINVAL;
+ }
+
+ /* We're committed, tell the world that this is a Zsmalloc page. */
+ __zpdesc_set_zsmalloc(newzpdesc);
offset = get_first_obj_offset(zpdesc);
s_addr = kmap_local_zpdesc(zpdesc);
@@ -1869,7 +1898,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
*/
pool_write_unlock(pool);
size_class_unlock(class);
- migrate_write_unlock(zspage);
+ zspage_write_unlock(zspage);
zpdesc_get(newzpdesc);
if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
@@ -2005,9 +2034,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
if (!src_zspage)
break;
- migrate_write_lock(src_zspage);
+ if (!zspage_try_write_lock(src_zspage))
+ break;
+
migrate_zspage(pool, src_zspage, dst_zspage);
- migrate_write_unlock(src_zspage);
+ zspage_write_unlock(src_zspage);
fg = putback_zspage(class, src_zspage);
if (fg == ZS_INUSE_RATIO_0) {
--
2.48.1.362.g079036d154-goog
On Fri, Jan 31, 2025 at 06:06:13PM +0900, Sergey Senozhatsky wrote:
> Switch over from rwlock_t to a atomic_t variable that takes negative
> value when the page is under migration, or positive values when the
> page is used by zsmalloc users (object map, etc.) Using a rwsem
> per-zspage is a little too memory heavy, a simple atomic_t should
> suffice.
>
> zspage lock is a leaf lock for zs_map_object(), where it's read-acquired.
> Since this lock now permits preemption extra care needs to be taken when
> it is write-acquired - all writers grab it in atomic context, so they
> cannot spin and wait for (potentially preempted) reader to unlock zspage.
> There are only two writers at this moment - migration and compaction. In
> both cases we use write-try-lock and bail out if zspage is read locked.
> Writers, on the other hand, never get preempted, so readers can spin
> waiting for the writer to unlock zspage.
>
> With this we can implement a preemptible object mapping.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
> ---
> mm/zsmalloc.c | 135 +++++++++++++++++++++++++++++++-------------------
> 1 file changed, 83 insertions(+), 52 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 4b4c77bc08f9..f5b5fe732e50 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -292,6 +292,9 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
> __free_page(page);
> }
>
> +#define ZS_PAGE_UNLOCKED 0
> +#define ZS_PAGE_WRLOCKED -1
> +
> struct zspage {
> struct {
> unsigned int huge:HUGE_BITS;
> @@ -304,7 +307,7 @@ struct zspage {
> struct zpdesc *first_zpdesc;
> struct list_head list; /* fullness list */
> struct zs_pool *pool;
> - rwlock_t lock;
> + atomic_t lock;
> };
>
> struct mapping_area {
> @@ -314,6 +317,59 @@ struct mapping_area {
> enum zs_mapmode vm_mm; /* mapping mode */
> };
>
> +static void zspage_lock_init(struct zspage *zspage)
> +{
> + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
> +}
> +
> +/*
> + * zspage lock permits preemption on the reader-side (there can be multiple
> + * readers). Writers (exclusive zspage ownership), on the other hand, are
> + * always run in atomic context and cannot spin waiting for a (potentially
> + * preempted) reader to unlock zspage. This, basically, means that writers
> + * can only call write-try-lock and must bail out if it didn't succeed.
> + *
> + * At the same time, writers cannot reschedule under zspage write-lock,
> + * so readers can spin waiting for the writer to unlock zspage.
> + */
> +static void zspage_read_lock(struct zspage *zspage)
> +{
> + atomic_t *lock = &zspage->lock;
> + int old = atomic_read(lock);
> +
> + do {
> + if (old == ZS_PAGE_WRLOCKED) {
> + cpu_relax();
> + old = atomic_read(lock);
> + continue;
> + }
> + } while (!atomic_try_cmpxchg(lock, &old, old + 1));
> +}
> +
> +static void zspage_read_unlock(struct zspage *zspage)
> +{
> + atomic_dec(&zspage->lock);
> +}
> +
> +static bool zspage_try_write_lock(struct zspage *zspage)
> +{
> + atomic_t *lock = &zspage->lock;
> + int old = ZS_PAGE_UNLOCKED;
> +
> + preempt_disable();
> + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
FWIW, I am usually afraid to manually implement locking like this. For
example, queued_spin_trylock() uses atomic_try_cmpxchg_acquire() not
atomic_try_cmpxchg(), and I am not quite sure what could happen without
ACQUIRE semantics here on some architectures.
We also lose some debugging capabilities as Hilf pointed out in another
patch.
Just my 2c.
> + return true;
> +
> + preempt_enable();
> + return false;
> +}
> +
> +static void zspage_write_unlock(struct zspage *zspage)
> +{
> + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
> + preempt_enable();
> +}
> +
> /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
> static void SetZsHugePage(struct zspage *zspage)
> {
> @@ -325,12 +381,6 @@ static bool ZsHugePage(struct zspage *zspage)
> return zspage->huge;
> }
>
> -static void lock_init(struct zspage *zspage);
> -static void migrate_read_lock(struct zspage *zspage);
> -static void migrate_read_unlock(struct zspage *zspage);
> -static void migrate_write_lock(struct zspage *zspage);
> -static void migrate_write_unlock(struct zspage *zspage);
> -
> #ifdef CONFIG_COMPACTION
> static void kick_deferred_free(struct zs_pool *pool);
> static void init_deferred_free(struct zs_pool *pool);
> @@ -1026,7 +1076,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
> return NULL;
>
> zspage->magic = ZSPAGE_MAGIC;
> - lock_init(zspage);
> + zspage_lock_init(zspage);
>
> for (i = 0; i < class->pages_per_zspage; i++) {
> struct zpdesc *zpdesc;
> @@ -1251,7 +1301,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> * zs_unmap_object API so delegate the locking from class to zspage
> * which is smaller granularity.
> */
> - migrate_read_lock(zspage);
> + zspage_read_lock(zspage);
> pool_read_unlock(pool);
>
> class = zspage_class(pool, zspage);
> @@ -1311,7 +1361,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
> }
> local_unlock(&zs_map_area.lock);
>
> - migrate_read_unlock(zspage);
> + zspage_read_unlock(zspage);
> }
> EXPORT_SYMBOL_GPL(zs_unmap_object);
>
> @@ -1705,18 +1755,18 @@ static void lock_zspage(struct zspage *zspage)
> /*
> * Pages we haven't locked yet can be migrated off the list while we're
> * trying to lock them, so we need to be careful and only attempt to
> - * lock each page under migrate_read_lock(). Otherwise, the page we lock
> + * lock each page under zspage_read_lock(). Otherwise, the page we lock
> * may no longer belong to the zspage. This means that we may wait for
> * the wrong page to unlock, so we must take a reference to the page
> - * prior to waiting for it to unlock outside migrate_read_lock().
> + * prior to waiting for it to unlock outside zspage_read_lock().
> */
> while (1) {
> - migrate_read_lock(zspage);
> + zspage_read_lock(zspage);
> zpdesc = get_first_zpdesc(zspage);
> if (zpdesc_trylock(zpdesc))
> break;
> zpdesc_get(zpdesc);
> - migrate_read_unlock(zspage);
> + zspage_read_unlock(zspage);
> zpdesc_wait_locked(zpdesc);
> zpdesc_put(zpdesc);
> }
> @@ -1727,41 +1777,16 @@ static void lock_zspage(struct zspage *zspage)
> curr_zpdesc = zpdesc;
> } else {
> zpdesc_get(zpdesc);
> - migrate_read_unlock(zspage);
> + zspage_read_unlock(zspage);
> zpdesc_wait_locked(zpdesc);
> zpdesc_put(zpdesc);
> - migrate_read_lock(zspage);
> + zspage_read_lock(zspage);
> }
> }
> - migrate_read_unlock(zspage);
> + zspage_read_unlock(zspage);
> }
> #endif /* CONFIG_COMPACTION */
>
> -static void lock_init(struct zspage *zspage)
> -{
> - rwlock_init(&zspage->lock);
> -}
> -
> -static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
> -{
> - read_lock(&zspage->lock);
> -}
> -
> -static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
> -{
> - read_unlock(&zspage->lock);
> -}
> -
> -static void migrate_write_lock(struct zspage *zspage)
> -{
> - write_lock(&zspage->lock);
> -}
> -
> -static void migrate_write_unlock(struct zspage *zspage)
> -{
> - write_unlock(&zspage->lock);
> -}
> -
> #ifdef CONFIG_COMPACTION
>
> static const struct movable_operations zsmalloc_mops;
> @@ -1803,7 +1828,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
> }
>
> static int zs_page_migrate(struct page *newpage, struct page *page,
> - enum migrate_mode mode)
> + enum migrate_mode mode)
> {
> struct zs_pool *pool;
> struct size_class *class;
> @@ -1819,15 +1844,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
>
> VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc));
>
> - /* We're committed, tell the world that this is a Zsmalloc page. */
> - __zpdesc_set_zsmalloc(newzpdesc);
> -
> /* The page is locked, so this pointer must remain valid */
> zspage = get_zspage(zpdesc);
> pool = zspage->pool;
>
> /*
> - * The pool lock protects the race between zpage migration
> + * The pool->lock protects the race between zpage migration
> * and zs_free.
> */
> pool_write_lock(pool);
> @@ -1837,8 +1859,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
> * the class lock protects zpage alloc/free in the zspage.
> */
> size_class_lock(class);
> - /* the migrate_write_lock protects zpage access via zs_map_object */
> - migrate_write_lock(zspage);
> + /* the zspage write_lock protects zpage access via zs_map_object */
> + if (!zspage_try_write_lock(zspage)) {
> + size_class_unlock(class);
> + pool_write_unlock(pool);
> + return -EINVAL;
> + }
> +
> + /* We're committed, tell the world that this is a Zsmalloc page. */
> + __zpdesc_set_zsmalloc(newzpdesc);
>
> offset = get_first_obj_offset(zpdesc);
> s_addr = kmap_local_zpdesc(zpdesc);
> @@ -1869,7 +1898,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
> */
> pool_write_unlock(pool);
> size_class_unlock(class);
> - migrate_write_unlock(zspage);
> + zspage_write_unlock(zspage);
>
> zpdesc_get(newzpdesc);
> if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
> @@ -2005,9 +2034,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
> if (!src_zspage)
> break;
>
> - migrate_write_lock(src_zspage);
> + if (!zspage_try_write_lock(src_zspage))
> + break;
> +
> migrate_zspage(pool, src_zspage, dst_zspage);
> - migrate_write_unlock(src_zspage);
> + zspage_write_unlock(src_zspage);
>
> fg = putback_zspage(class, src_zspage);
> if (fg == ZS_INUSE_RATIO_0) {
> --
> 2.48.1.362.g079036d154-goog
>
On (25/01/31 15:51), Yosry Ahmed wrote:
> > +static void zspage_read_lock(struct zspage *zspage)
> > +{
> > + atomic_t *lock = &zspage->lock;
> > + int old = atomic_read(lock);
> > +
> > + do {
> > + if (old == ZS_PAGE_WRLOCKED) {
> > + cpu_relax();
> > + old = atomic_read(lock);
> > + continue;
> > + }
> > + } while (!atomic_try_cmpxchg(lock, &old, old + 1));
> > +}
> > +
> > +static void zspage_read_unlock(struct zspage *zspage)
> > +{
> > + atomic_dec(&zspage->lock);
> > +}
> > +
> > +static bool zspage_try_write_lock(struct zspage *zspage)
> > +{
> > + atomic_t *lock = &zspage->lock;
> > + int old = ZS_PAGE_UNLOCKED;
> > +
> > + preempt_disable();
> > + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
>
> FWIW, I am usually afraid to manually implement locking like this. For
> example, queued_spin_trylock() uses atomic_try_cmpxchg_acquire() not
> atomic_try_cmpxchg(), and I am not quite sure what could happen without
> ACQUIRE semantics here on some architectures.
I looked into it a bit and wasn't sure either. Perhaps we can switch
to acquire/release semantics; I'm not an expert on this and would highly
appreciate help.
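For example, something along these lines, using the acquire/release atomic
helpers (a completely untested sketch):

---
static void zspage_read_lock(struct zspage *zspage)
{
	atomic_t *lock = &zspage->lock;
	int old = atomic_read_acquire(lock);

	do {
		if (old == ZS_PAGE_WRLOCKED) {
			cpu_relax();
			old = atomic_read_acquire(lock);
			continue;
		}
	} while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
}

static void zspage_read_unlock(struct zspage *zspage)
{
	atomic_dec_return_release(&zspage->lock);
}

static bool zspage_try_write_lock(struct zspage *zspage)
{
	atomic_t *lock = &zspage->lock;
	int old = ZS_PAGE_UNLOCKED;

	preempt_disable();
	/* acquire on success pairs with the release in zspage_write_unlock() */
	if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED))
		return true;

	preempt_enable();
	return false;
}

static void zspage_write_unlock(struct zspage *zspage)
{
	atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
	preempt_enable();
}
---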
> We also lose some debugging capabilities as Hilf pointed out in another
> patch.
So that zspage lock, I think, should not have been a lock to begin with -
it's a ref-counter and it's being used as one:
map()
{
	page->users++;
}

unmap()
{
	page->users--;
}

migrate()
{
	if (!page->users)
		migrate_page();
}
> Just my 2c.
Perhaps we can sprinkle some lockdep on it. For instance:
---
mm/zsmalloc.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 956445f4d554..06b1d8ca9e89 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -308,6 +308,10 @@ struct zspage {
struct list_head list; /* fullness list */
struct zs_pool *pool;
atomic_t lock;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map lockdep_map;
+#endif
};
struct mapping_area {
@@ -319,6 +323,12 @@ struct mapping_area {
static void zspage_lock_init(struct zspage *zspage)
{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key key;
+
+ lockdep_init_map(&zspage->lockdep_map, "zsmalloc-page", &key, 0);
+#endif
+
atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
}
@@ -344,11 +354,19 @@ static void zspage_read_lock(struct zspage *zspage)
continue;
}
} while (!atomic_try_cmpxchg(lock, &old, old + 1));
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
+#endif
}
static void zspage_read_unlock(struct zspage *zspage)
{
atomic_dec(&zspage->lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_release(&zspage->lockdep_map, _RET_IP_);
+#endif
}
static bool zspage_try_write_lock(struct zspage *zspage)
@@ -357,8 +375,12 @@ static bool zspage_try_write_lock(struct zspage *zspage)
int old = ZS_PAGE_UNLOCKED;
preempt_disable();
- if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
+ if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) {
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
+#endif
return true;
+ }
preempt_enable();
return false;
--
2.48.1.362.g079036d154-goog
On Mon, Feb 03, 2025 at 12:13:49PM +0900, Sergey Senozhatsky wrote:
> On (25/01/31 15:51), Yosry Ahmed wrote:
> > > +static void zspage_read_lock(struct zspage *zspage)
> > > +{
> > > + atomic_t *lock = &zspage->lock;
> > > + int old = atomic_read(lock);
> > > +
> > > + do {
> > > + if (old == ZS_PAGE_WRLOCKED) {
> > > + cpu_relax();
> > > + old = atomic_read(lock);
> > > + continue;
> > > + }
> > > + } while (!atomic_try_cmpxchg(lock, &old, old + 1));
> > > +}
> > > +
> > > +static void zspage_read_unlock(struct zspage *zspage)
> > > +{
> > > + atomic_dec(&zspage->lock);
> > > +}
> > > +
> > > +static bool zspage_try_write_lock(struct zspage *zspage)
> > > +{
> > > + atomic_t *lock = &zspage->lock;
> > > + int old = ZS_PAGE_UNLOCKED;
> > > +
> > > + preempt_disable();
> > > + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
> >
> > FWIW, I am usually afraid to manually implement locking like this. For
> > example, queued_spin_trylock() uses atomic_try_cmpxchg_acquire() not
> > atomic_try_cmpxchg(), and I am not quite sure what could happen without
> > ACQUIRE semantics here on some architectures.
>
> I looked into it a bit, wasn't sure either. Perhaps we can switch
> to acquire/release semantics, I'm not an expert on this, would highly
> appreciate help.
>
> > We also lose some debugging capabilities as Hilf pointed out in another
> > patch.
>
> So that zspage lock should have not been a lock, I think, it's a ref-counter
> and it's being used as one
>
> map()
> {
> page->users++;
> }
>
> unmap()
> {
> page->users--;
> }
>
> migrate()
> {
> if (!page->users)
> migrate_page();
> }
Hmm, but in this case we want migration to block new map/unmap
operations. So a vanilla refcount won't work.
>
> > Just my 2c.
>
> Perhaps we can sprinkle some lockdep on it. For instance:
Honestly this looks like more reason to use existing lock primitives to
me. What are the candidates? I assume rw_semaphore, anything else?
I guess the main reason you didn't use a rw_semaphore is the extra
memory usage. Seems like it uses ~32 bytes more than rwlock_t on x86_64.
That's per zspage. Depending on how many compressed pages we have
per-zspage this may not be too bad.
For example, if a zspage has a chain length of 4, and the average
compression ratio of 1/3, that's 12 compressed pages so the extra
overhead is <3 bytes per compressed page.
Given that the chain length is a function of the class size, I think we
can calculate the exact extra memory overhead per-compressed page for
each class and get a mean/median over all classes.
If the memory overhead is insignificant I'd rather use existing lock
primitives tbh.
On (25/02/03 21:11), Yosry Ahmed wrote:
> > > We also lose some debugging capabilities as Hilf pointed out in another
> > > patch.
> >
> > So that zspage lock should have not been a lock, I think, it's a ref-counter
> > and it's being used as one
> >
> > map()
> > {
> > page->users++;
> > }
> >
> > unmap()
> > {
> > page->users--;
> > }
> >
> > migrate()
> > {
> > if (!page->users)
> > migrate_page();
> > }
>
> Hmm, but in this case we want migration to block new map/unmap
> operations. So a vanilla refcount won't work.
Yeah, correct - migration needs negative values so that map would
wait until it's positive (or zero).
> > > Just my 2c.
> >
> > Perhaps we can sprinkle some lockdep on it. For instance:
>
> Honestly this looks like more reason to use existing lock primitives to
> me. What are the candidates? I assume rw_semaphore, anything else?
Right, rwsem "was" the first choice.
> I guess the main reason you didn't use a rw_semaphore is the extra
> memory usage.
sizeof(struct zs_page) change is one thing. Another thing is that
zspage->lock is taken from atomic sections, pretty much everywhere.
compaction/migration write-lock it under pool rwlock and class spinlock,
but both compaction and migration now EAGAIN if the lock is locked
already, so that is sorted out.
The remaining problem is map(), which takes zspage read-lock under pool
rwlock. RFC series (which you hated with passion :P) converted all zsmalloc
into preemptible ones because of this - zspage->lock is a nested leaf-lock,
so it cannot schedule unless locks it's nested under permit it (needless to
say neither rwlock nor spinlock permit it).
> Seems like it uses ~32 bytes more than rwlock_t on x86_64.
> That's per zspage. Depending on how many compressed pages we have
> per-zspage this may not be too bad.
So on a 16GB laptop our memory pressure test at peak used approx 1M zspages.
That is 32 bytes * 1M ~ 32MB of extra memory use. Not alarmingly a lot,
less than what a single browser tab needs nowadays. I suppose on 4GB/8GB
that will be even smaller (because those devices generate fewer zspages).
Numbers are not the main issue, however.
On Tue, Feb 04, 2025 at 03:59:42PM +0900, Sergey Senozhatsky wrote:
> On (25/02/03 21:11), Yosry Ahmed wrote:
> > > > We also lose some debugging capabilities as Hilf pointed out in another
> > > > patch.
> > >
> > > So that zspage lock should have not been a lock, I think, it's a ref-counter
> > > and it's being used as one
> > >
> > > map()
> > > {
> > > page->users++;
> > > }
> > >
> > > unmap()
> > > {
> > > page->users--;
> > > }
> > >
> > > migrate()
> > > {
> > > if (!page->users)
> > > migrate_page();
> > > }
> >
> > Hmm, but in this case we want migration to block new map/unmap
> > operations. So a vanilla refcount won't work.
>
> Yeah, correct - migration needs negative values so that map would
> wait until it's positive (or zero).
>
> > > > Just my 2c.
> > >
> > > Perhaps we can sprinkle some lockdep on it. For instance:
> >
> > Honestly this looks like more reason to use existing lock primitives to
> > me. What are the candidates? I assume rw_semaphore, anything else?
>
> Right, rwsem "was" the first choice.
>
> > I guess the main reason you didn't use a rw_semaphore is the extra
> > memory usage.
>
> sizeof(struct zs_page) change is one thing. Another thing is that
> zspage->lock is taken from atomic sections, pretty much everywhere.
> compaction/migration write-lock it under pool rwlock and class spinlock,
> but both compaction and migration now EAGAIN if the lock is locked
> already, so that is sorted out.
>
> The remaining problem is map(), which takes zspage read-lock under pool
> rwlock. RFC series (which you hated with passion :P) converted all zsmalloc
> into preemptible ones because of this - zspage->lock is a nested leaf-lock,
> so it cannot schedule unless locks it's nested under permit it (needless to
> say neither rwlock nor spinlock permit it).
Hmm, so we want the lock to be preemptible, but we don't want to use an
existing preemptible lock because it may be held from atomic context.
I think one problem here is that the lock you are introducing is a
spinning lock but the lock holder can be preempted. This is why spinning
locks do not allow preemption. Others waiting for the lock can spin
waiting for a process that is scheduled out.
For example, the compaction/migration code could be sleeping holding the
write lock, and a map() call would spin waiting for that sleeping task.
I wonder if there's a way to rework the locking instead to avoid the
nesting. It seems like sometimes we lock the zspage with the pool lock
held, sometimes with the class lock held, and sometimes with no lock
held.
What are the rules here for acquiring the zspage lock? Do we need to
hold another lock just to make sure the zspage does not go away from
under us? Can we use RCU or something similar to do that instead?
>
> > Seems like it uses ~32 bytes more than rwlock_t on x86_64.
> > That's per zspage. Depending on how many compressed pages we have
> > per-zspage this may not be too bad.
>
> So on a 16GB laptop our memory pressure test at peak used approx 1M zspages.
> That is 32 bytes * 1M ~ 32MB of extra memory use. Not alarmingly a lot,
> less than what a single browser tab needs nowadays. I suppose on 4GB/8GB
> that will be even smaller (because those device generate less zspages).
> Numbers are not the main issue, however.
>
On (25/02/04 17:19), Yosry Ahmed wrote:
> > sizeof(struct zs_page) change is one thing. Another thing is that
> > zspage->lock is taken from atomic sections, pretty much everywhere.
> > compaction/migration write-lock it under pool rwlock and class spinlock,
> > but both compaction and migration now EAGAIN if the lock is locked
> > already, so that is sorted out.
> >
> > The remaining problem is map(), which takes zspage read-lock under pool
> > rwlock. RFC series (which you hated with passion :P) converted all zsmalloc
> > into preemptible ones because of this - zspage->lock is a nested leaf-lock,
> > so it cannot schedule unless locks it's nested under permit it (needless to
> > say neither rwlock nor spinlock permit it).
>
> Hmm, so we want the lock to be preemptible, but we don't want to use an
> existing preemptible lock because it may be held from atomic context.
>
> I think one problem here is that the lock you are introducing is a
> spinning lock but the lock holder can be preempted. This is why spinning
> locks do not allow preemption. Others waiting for the lock can spin
> waiting for a process that is scheduled out.
>
> For example, the compaction/migration code could be sleeping holding the
> write lock, and a map() call would spin waiting for that sleeping task.

write-lock holders cannot sleep, that's the key part.

So the rules are:

1) writer cannot sleep
   - migration/compaction runs in atomic context and grabs
     write-lock only from atomic context
   - write-locking function disables preemption before lock(), just to be
     safe, and enables it after unlock()

2) writer does not spin waiting
   - that's why there is only write_try_lock function
   - compaction and migration bail out when they cannot lock the
     zspage

3) readers can sleep and can spin waiting for a lock
   - other (even preempted) readers don't block new readers
   - writers don't sleep, they always unlock

> I wonder if there's a way to rework the locking instead to avoid the
> nesting. It seems like sometimes we lock the zspage with the pool lock
> held, sometimes with the class lock held, and sometimes with no lock
> held.
>
> What are the rules here for acquiring the zspage lock?

Most of that code is not written by me, but I think the rule is to disable
"migration" be it via pool lock or class lock.

> Do we need to hold another lock just to make sure the zspage does not go
> away from under us?

Yes, the page cannot go away via "normal" path:
	zs_free(last object) -> zspage becomes empty -> free zspage

so when we have active mapping() it's only migration and compaction
that can free zspage (its content is migrated and so it becomes empty).

> Can we use RCU or something similar to do that instead?

Hmm, I don't know... zsmalloc is not "read-mostly", it's whatever data
patterns the clients have. I suspect we'd need to synchronize RCU every
time a zspage is freed: zs_free() [this one is complicated], or migration,
or compaction? Sounds like anti-pattern for RCU?
On Wed, Feb 05, 2025 at 11:43:16AM +0900, Sergey Senozhatsky wrote:
> On (25/02/04 17:19), Yosry Ahmed wrote:
> > > sizeof(struct zs_page) change is one thing. Another thing is that
> > > zspage->lock is taken from atomic sections, pretty much everywhere.
> > > compaction/migration write-lock it under pool rwlock and class spinlock,
> > > but both compaction and migration now EAGAIN if the lock is locked
> > > already, so that is sorted out.
> > >
> > > The remaining problem is map(), which takes zspage read-lock under pool
> > > rwlock. RFC series (which you hated with passion :P) converted all zsmalloc
> > > into preemptible ones because of this - zspage->lock is a nested leaf-lock,
> > > so it cannot schedule unless locks it's nested under permit it (needless to
> > > say neither rwlock nor spinlock permit it).
> >
> > Hmm, so we want the lock to be preemptible, but we don't want to use an
> > existing preemptible lock because it may be held from atomic context.
> >
> > I think one problem here is that the lock you are introducing is a
> > spinning lock but the lock holder can be preempted. This is why spinning
> > locks do not allow preemption. Others waiting for the lock can spin
> > waiting for a process that is scheduled out.
> >
> > For example, the compaction/migration code could be sleeping holding the
> > write lock, and a map() call would spin waiting for that sleeping task.
>
> write-lock holders cannot sleep, that's the key part.
>
> So the rules are:
>
> 1) writer cannot sleep
>    - migration/compaction runs in atomic context and grabs
>      write-lock only from atomic context
>    - write-locking function disables preemption before lock(), just to be
>      safe, and enables it after unlock()
>
> 2) writer does not spin waiting
>    - that's why there is only write_try_lock function
>    - compaction and migration bail out when they cannot lock the
>      zspage
>
> 3) readers can sleep and can spin waiting for a lock
>    - other (even preempted) readers don't block new readers
>    - writers don't sleep, they always unlock

That's useful, thanks. If we go with custom locking we need to document
this clearly and add debug checks where possible.

> > I wonder if there's a way to rework the locking instead to avoid the
> > nesting. It seems like sometimes we lock the zspage with the pool lock
> > held, sometimes with the class lock held, and sometimes with no lock
> > held.
> >
> > What are the rules here for acquiring the zspage lock?
>
> Most of that code is not written by me, but I think the rule is to disable
> "migration" be it via pool lock or class lock.

It seems like we're not holding either of these locks in
async_free_zspage() when we call lock_zspage(). Is it safe for a
different reason?

> > Do we need to hold another lock just to make sure the zspage does not go
> > away from under us?
>
> Yes, the page cannot go away via "normal" path:
> 	zs_free(last object) -> zspage becomes empty -> free zspage
>
> so when we have active mapping() it's only migration and compaction
> that can free zspage (its content is migrated and so it becomes empty).
>
> > Can we use RCU or something similar to do that instead?
>
> Hmm, I don't know... zsmalloc is not "read-mostly", it's whatever data
> patterns the clients have. I suspect we'd need to synchronize RCU every
> time a zspage is freed: zs_free() [this one is complicated], or migration,
> or compaction? Sounds like anti-pattern for RCU?

Can't we use kfree_rcu() instead of synchronizing? Not sure if this
would still be an antipattern tbh.
It just seems like the current locking scheme is really complicated :/
On (25/02/05 19:06), Yosry Ahmed wrote:
> > > For example, the compaction/migration code could be sleeping holding the
> > > write lock, and a map() call would spin waiting for that sleeping task.
> >
> > write-lock holders cannot sleep, that's the key part.
> >
> > So the rules are:
> >
> > 1) writer cannot sleep
> > - migration/compaction runs in atomic context and grabs
> > write-lock only from atomic context
> > - write-locking function disables preemption before lock(), just to be
> > safe, and enables it after unlock()
> >
> > 2) writer does not spin waiting
> > - that's why there is only write_try_lock function
> > - compaction and migration bail out when they cannot lock the
> > zspage
> >
> > 3) readers can sleep and can spin waiting for a lock
> > - other (even preempted) readers don't block new readers
> > - writers don't sleep, they always unlock
>
> That's useful, thanks. If we go with custom locking we need to document
> this clearly and add debug checks where possible.
Sure. That's what it currently looks like (can always improve)
---
/*
* zspage lock permits preemption on the reader-side (there can be multiple
* readers). Writers (exclusive zspage ownership), on the other hand, are
* always run in atomic context and cannot spin waiting for a (potentially
* preempted) reader to unlock zspage. This, basically, means that writers
* can only call write-try-lock and must bail out if it didn't succeed.
*
* At the same time, writers cannot reschedule under zspage write-lock,
* so readers can spin waiting for the writer to unlock zspage.
*/
static void zspage_read_lock(struct zspage *zspage)
{
atomic_t *lock = &zspage->lock;
int old = atomic_read_acquire(lock);
do {
if (old == ZS_PAGE_WRLOCKED) {
cpu_relax();
old = atomic_read_acquire(lock);
continue;
}
} while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
#ifdef CONFIG_DEBUG_LOCK_ALLOC
rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
#endif
}
static void zspage_read_unlock(struct zspage *zspage)
{
atomic_dec_return_release(&zspage->lock);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
rwsem_release(&zspage->lockdep_map, _RET_IP_);
#endif
}
static bool zspage_try_write_lock(struct zspage *zspage)
{
atomic_t *lock = &zspage->lock;
int old = ZS_PAGE_UNLOCKED;
preempt_disable();
if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
#endif
return true;
}
preempt_enable();
return false;
}
static void zspage_write_unlock(struct zspage *zspage)
{
atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
rwsem_release(&zspage->lockdep_map, _RET_IP_);
#endif
preempt_enable();
}
---
Maybe I'll just copy-paste the locking rules list, a list is always cleaner.
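Something along these lines, perhaps (just the rules above folded into a
comment next to the lock helpers):

---
/*
 * zspage locking rules:
 *
 * 1) writer (migration/compaction) cannot sleep
 *    - it runs in atomic context and grabs the write-lock only from
 *      atomic context
 *    - the write-locking function disables preemption before lock()
 *      and enables it after unlock()
 *
 * 2) writer does not spin waiting
 *    - there is only a write-try-lock function; migration and
 *      compaction bail out when they cannot lock the zspage
 *
 * 3) readers can sleep and can spin waiting for the lock
 *    - other (even preempted) readers don't block new readers
 *    - writers don't sleep, they always unlock
 */
---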
> > > I wonder if there's a way to rework the locking instead to avoid the
> > > nesting. It seems like sometimes we lock the zspage with the pool lock
> > > held, sometimes with the class lock held, and sometimes with no lock
> > > held.
> > >
> > > What are the rules here for acquiring the zspage lock?
> >
> > Most of that code is not written by me, but I think the rule is to disable
> > "migration" be it via pool lock or class lock.
>
> It seems like we're not holding either of these locks in
> async_free_zspage() when we call lock_zspage(). Is it safe for a
> different reason?
I think we hold size class lock there. async-free is only for pages that
reached 0 usage ratio (empty fullness group), so they don't hold any
objects any more and from here such zspages either get freed or
find_get_zspage() recovers them from fullness 0 and allocates an object.
Both are synchronized by size class lock.
> > Hmm, I don't know... zsmalloc is not "read-mostly", it's whatever data
> > patterns the clients have. I suspect we'd need to synchronize RCU every
> > time a zspage is freed: zs_free() [this one is complicated], or migration,
> > or compaction? Sounds like anti-pattern for RCU?
>
> Can't we use kfree_rcu() instead of synchronizing? Not sure if this
> would still be an antipattern tbh.
Yeah, I don't know. The last time I wrongly used kfree_rcu() it caused a
27% performance drop (some internal code). This zspage thingy maybe will
be better, but still has a potential to generate high numbers of RCU calls,
depends on the clients. Probably the chances are too high. Apart from
that, kvfree_rcu() can sleep, as far as I understand, so zram might have
some extra things to deal with, namely slot-free notifications which can
be called from softirq, and always called under spinlock:
mm slot-free -> zram slot-free -> zs_free -> empty zspage -> kfree_rcu
> It just seems like the current locking scheme is really complicated :/
That's very true.
On Thu, Feb 06, 2025 at 12:05:55PM +0900, Sergey Senozhatsky wrote:
> On (25/02/05 19:06), Yosry Ahmed wrote:
> > > > For example, the compaction/migration code could be sleeping holding the
> > > > write lock, and a map() call would spin waiting for that sleeping task.
> > >
> > > write-lock holders cannot sleep, that's the key part.
> > >
> > > So the rules are:
> > >
> > > 1) writer cannot sleep
> > > - migration/compaction runs in atomic context and grabs
> > > write-lock only from atomic context
> > > - write-locking function disables preemption before lock(), just to be
> > > safe, and enables it after unlock()
> > >
> > > 2) writer does not spin waiting
> > > - that's why there is only write_try_lock function
> > > - compaction and migration bail out when they cannot lock the
> > > zspage
> > >
> > > 3) readers can sleep and can spin waiting for a lock
> > > - other (even preempted) readers don't block new readers
> > > - writers don't sleep, they always unlock
> >
> > That's useful, thanks. If we go with custom locking we need to document
> > this clearly and add debug checks where possible.
>
> Sure. That's what it currently looks like (can always improve)
>
> ---
> /*
> * zspage lock permits preemption on the reader-side (there can be multiple
> * readers). Writers (exclusive zspage ownership), on the other hand, are
> * always run in atomic context and cannot spin waiting for a (potentially
> * preempted) reader to unlock zspage. This, basically, means that writers
> * can only call write-try-lock and must bail out if it didn't succeed.
> *
> * At the same time, writers cannot reschedule under zspage write-lock,
> * so readers can spin waiting for the writer to unlock zspage.
> */
> static void zspage_read_lock(struct zspage *zspage)
> {
> atomic_t *lock = &zspage->lock;
> int old = atomic_read_acquire(lock);
>
> do {
> if (old == ZS_PAGE_WRLOCKED) {
> cpu_relax();
> old = atomic_read_acquire(lock);
> continue;
> }
> } while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
>
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
> #endif
> }
>
> static void zspage_read_unlock(struct zspage *zspage)
> {
> atomic_dec_return_release(&zspage->lock);
>
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> rwsem_release(&zspage->lockdep_map, _RET_IP_);
> #endif
> }
>
> static bool zspage_try_write_lock(struct zspage *zspage)
> {
> atomic_t *lock = &zspage->lock;
> int old = ZS_PAGE_UNLOCKED;
>
> preempt_disable();
> if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
> #endif
> return true;
> }
>
> preempt_enable();
> return false;
> }
>
> static void zspage_write_unlock(struct zspage *zspage)
> {
> atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> rwsem_release(&zspage->lockdep_map, _RET_IP_);
> #endif
> preempt_enable();
> }
> ---
>
> Maybe I'll just copy-paste the locking rules list, a list is always cleaner.
Thanks. I think it would be nice if we could also get someone with
locking expertise to take a look at this.
>
> > > > I wonder if there's a way to rework the locking instead to avoid the
> > > > nesting. It seems like sometimes we lock the zspage with the pool lock
> > > > held, sometimes with the class lock held, and sometimes with no lock
> > > > held.
> > > >
> > > > What are the rules here for acquiring the zspage lock?
> > >
> > > Most of that code is not written by me, but I think the rule is to disable
> > > "migration" be it via pool lock or class lock.
> >
> > It seems like we're not holding either of these locks in
> > async_free_zspage() when we call lock_zspage(). Is it safe for a
> > different reason?
>
> I think we hold size class lock there. async-free is only for pages that
> reached 0 usage ratio (empty fullness group), so they don't hold any
> objects any more and from her such zspages either get freed or
> find_get_zspage() recovers them from fullness 0 and allocates an object.
> Both are synchronized by size class lock.
>
> > > Hmm, I don't know... zsmalloc is not "read-mostly", it's whatever data
> > > patterns the clients have. I suspect we'd need to synchronize RCU every
> > > time a zspage is freed: zs_free() [this one is complicated], or migration,
> > > or compaction? Sounds like anti-pattern for RCU?
> >
> > Can't we use kfree_rcu() instead of synchronizing? Not sure if this
> > would still be an antipattern tbh.
>
> Yeah, I don't know. The last time I wrongly used kfree_rcu() it caused a
> 27% performance drop (some internal code). This zspage thingy maybe will
> be better, but still has a potential to generate high numbers of RCU calls,
> depends on the clients. Probably the chances are too high. Apart from
> that, kvfree_rcu() can sleep, as far as I understand, so zram might have
> some extra things to deal with, namely slot-free notifications which can
> be called from softirq, and always called under spinlock:
>
> mm slot-free -> zram slot-free -> zs_free -> empty zspage -> kfree_rcu
>
> > It just seems like the current locking scheme is really complicated :/
>
> That's very true.
Seems like we have to compromise either way, custom locking or we enter
into a new complexity realm with RCU freeing.
On (25/02/06 16:19), Yosry Ahmed wrote:
> > static void zspage_read_lock(struct zspage *zspage)
> > {
> > atomic_t *lock = &zspage->lock;
> > int old = atomic_read_acquire(lock);
> >
> > do {
> > if (old == ZS_PAGE_WRLOCKED) {
> > cpu_relax();
> > old = atomic_read_acquire(lock);
> > continue;
> > }
> > } while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
> >
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
> > #endif
> > }
> >
> > static void zspage_read_unlock(struct zspage *zspage)
> > {
> > atomic_dec_return_release(&zspage->lock);
> >
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > rwsem_release(&zspage->lockdep_map, _RET_IP_);
> > #endif
> > }
> >
> > static bool zspage_try_write_lock(struct zspage *zspage)
> > {
> > atomic_t *lock = &zspage->lock;
> > int old = ZS_PAGE_UNLOCKED;
> >
> > preempt_disable();
> > if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
> > #endif
> > return true;
> > }
> >
> > preempt_enable();
> > return false;
> > }
> >
> > static void zspage_write_unlock(struct zspage *zspage)
> > {
> > atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > rwsem_release(&zspage->lockdep_map, _RET_IP_);
> > #endif
> > preempt_enable();
> > }
> > ---
> >
> > Maybe I'll just copy-paste the locking rules list, a list is always cleaner.
>
> Thanks. I think it would be nice if we could also get someone with
> locking expertise to take a look at this.
Sure.
I moved the lockdep acquire/release before atomic ops (except for try),
as was suggested by Sebastian in zram sub-thread.
[..]
> Seems like we have to compromise either way, custom locking or we enter
> into a new complexity realm with RCU freeing.
Let's take the blue pill? :)
On Fri, Feb 07, 2025 at 11:48:55AM +0900, Sergey Senozhatsky wrote:
> On (25/02/06 16:19), Yosry Ahmed wrote:
> > > static void zspage_read_lock(struct zspage *zspage)
> > > {
> > > atomic_t *lock = &zspage->lock;
> > > int old = atomic_read_acquire(lock);
> > >
> > > do {
> > > if (old == ZS_PAGE_WRLOCKED) {
> > > cpu_relax();
> > > old = atomic_read_acquire(lock);
> > > continue;
> > > }
> > > } while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
> > >
> > > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
> > > #endif
> > > }
> > >
> > > static void zspage_read_unlock(struct zspage *zspage)
> > > {
> > > atomic_dec_return_release(&zspage->lock);
> > >
> > > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > rwsem_release(&zspage->lockdep_map, _RET_IP_);
> > > #endif
> > > }
> > >
> > > static bool zspage_try_write_lock(struct zspage *zspage)
> > > {
> > > atomic_t *lock = &zspage->lock;
> > > int old = ZS_PAGE_UNLOCKED;
> > >
> > > preempt_disable();
> > > if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
> > > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
> > > #endif
> > > return true;
> > > }
> > >
> > > preempt_enable();
> > > return false;
> > > }
> > >
> > > static void zspage_write_unlock(struct zspage *zspage)
> > > {
> > > atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
> > > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > rwsem_release(&zspage->lockdep_map, _RET_IP_);
> > > #endif
> > > preempt_enable();
> > > }
> > > ---
> > >
> > > Maybe I'll just copy-paste the locking rules list, a list is always cleaner.
> >
> > Thanks. I think it would be nice if we could also get someone with
> > locking expertise to take a look at this.
>
> Sure.
>
> I moved the lockdep acquire/release before atomic ops (except for try),
> as was suggested by Sebastian in zram sub-thread.
>
> [..]
> > Seems like we have to compromise either way, custom locking or we enter
> > into a new complexity realm with RCU freeing.
>
> Let's take the blue pill? :)
Can we do some perf testing to make sure this custom locking is not
regressing performance (selfishly I'd like some zswap testing too)?
Perhaps Kairui can help with that since he was already testing this
series.
On (25/02/07 21:09), Yosry Ahmed wrote:
> Can we do some perf testing to make sure this custom locking is not
> regressing performance (selfishly I'd like some zswap testing too)?
So for zsmalloc I (usually) write some simple testing code which is
triggered via sysfs (device attr) and that is completely reproducible,
so that I compare apples to apples. In this particular case I just
have a loop that creates objects (we don't need to compress or decompress
anything, zsmalloc doesn't really care)
- echo 1 > /sys/ ... / test_prepare
for (sz = 32; sz < PAGE_SIZE; sz += 64) {
for (i = 0; i < 4096; i++) {
ent->handle = zs_malloc(zram->mem_pool, sz)
list_add(ent)
}
}
And now I just `perf stat` writes:
- perf stat echo 1 > /sys/ ... / test_exec_old
list_for_each_entry
zs_map_object(ent->handle, ZS_MM_RO);
zs_unmap_object(ent->handle)
list_for_each_entry
dst = zs_map_object(ent->handle, ZS_MM_WO);
memcpy(dst, tmpbuf, ent->sz)
zs_unmap_object(ent->handle)
- perf stat echo 1 > /sys/ ... / test_exec_new
list_for_each_entry
dst = zs_obj_read_begin(ent->handle, loc);
zs_obj_read_end(ent->handle, dst);
list_for_each_entry
zs_obj_write(ent->handle, tmpbuf, ent->sz);
- echo 1 > /sys/ ... / test_finish
free all handles and ent-s
The nice part is that we don't depend on any of the upper layers, we
don't even need to compress/decompress anything; we allocate objects
of required sizes and memcpy static data there (zsmalloc doesn't have
any opinion on that) and that's pretty much it.
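For reference, the test functions are roughly as follows (a simplified
sketch: the entry struct, GFP flags and error handling are illustrative,
the sysfs/device-attr plumbing is omitted, and zs_obj_read_begin()/
zs_obj_read_end()/zs_obj_write() are the calls this series adds):

---
struct test_ent {
	struct list_head entry;
	unsigned long handle;
	size_t sz;
};

static LIST_HEAD(test_list);

static void test_prepare(struct zram *zram)
{
	size_t sz;
	int i;

	for (sz = 32; sz < PAGE_SIZE; sz += 64) {
		for (i = 0; i < 4096; i++) {
			struct test_ent *ent;

			ent = kmalloc(sizeof(*ent), GFP_KERNEL);
			ent->sz = sz;
			ent->handle = zs_malloc(zram->mem_pool, sz, GFP_KERNEL);
			list_add(&ent->entry, &test_list);
		}
	}
}

/* old API: map, (optionally) memcpy, unmap */
static void test_exec_old(struct zram *zram, void *tmpbuf)
{
	struct test_ent *ent;
	void *dst;

	list_for_each_entry(ent, &test_list, entry) {
		/* RO mapping, the data itself is not used */
		zs_map_object(zram->mem_pool, ent->handle, ZS_MM_RO);
		zs_unmap_object(zram->mem_pool, ent->handle);
	}

	list_for_each_entry(ent, &test_list, entry) {
		dst = zs_map_object(zram->mem_pool, ent->handle, ZS_MM_WO);
		memcpy(dst, tmpbuf, ent->sz);
		zs_unmap_object(zram->mem_pool, ent->handle);
	}
}

/*
 * test_exec_new runs the same two loops via zs_obj_read_begin()/
 * zs_obj_read_end() and zs_obj_write(), and test_finish frees all
 * handles and entries.
 */
---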
OLD API
=======
10 runs
369,205,778 instructions # 0.80 insn per cycle
40,467,926 branches # 113.732 M/sec
369,002,122 instructions # 0.62 insn per cycle
40,426,145 branches # 189.361 M/sec
369,051,170 instructions # 0.45 insn per cycle
40,434,677 branches # 157.574 M/sec
369,014,522 instructions # 0.63 insn per cycle
40,427,754 branches # 201.464 M/sec
369,019,179 instructions # 0.64 insn per cycle
40,429,327 branches # 198.321 M/sec
368,973,095 instructions # 0.64 insn per cycle
40,419,245 branches # 234.210 M/sec
368,950,705 instructions # 0.64 insn per cycle
40,414,305 branches # 231.460 M/sec
369,041,288 instructions # 0.46 insn per cycle
40,432,599 branches # 155.576 M/sec
368,964,080 instructions # 0.67 insn per cycle
40,417,025 branches # 245.665 M/sec
369,036,706 instructions # 0.63 insn per cycle
40,430,860 branches # 204.105 M/sec
NEW API
=======
10 runs
265,799,293 instructions # 0.51 insn per cycle
29,834,567 branches # 170.281 M/sec
265,765,970 instructions # 0.55 insn per cycle
29,829,019 branches # 161.602 M/sec
265,764,702 instructions # 0.51 insn per cycle
29,828,015 branches # 189.677 M/sec
265,836,506 instructions # 0.38 insn per cycle
29,840,650 branches # 124.237 M/sec
265,836,061 instructions # 0.36 insn per cycle
29,842,285 branches # 137.670 M/sec
265,887,080 instructions # 0.37 insn per cycle
29,852,881 branches # 126.060 M/sec
265,769,869 instructions # 0.57 insn per cycle
29,829,873 branches # 210.157 M/sec
265,803,732 instructions # 0.58 insn per cycle
29,835,391 branches # 186.940 M/sec
265,766,624 instructions # 0.58 insn per cycle
29,827,537 branches # 212.609 M/sec
265,843,597 instructions # 0.57 insn per cycle
29,843,650 branches # 171.877 M/sec
x old-api-insn
+ new-api-insn
+-------------------------------------------------------------------------------------+
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|+ x|
|A A|
+-------------------------------------------------------------------------------------+
N Min Max Median Avg Stddev
x 10 3.689507e+08 3.6920578e+08 3.6901918e+08 3.6902586e+08 71765.519
+ 10 2.657647e+08 2.6588708e+08 2.6580373e+08 2.6580734e+08 42187.024
Difference at 95.0% confidence
-1.03219e+08 +/- 55308.7
-27.9705% +/- 0.0149878%
(Student's t, pooled s = 58864.4)
> Perhaps Kairui can help with that since he was already testing this
> series.
Yeah, would be great.
On Wed, Feb 12, 2025 at 02:00:26PM +0900, Sergey Senozhatsky wrote:
> On (25/02/07 21:09), Yosry Ahmed wrote:
> > Can we do some perf testing to make sure this custom locking is not
> > regressing performance (selfishly I'd like some zswap testing too)?
>
> So for zsmalloc I (usually) write some simple testing code which is
> triggered via sysfs (device attr) and that is completely reproducible,
> so that I compares apples to apples. In this particular case I just
> have a loop that creates objects (we don't need to compress or decompress
> anything, zsmalloc doesn't really care)
>
> - echo 1 > /sys/ ... / test_prepare
>
> for (sz = 32; sz < PAGE_SIZE; sz += 64) {
> for (i = 0; i < 4096; i++) {
> ent->handle = zs_malloc(zram->mem_pool, sz)
> list_add(ent)
> }
> }
>
>
> And now I just `perf stat` writes:
>
> - perf stat echo 1 > /sys/ ... / test_exec_old
>
> list_for_each_entry
> zs_map_object(ent->handle, ZS_MM_RO);
> zs_unmap_object(ent->handle)
>
> list_for_each_entry
> dst = zs_map_object(ent->handle, ZS_MM_WO);
> memcpy(dst, tmpbuf, ent->sz)
> zs_unmap_object(ent->handle)
>
>
>
> - perf stat echo 1 > /sys/ ... / test_exec_new
>
> list_for_each_entry
> dst = zs_obj_read_begin(ent->handle, loc);
> zs_obj_read_end(ent->handle, dst);
>
> list_for_each_entry
> zs_obj_write(ent->handle, tmpbuf, ent->sz);
>
>
> - echo 1 > /sys/ ... / test_finish
>
> free all handles and ent-s
>
>
> The nice part is that we don't depend on any of the upper layers, we
> don't even need to compress/decompress anything; we allocate objects
> of required sizes and memcpy static data there (zsmalloc doesn't have
> any opinion on that) and that's pretty much it.
>
>
> OLD API
> =======
>
> 10 runs
>
> 369,205,778 instructions # 0.80 insn per cycle
> 40,467,926 branches # 113.732 M/sec
>
> 369,002,122 instructions # 0.62 insn per cycle
> 40,426,145 branches # 189.361 M/sec
>
> 369,051,170 instructions # 0.45 insn per cycle
> 40,434,677 branches # 157.574 M/sec
>
> 369,014,522 instructions # 0.63 insn per cycle
> 40,427,754 branches # 201.464 M/sec
>
> 369,019,179 instructions # 0.64 insn per cycle
> 40,429,327 branches # 198.321 M/sec
>
> 368,973,095 instructions # 0.64 insn per cycle
> 40,419,245 branches # 234.210 M/sec
>
> 368,950,705 instructions # 0.64 insn per cycle
> 40,414,305 branches # 231.460 M/sec
>
> 369,041,288 instructions # 0.46 insn per cycle
> 40,432,599 branches # 155.576 M/sec
>
> 368,964,080 instructions # 0.67 insn per cycle
> 40,417,025 branches # 245.665 M/sec
>
> 369,036,706 instructions # 0.63 insn per cycle
> 40,430,860 branches # 204.105 M/sec
>
>
> NEW API
> =======
>
> 10 runs
>
> 265,799,293 instructions # 0.51 insn per cycle
> 29,834,567 branches # 170.281 M/sec
>
> 265,765,970 instructions # 0.55 insn per cycle
> 29,829,019 branches # 161.602 M/sec
>
> 265,764,702 instructions # 0.51 insn per cycle
> 29,828,015 branches # 189.677 M/sec
>
> 265,836,506 instructions # 0.38 insn per cycle
> 29,840,650 branches # 124.237 M/sec
>
> 265,836,061 instructions # 0.36 insn per cycle
> 29,842,285 branches # 137.670 M/sec
>
> 265,887,080 instructions # 0.37 insn per cycle
> 29,852,881 branches # 126.060 M/sec
>
> 265,769,869 instructions # 0.57 insn per cycle
> 29,829,873 branches # 210.157 M/sec
>
> 265,803,732 instructions # 0.58 insn per cycle
> 29,835,391 branches # 186.940 M/sec
>
> 265,766,624 instructions # 0.58 insn per cycle
> 29,827,537 branches # 212.609 M/sec
>
> 265,843,597 instructions # 0.57 insn per cycle
> 29,843,650 branches # 171.877 M/sec
>
>
> x old-api-insn
> + new-api-insn
> +-------------------------------------------------------------------------------------+
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |+ x|
> |A A|
> +-------------------------------------------------------------------------------------+
> N Min Max Median Avg Stddev
> x 10 3.689507e+08 3.6920578e+08 3.6901918e+08 3.6902586e+08 71765.519
> + 10 2.657647e+08 2.6588708e+08 2.6580373e+08 2.6580734e+08 42187.024
> Difference at 95.0% confidence
> -1.03219e+08 +/- 55308.7
> -27.9705% +/- 0.0149878%
> (Student's t, pooled s = 58864.4)
Thanks for sharing these results, but I wonder if this will capture
regressions from locking changes (e.g. a lock being preemptible)? IIUC
this is counting the instructions executed in these paths, and that
won't change if the task gets preempted. Lock contention may be captured
as extra instructions, but I am not sure we'll directly see its effect
in terms of serialization and delays.
I think we also need some high level testing (e.g. concurrent
swapins/swapouts) to find that out. I think that's what Kairui's testing
covers.
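For reference, the old-API and new-API loops quoted above map onto roughly
the sketch below. This is only an illustrative reconstruction, not the actual
test module: struct test_ent, test_list, tmpbuf and local_copy are made-up
names, and the zs_obj_read_begin()/zs_obj_read_end()/zs_obj_write()
prototypes (including the pool argument) are assumed to follow the rest of
this series.

#include <linux/list.h>
#include <linux/string.h>
#include <linux/zsmalloc.h>

/* Hypothetical bookkeeping entry; only the zsmalloc calls come from the thread. */
struct test_ent {
        struct list_head node;
        unsigned long handle;
        size_t sz;
};

static LIST_HEAD(test_list);

/* test_exec_old: the zs_map_object()/zs_unmap_object() loops */
static void test_exec_old(struct zs_pool *pool, void *tmpbuf)
{
        struct test_ent *ent;
        void *dst;

        list_for_each_entry(ent, &test_list, node) {
                /* RO mapping: only the map/unmap cost is of interest */
                zs_map_object(pool, ent->handle, ZS_MM_RO);
                zs_unmap_object(pool, ent->handle);
        }

        list_for_each_entry(ent, &test_list, node) {
                /* WO mapping: copy static payload into the object */
                dst = zs_map_object(pool, ent->handle, ZS_MM_WO);
                memcpy(dst, tmpbuf, ent->sz);
                zs_unmap_object(pool, ent->handle);
        }
}

/* test_exec_new: the zs_obj_read_begin()/read_end() and zs_obj_write() loops */
static void test_exec_new(struct zs_pool *pool, void *tmpbuf, void *local_copy)
{
        struct test_ent *ent;
        void *src;

        list_for_each_entry(ent, &test_list, node) {
                src = zs_obj_read_begin(pool, ent->handle, local_copy);
                zs_obj_read_end(pool, ent->handle, src);
        }

        list_for_each_entry(ent, &test_list, node)
                zs_obj_write(pool, ent->handle, tmpbuf, ent->sz);
}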
On (25/02/12 15:35), Yosry Ahmed wrote:
> > Difference at 95.0% confidence
> > -1.03219e+08 +/- 55308.7
> > -27.9705% +/- 0.0149878%
> > (Student's t, pooled s = 58864.4)
>
> Thanks for sharing these results, but I wonder if this will capture
> regressions from locking changes (e.g. a lock being preemptible)? IIUC
> this is counting the instructions executed in these paths, and that
> won't change if the task gets preempted. Lock contention may be captured
> as extra instructions, but I am not sure we'll directly see its effect
> in terms of serialization and delays.

Yeah..

> I think we also need some high level testing (e.g. concurrent
> swapins/swapouts) to find that out. I think that's what Kairui's testing
> covers.

I do a fair amount of high-level testing: heavy parallel (make -j36 and
parallel dd) workloads (multiple zram devices configuration - zram0 ext4,
zram1 writeback device, zram2 swap) w/ and w/o lockdep. In addition I also
run these workloads under heavy memory pressure (a 4GB VM), when oom-killer
starts to run around with a pair of scissors. But it's mostly regression
testing.
February 12, 2025 at 6:18 PM, "Sergey Senozhatsky" <senozhatsky@chromium.org> wrote:
> On (25/02/12 15:35), Yosry Ahmed wrote:
> > > Difference at 95.0% confidence
> > > -1.03219e+08 +/- 55308.7
> > > -27.9705% +/- 0.0149878%
> > > (Student's t, pooled s = 58864.4)
> >
> > Thanks for sharing these results, but I wonder if this will capture
> > regressions from locking changes (e.g. a lock being preemptible)? IIUC
> > this is counting the instructions executed in these paths, and that
> > won't change if the task gets preempted. Lock contention may be captured
> > as extra instructions, but I am not sure we'll directly see its effect
> > in terms of serialization and delays.
>
> Yeah..
>
> > I think we also need some high level testing (e.g. concurrent
> > swapins/swapouts) to find that out. I think that's what Kairui's testing
> > covers.
>
> I do a fair amount of high-level testing: heavy parallel (make -j36 and
> parallel dd) workloads (multiple zram devices configuration - zram0 ext4,
> zram1 writeback device, zram2 swap) w/ and w/o lockdep. In addition I also
> run these workloads under heavy memory pressure (a 4GB VM), when oom-killer
> starts to run around with a pair of scissors. But it's mostly regression
> testing.

If we can get some numbers from these parallel workloads that would be
better than the perf stats imo.
On (25/02/13 02:57), Yosry Ahmed wrote:
> > > I think we also need some high level testing (e.g. concurrent
> > >
> > > swapins/swapouts) to find that out. I think that's what Kairui's testing
> > >
> > > covers.
> > >
> >
> > I do a fair amount of high-level testing: heavy parallel (make -j36 and
> >
> > parallel dd) workloads (multiple zram devices configuration - zram0 ext4,
> >
> > zram1 writeback device, zram2 swap) w/ and w/o lockdep. In addition I also
> >
> > run these workloads under heavy memory pressure (a 4GB VM), when oom-killer
> >
> > starts to run around with a pair of scissors. But it's mostly regression
> >
> > testing.
> >
// JFI it seems your email client/service for some reason injects a lot
// of empty lines
> If we can get some numbers from these parallel workloads that would be better than the perf stats imo.
make -j24 CONFIG_PREEMPT
BASE
====
1363.64user 157.08system 1:30.89elapsed 1673%CPU (0avgtext+0avgdata 825692maxresident)k
lock stats
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 10001 702081 0.14 104.74 125571.64 0.18
&class->lock: 1 1 0.25 0.25 0.25 0.25 6320 840542 0.06 809.72 191214.87 0.23
&zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 6452 664129 0.12 660.24 201888.61 0.30
&zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1716362 3096466 0.07 811.10 365551.24 0.12
&zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664129 1.68 1004.80 14853571.32 22.37
PATCHED
=======
1366.50user 154.89system 1:30.33elapsed 1684%CPU (0avgtext+0avgdata 825692maxresident)k
lock stats
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 3648 701979 0.12 44.09 107333.02 0.15
&class->lock: 0 0 0.00 0.00 0.00 0.00 5038 840434 0.06 1245.90 211814.60 0.25
zsmalloc-page-R: 0 0 0.00 0.00 0.00 0.00 0 664078 0.05 699.35 236641.75 0.36
zram-entry->lock: 0 0 0.00 0.00 0.00 0.00 0 3098328 0.06 2987.02 313339.11 0.10
&per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 664078 1.77 7071.30 14838397.61 22.34
On (25/02/13 16:21), Sergey Senozhatsky wrote:
> BASE
> ====
>
> 1363.64user 157.08system 1:30.89elapsed 1673%CPU (0avgtext+0avgdata 825692maxresident)k
>
> lock stats
>
> class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> &pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 10001 702081 0.14 104.74 125571.64 0.18
> &class->lock: 1 1 0.25 0.25 0.25 0.25 6320 840542 0.06 809.72 191214.87 0.23
> &zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 6452 664129 0.12 660.24 201888.61 0.30
> &zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1716362 3096466 0.07 811.10 365551.24 0.12
> &zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664129 1.68 1004.80 14853571.32 22.37
>
> PATCHED
> =======
>
> 1366.50user 154.89system 1:30.33elapsed 1684%CPU (0avgtext+0avgdata 825692maxresident)k
>
> lock stats
>
> class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> &pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 3648 701979 0.12 44.09 107333.02 0.15
> &class->lock: 0 0 0.00 0.00 0.00 0.00 5038 840434 0.06 1245.90 211814.60 0.25
> zsmalloc-page-R: 0 0 0.00 0.00 0.00 0.00 0 664078 0.05 699.35 236641.75 0.36
> zram-entry->lock: 0 0 0.00 0.00 0.00 0.00 0 3098328 0.06 2987.02 313339.11 0.10
> &per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 664078 1.77 7071.30 14838397.61 22.34
So...
I added lock-stat handling to zspage->lock and to zram (in zram it's only
trylock that we can track, but it doesn't really bother me). I also
renamed zsmalloc-page-R to old zspage->lock-R and zram-entry->lock to
old zram->table[index].lock, just in case anyone cares.
Now the bounce stats for zspage->lock and zram->table[index].lock look
pretty much like in the BASE case.
PATCHED
=======
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 2702 703841 0.22 873.90 197110.49 0.28
&class->lock: 0 0 0.00 0.00 0.00 0.00 4590 842336 0.10 3329.63 256595.70 0.30
zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 4750 665011 0.08 3360.60 258402.21 0.39
zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1722291 3099346 0.12 6943.09 721282.34 0.23
&per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 665011 2.84 7062.18 14896206.16 22.40
On Thu, Feb 13, 2025 at 05:22:20PM +0900, Sergey Senozhatsky wrote:
> On (25/02/13 16:21), Sergey Senozhatsky wrote:
> > BASE
> > ====
> >
> > 1363.64user 157.08system 1:30.89elapsed 1673%CPU (0avgtext+0avgdata 825692maxresident)k
> >
> > lock stats
> >
> > class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> > &pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 10001 702081 0.14 104.74 125571.64 0.18
> > &class->lock: 1 1 0.25 0.25 0.25 0.25 6320 840542 0.06 809.72 191214.87 0.23
> > &zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 6452 664129 0.12 660.24 201888.61 0.30
> > &zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1716362 3096466 0.07 811.10 365551.24 0.12
> > &zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664129 1.68 1004.80 14853571.32 22.37
> >
> > PATCHED
> > =======
> >
> > 1366.50user 154.89system 1:30.33elapsed 1684%CPU (0avgtext+0avgdata 825692maxresident)k
> >
> > lock stats
> >
> > class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> > &pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 3648 701979 0.12 44.09 107333.02 0.15
> > &class->lock: 0 0 0.00 0.00 0.00 0.00 5038 840434 0.06 1245.90 211814.60 0.25
> > zsmalloc-page-R: 0 0 0.00 0.00 0.00 0.00 0 664078 0.05 699.35 236641.75 0.36
> > zram-entry->lock: 0 0 0.00 0.00 0.00 0.00 0 3098328 0.06 2987.02 313339.11 0.10
> > &per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 664078 1.77 7071.30 14838397.61 22.34
>
> So...
>
> I added lock-stat handling to zspage->lock and to zram (in zram it's only
> trylock that we can track, but it doesn't really bother me). I also
> renamed zsmalloc-page-R to old zspage->lock-R and zram-entry->lock to
> old zram->table[index].lock, just in case anyone cares.
>
> Now the bounce stats for zspage->lock and zram->table[index].lock look
> pretty much like in the BASE case.
>
> PATCHED
> =======
>
> class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> &pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 2702 703841 0.22 873.90 197110.49 0.28
> &class->lock: 0 0 0.00 0.00 0.00 0.00 4590 842336 0.10 3329.63 256595.70 0.30
> zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 4750 665011 0.08 3360.60 258402.21 0.39
> zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1722291 3099346 0.12 6943.09 721282.34 0.23
> &per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 665011 2.84 7062.18 14896206.16 22.40

holdtime-max and holdtime-total are higher in the patched kernel. Not sure
if this is just an artifact of lock holders being preemptible.
On (25/02/13 15:25), Yosry Ahmed wrote:
> On Thu, Feb 13, 2025 at 05:22:20PM +0900, Sergey Senozhatsky wrote:
> > On (25/02/13 16:21), Sergey Senozhatsky wrote:
> > > BASE
> > > ====
> > >
> > > 1363.64user 157.08system 1:30.89elapsed 1673%CPU (0avgtext+0avgdata 825692maxresident)k
> > >
> > > lock stats
> > >
> > > class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> > > &pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 10001 702081 0.14 104.74 125571.64 0.18
> > > &class->lock: 1 1 0.25 0.25 0.25 0.25 6320 840542 0.06 809.72 191214.87 0.23
> > > &zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 6452 664129 0.12 660.24 201888.61 0.30
> > > &zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1716362 3096466 0.07 811.10 365551.24 0.12
> > > &zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664129 1.68 1004.80 14853571.32 22.37
> > >
> > > PATCHED
> > > =======
> > >
> > > 1366.50user 154.89system 1:30.33elapsed 1684%CPU (0avgtext+0avgdata 825692maxresident)k
> > >
> > > lock stats
> > >
> > > class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> > > &pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 3648 701979 0.12 44.09 107333.02 0.15
> > > &class->lock: 0 0 0.00 0.00 0.00 0.00 5038 840434 0.06 1245.90 211814.60 0.25
> > > zsmalloc-page-R: 0 0 0.00 0.00 0.00 0.00 0 664078 0.05 699.35 236641.75 0.36
> > > zram-entry->lock: 0 0 0.00 0.00 0.00 0.00 0 3098328 0.06 2987.02 313339.11 0.10
> > > &per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 664078 1.77 7071.30 14838397.61 22.34
> >
> > So...
> >
> > I added lock-stat handling to zspage->lock and to zram (in zram it's only
> > trylock that we can track, but it doesn't really bother me). I also
> > renamed zsmalloc-page-R to old zspage->lock-R and zram-entry->lock to
> > old zram->table[index].lock, just in case anyone cares.
> >
> > Now the bounce stats for zspage->lock and zram->table[index].lock look
> > pretty much like in the BASE case.
> >
> > PATCHED
> > =======
> >
> > class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
> > &pool->lock#3-R: 0 0 0.00 0.00 0.00 0.00 2702 703841 0.22 873.90 197110.49 0.28
> > &class->lock: 0 0 0.00 0.00 0.00 0.00 4590 842336 0.10 3329.63 256595.70 0.30
> > zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 4750 665011 0.08 3360.60 258402.21 0.39
> > zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1722291 3099346 0.12 6943.09 721282.34 0.23
> > &per_cpu_ptr(comp->stream, cpu)->lock: 0 0 0.00 0.00 0.00 0.00 23 665011 2.84 7062.18 14896206.16 22.40
> >
>
> holdtime-max and holdtime-total are higher in the patched kernel. Not
> sure if this is just an artifact of lock holders being preemptible.
Hmm, pool->lock shouldn't be affected at all, yet BASE holds it much
longer than PATCHED:

              holdtime-max    holdtime-total
BASE          104.74          125571.64
PATCHED       44.09           107333.02

That doesn't make sense. I can understand zspage->lock and
zram->table[index].lock changing, but for zram->table[index].lock the
numbers look strange as well (comparing run #1 and #2):

              holdtime-total
BASE          365551.24
PATCHED       313339.11

And run #3 is in its own league.

Very likely this is just a very, very bad way to test things.
Re-based on 6.14.0-rc2-next-20250213.
BASE
====
PREEMPT_NONE
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 3624 702276 0.15 35.96 126562.90 0.18
&class->lock: 0 0 0.00 0.00 0.00 0.00 5084 840733 0.06 795.26 183238.22 0.22
&zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 5358 664228 0.12 43.71 192732.71 0.29
&zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1528645 3095862 0.07 764.76 370881.23 0.12
&zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664228 2.52 2033.81 14605911.45 21.99
PREEMPT_VOLUNTARY
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 3039 699556 0.14 50.78 125553.59 0.18
&class->lock: 0 0 0.00 0.00 0.00 0.00 5259 838005 0.06 943.43 177108.05 0.21
&zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 5581 664096 0.12 81.56 190235.48 0.29
&zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1731706 3098570 0.07 796.87 366934.54 0.12
&zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664096 3.38 5074.72 14472697.91 21.79
PREEMPT
class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
&pool->migrate_lock-R: 0 0 0.00 0.00 0.00 0.00 2545 701827 0.14 773.56 125463.37 0.18
&class->lock: 0 0 0.00 0.00 0.00 0.00 4697 840281 0.06 1701.18 231657.38 0.28
&zspage->lock-R: 0 0 0.00 0.00 0.00 0.00 4778 664002 0.12 755.62 181215.17 0.27
&zram->table[index].lock: 0 0 0.00 0.00 0.00 0.00 1731737 3096937 0.07 1703.92 384633.29 0.12
&zstrm->lock: 0 0 0.00 0.00 0.00 0.00 0 664002 2.85 3603.20 14586900.58 21.97
So somehow holdtime-max for the per-CPU stream lock is 2.5x higher for
PREEMPT_VOLUNTARY than for PREEMPT_NONE. And class->lock holdtime-total is
much higher for PREEMPT than for any other preemption model. And that's the
BASE kernel, which runs fully atomic zsmalloc and zram. I call this rubbish.
On (25/02/06 12:05), Sergey Senozhatsky wrote:
>
> Sure. That's what it currently looks like (can always improve)
>
- added __must_check
- added preemptible() check // just in case
- added locking rules list
Oh, and also switched to acquire/release semantics, like you suggested
a couple of days ago.
---
/*
 * zspage locking rules:
 *
 * 1) writer-lock is exclusive
 *
 * 2) writer-lock owner cannot sleep
 *
 * 3) writer-lock owner cannot spin waiting for the lock
 *    - caller (e.g. compaction and migration) must check return value and
 *      handle locking failures
 *    - there is only TRY variant of writer-lock function
 *
 * 4) reader-lock owners (multiple) can sleep
 *
 * 5) reader-lock owners can spin waiting for the lock, in any context
 *    - existing readers (even preempted ones) don't block new readers
 *    - writer-lock owners never sleep, always unlock at some point
 */
static void zspage_read_lock(struct zspage *zspage)
{
        atomic_t *lock = &zspage->lock;
        int old = atomic_read_acquire(lock);

        do {
                if (old == ZS_PAGE_WRLOCKED) {
                        cpu_relax();
                        old = atomic_read_acquire(lock);
                        continue;
                }
        } while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
#endif
}

static void zspage_read_unlock(struct zspage *zspage)
{
        atomic_dec_return_release(&zspage->lock);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        rwsem_release(&zspage->lockdep_map, _RET_IP_);
#endif
}

static __must_check bool zspage_try_write_lock(struct zspage *zspage)
{
        atomic_t *lock = &zspage->lock;
        int old = ZS_PAGE_UNLOCKED;

        WARN_ON_ONCE(preemptible());

        preempt_disable();
        if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
                rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
#endif
                return true;
        }

        preempt_enable();
        return false;
}

static void zspage_write_unlock(struct zspage *zspage)
{
        atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        rwsem_release(&zspage->lockdep_map, _RET_IP_);
#endif
        preempt_enable();
}
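To make rules 1)-5) above concrete, here is a minimal caller sketch.
reader_path() and writer_path() are hypothetical names; in the series the
actual readers are the object map/read paths and the only writers are
migration and compaction.

/*
 * Reader side: may be called from preemptible context and may even be
 * preempted while holding the read-lock; existing readers never block
 * new readers.
 */
static void reader_path(struct zspage *zspage)
{
        zspage_read_lock(zspage);
        /* access objects backed by this zspage */
        zspage_read_unlock(zspage);
}

/*
 * Writer side: must already run in atomic context (preemption disabled),
 * so it cannot spin on a possibly preempted reader - it try-locks and
 * bails out on failure.
 */
static bool writer_path(struct zspage *zspage)
{
        if (!zspage_try_write_lock(zspage))
                return false;   /* zspage is busy, caller backs off */

        /* migrate/compact objects of this zspage */

        zspage_write_unlock(zspage);
        return true;
}

The asymmetry is intentional: because readers can be preempted under the
lock, writers must never wait for them; because writers never schedule
under the lock, readers can safely spin on ZS_PAGE_WRLOCKED.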
On (25/02/03 12:13), Sergey Senozhatsky wrote:
> > Just my 2c.
>
> Perhaps we can sprinkle some lockdep on it.
I forgot to call rwsem_release() in zspage_write_unlock() and that
triggered lockdep :)
---
mm/zsmalloc.c | 27 ++++++++++++++++++++++++++-
1 file changed, 26 insertions(+), 1 deletion(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 956445f4d554..1d4700e457d4 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -308,6 +308,10 @@ struct zspage {
struct list_head list; /* fullness list */
struct zs_pool *pool;
atomic_t lock;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map lockdep_map;
+#endif
};
struct mapping_area {
@@ -319,6 +323,12 @@ struct mapping_area {
static void zspage_lock_init(struct zspage *zspage)
{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key key;
+
+ lockdep_init_map(&zspage->lockdep_map, "zsmalloc-page", &key, 0);
+#endif
+
atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
}
@@ -344,11 +354,19 @@ static void zspage_read_lock(struct zspage *zspage)
continue;
}
} while (!atomic_try_cmpxchg(lock, &old, old + 1));
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
+#endif
}
static void zspage_read_unlock(struct zspage *zspage)
{
atomic_dec(&zspage->lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_release(&zspage->lockdep_map, _RET_IP_);
+#endif
}
static bool zspage_try_write_lock(struct zspage *zspage)
@@ -357,8 +375,12 @@ static bool zspage_try_write_lock(struct zspage *zspage)
int old = ZS_PAGE_UNLOCKED;
preempt_disable();
- if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED))
+ if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) {
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
+#endif
return true;
+ }
preempt_enable();
return false;
@@ -367,6 +389,9 @@ static bool zspage_try_write_lock(struct zspage *zspage)
static void zspage_write_unlock(struct zspage *zspage)
{
atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_release(&zspage->lockdep_map, _RET_IP_);
+#endif
preempt_enable();
}
--
2.48.1.362.g079036d154-goog