From nobody Wed Apr 15 19:25:06 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id EB42DC433FE for ; Thu, 17 Nov 2022 16:39:21 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S240401AbiKQQjU (ORCPT ); Thu, 17 Nov 2022 11:39:20 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:47744 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S240637AbiKQQiz (ORCPT ); Thu, 17 Nov 2022 11:38:55 -0500 Received: from mail-pl1-x632.google.com (mail-pl1-x632.google.com [IPv6:2607:f8b0:4864:20::632]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 4D29E7818B for ; Thu, 17 Nov 2022 08:38:43 -0800 (PST) Received: by mail-pl1-x632.google.com with SMTP id y4so2133599plb.2 for ; Thu, 17 Nov 2022 08:38:43 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=TdXYXXujE5c6Z8uTdZ5KokARMRQa3FrJU+bkNzs2RLY=; b=oUdeLYBGlvLG123q8GqQX48SG762gZkHyxwD/CQnrBoB4Vh1+p2Ck7lVhiNkqnu5q0 du9Xij/rT2JVxeyh9jH5BAwEhVln1dU0S3KA2SfSRSnQQG0IOlwjQHR7toI9DdBaEbeQ 9B1MH+AyYhr8Zxcxdi22Wm8dI5rSEieLZCPSN2sPD16X47ZsCxyHQp4Avg6axUIlyOim PBuesYanmIb4IOmdKWe6NhkCBviHMSzeYwiqsNlw1dNcmsQZP963LulvMKKMZhxHI5Rv bJlQHezRy1lbHMPvTEV9CPvM/QW6fWHqSCqpAhiY34OIqcdl621KD7NG/zqRW5tQZWJP Z6tg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=TdXYXXujE5c6Z8uTdZ5KokARMRQa3FrJU+bkNzs2RLY=; b=XohBn0zuMJZjEceSSX2c2B/PBdD6spgKptGeFwpPnEbrCtngG3eFHTRhTfpWB5W5vr 
K5u/T+PGZHsRR6/zubNST8/nVZ8E0H5ee24BBsu060u8Qcb+nmfm1/GiDOgffbR40N0l 8F3zMp6ox+/hhNJEphiOy0Gw8YfLMweXsia0og/QX0eZ+ZXY/B0Ud7m9d+wyI6TnQWbo c6nEj7X+koDY+tvYeSxcXsMPzlSQJlSAAnBZ/YZ55B8lUcDpbtmyBaut95iv09FdJlzL 790Y+JSFU+7/8U8in5I4M/3BC+Hsoh9vdbMtzJwq1WF3Z27sTwVx0b9Us4lt8Sn2tUPH dMPw== X-Gm-Message-State: ANoB5pnoJjaok/68kN99W+B5jBaqgSAUIbE/mTz2E5doIHNmnA5M8fx5 v0QT4RdWe6dHAHWiXBbgkNMYIYjllGavCg== X-Google-Smtp-Source: AA0mqf4dCzk0UaidJ1LEguhOxPNhCf2SFFc2QYWKmgSXtPv/ga+VnLB8+OZZUwB8M6zR7N1hzOWP/g== X-Received: by 2002:a17:902:c409:b0:186:fd58:c706 with SMTP id k9-20020a170902c40900b00186fd58c706mr3658532plk.4.1668703122572; Thu, 17 Nov 2022 08:38:42 -0800 (PST) Received: from localhost (fwdproxy-prn-000.fbsv.net. [2a03:2880:ff::face:b00c]) by smtp.gmail.com with ESMTPSA id y65-20020a626444000000b0056d73ef41fdsm1377275pfb.75.2022.11.17.08.38.41 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 17 Nov 2022 08:38:42 -0800 (PST) From: Nhat Pham To: akpm@linux-foundation.org Cc: hannes@cmpxchg.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, minchan@kernel.org, ngupta@vflare.org, senozhatsky@chromium.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com Subject: [PATCH v4 1/5] zswap: fix writeback lock ordering for zsmalloc Date: Thu, 17 Nov 2022 08:38:35 -0800 Message-Id: <20221117163839.230900-2-nphamcs@gmail.com> X-Mailer: git-send-email 2.30.2 In-Reply-To: <20221117163839.230900-1-nphamcs@gmail.com> References: <20221117163839.230900-1-nphamcs@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Johannes Weiner zswap's customary lock order is tree->lock before pool->lock, because the tree->lock protects the entries' refcount, and the free callbacks in the backends acquire their respective pool locks to dispatch the backing object. 
zsmalloc's map callback takes the pool lock, so zswap must not grab the tree->lock while a handle is mapped. This currently only happens during writeback, which isn't implemented for zsmalloc. In preparation for it, move the tree->lock section out of the mapped entry section Signed-off-by: Johannes Weiner Signed-off-by: Nhat Pham --- mm/zswap.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 2d48fd59cc7a..2d69c1d678fe 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -958,7 +958,7 @@ static int zswap_writeback_entry(struct zpool *pool, un= signed long handle) }; if (!zpool_can_sleep_mapped(pool)) { - tmp =3D kmalloc(PAGE_SIZE, GFP_ATOMIC); + tmp =3D kmalloc(PAGE_SIZE, GFP_KERNEL); if (!tmp) return -ENOMEM; } @@ -968,6 +968,7 @@ static int zswap_writeback_entry(struct zpool *pool, un= signed long handle) swpentry =3D zhdr->swpentry; /* here */ tree =3D zswap_trees[swp_type(swpentry)]; offset =3D swp_offset(swpentry); + zpool_unmap_handle(pool, handle); /* find and ref zswap entry */ spin_lock(&tree->lock); @@ -975,20 +976,12 @@ static int zswap_writeback_entry(struct zpool *pool, = unsigned long handle) if (!entry) { /* entry was invalidated */ spin_unlock(&tree->lock); - zpool_unmap_handle(pool, handle); kfree(tmp); return 0; } spin_unlock(&tree->lock); BUG_ON(offset !=3D entry->offset); - src =3D (u8 *)zhdr + sizeof(struct zswap_header); - if (!zpool_can_sleep_mapped(pool)) { - memcpy(tmp, src, entry->length); - src =3D tmp; - zpool_unmap_handle(pool, handle); - } - /* try to allocate swap cache page */ switch (zswap_get_swap_cache_page(swpentry, &page)) { case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ @@ -1006,6 +999,14 @@ static int zswap_writeback_entry(struct zpool *pool, = unsigned long handle) acomp_ctx =3D raw_cpu_ptr(entry->pool->acomp_ctx); dlen =3D PAGE_SIZE; + zhdr =3D zpool_map_handle(pool, handle, ZPOOL_MM_RO); + src =3D (u8 *)zhdr + sizeof(struct 
zswap_header); + if (!zpool_can_sleep_mapped(pool)) { + memcpy(tmp, src, entry->length); + src =3D tmp; + zpool_unmap_handle(pool, handle); + } + mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); @@ -1015,6 +1016,11 @@ static int zswap_writeback_entry(struct zpool *pool,= unsigned long handle) dlen =3D acomp_ctx->req->dlen; mutex_unlock(acomp_ctx->mutex); + if (!zpool_can_sleep_mapped(pool)) + kfree(tmp); + else + zpool_unmap_handle(pool, handle); + BUG_ON(ret); BUG_ON(dlen !=3D PAGE_SIZE); @@ -1045,7 +1051,11 @@ static int zswap_writeback_entry(struct zpool *pool,= unsigned long handle) zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - goto end; + return ret; + +fail: + if (!zpool_can_sleep_mapped(pool)) + kfree(tmp); /* * if we get here due to ZSWAP_SWAPCACHE_EXIST @@ -1054,17 +1064,10 @@ static int zswap_writeback_entry(struct zpool *pool= , unsigned long handle) * if we free the entry in the following put * it is also okay to return !0 */ -fail: spin_lock(&tree->lock); zswap_entry_put(tree, entry); spin_unlock(&tree->lock); -end: - if (zpool_can_sleep_mapped(pool)) - zpool_unmap_handle(pool, handle); - else - kfree(tmp); - return ret; } -- 2.30.2 From nobody Wed Apr 15 19:25:06 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id A499DC433FE for ; Thu, 17 Nov 2022 16:39:24 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234887AbiKQQjX (ORCPT ); Thu, 17 Nov 2022 11:39:23 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:46476 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S240673AbiKQQi4 (ORCPT ); Thu, 17 Nov 2022 11:38:56 -0500 Received: from mail-pf1-x430.google.com (mail-pf1-x430.google.com [IPv6:2607:f8b0:4864:20::430]) by 
lindbergh.monkeyblade.net (Postfix) with ESMTPS id 80AC37819C for ; Thu, 17 Nov 2022 08:38:44 -0800 (PST) Received: by mail-pf1-x430.google.com with SMTP id d192so2329012pfd.0 for ; Thu, 17 Nov 2022 08:38:44 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=Ub0OYe15qtfzmwShje8nnnhP69fp8NMaFxLpIwoOo2w=; b=X2dprwB0QPC7j1TVSlklZ88Q8XmK/LmAGADpOqATmpKBdhZqoAz9gHttUjlAr7LyYS 2ggFXhUxlfBNZDbRTVfJqo48oeuvu7i55YO0s67//uQ1HYT/OtC/vp4r/Qqh8JlGXidJ BFDMTVkZso68Q9PeguqKeOR4rVMQCbkBzuevXvop1HtCPnR3GCka8ZFmcrJsTJM1VLrk 7nduDgTiDLqXK9QgBdQq7dfRp3kKkjdIWbYrF55Vk1ZdsFGGIgEaS5rapEzAJVJkqSrN kTLsN4h07HhAlVC53YObiNpdyub9Q4y13KhPAm4MEMVgPeuY055s8mdJmAymVKtJJbmf b1tg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=Ub0OYe15qtfzmwShje8nnnhP69fp8NMaFxLpIwoOo2w=; b=5OLhOmAw6kdn3peICzJ2QxyAVBGcHX8B6h0Y6uzaN+tzXkvKzsB+s1y0FqW9DX2AzL 02PJbmrnweMp9isHtADG8eWTuXDsIkPL7dgb09OmPWC6/etzKZYGR1+FhXbEQ5v6nsbe p4B8zWklp7RbEJ75rJFZa047KpGHJLnWRm+STLrAGPQJADI2uh5dEUbEiKYuJ+UzOfuw Q3vPqScJIxyWnye+gA6gQtkhzy3e7pStwZTzrqxvJbj/RW35X+yTecqaXZbg/nGhOr07 p9Qx2aoVtUgMrRMAtxXgsC0+f7RWK7q1QeTyxG1QapUDgpRYmFGHp9+KqYEMiGs0Yw1n +R5A== X-Gm-Message-State: ANoB5pkrXsy6KGoc8LUioG3A0yutuT/kTX9r/9yXK2EZ7hT4hMew/15T UpbArIpp8aRDg1ZlADJYWZ0= X-Google-Smtp-Source: AA0mqf7L/OMubMPqn1OdvvDkKt+FAPKOS+epc/byFsdnPPcV3BRvyFiFa9fcrUJTL9IsdlbBAAWBDw== X-Received: by 2002:a62:54c2:0:b0:56b:fb4f:3d7c with SMTP id i185-20020a6254c2000000b0056bfb4f3d7cmr3709086pfb.54.1668703123868; Thu, 17 Nov 2022 08:38:43 -0800 (PST) Received: from localhost (fwdproxy-prn-000.fbsv.net. 
[2a03:2880:ff::face:b00c]) by smtp.gmail.com with ESMTPSA id a13-20020a170902eccd00b0016d4f05eb95sm1572828plh.272.2022.11.17.08.38.43 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 17 Nov 2022 08:38:43 -0800 (PST) From: Nhat Pham To: akpm@linux-foundation.org Cc: hannes@cmpxchg.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, minchan@kernel.org, ngupta@vflare.org, senozhatsky@chromium.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com Subject: [PATCH v4 2/5] zsmalloc: Consolidate zs_pool's migrate_lock and size_class's locks Date: Thu, 17 Nov 2022 08:38:36 -0800 Message-Id: <20221117163839.230900-3-nphamcs@gmail.com> X-Mailer: git-send-email 2.30.2 In-Reply-To: <20221117163839.230900-1-nphamcs@gmail.com> References: <20221117163839.230900-1-nphamcs@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Currently, zsmalloc has a hierarchy of locks, which includes a pool-level migrate_lock, and a lock for each size class. We have to obtain both locks in the hotpath in most cases anyway, except for zs_malloc. This exception will no longer exist when we introduce a LRU into the zs_pool for the new writeback functionality - we will need to obtain a pool-level lock to synchronize LRU handling even in zs_malloc. In preparation for zsmalloc writeback, consolidate these locks into a single pool-level lock, which drastically reduces the complexity of synchronization in zsmalloc. We have also benchmarked the lock consolidation to see the performance effect of this change on zram. First, we ran a synthetic FS workload on a server machine with 36 cores (same machine for all runs), using fs_mark -d ../zram1mnt -s 100000 -n 2500 -t 32 -k before and after for btrfs and ext4 on zram (FS usage is 80%). 
Here are the results (unit is file/second): With lock consolidation (btrfs): Average: 13520.2, Median: 13531.0, Stddev: 137.5961482019028 Without lock consolidation (btrfs): Average: 13487.2, Median: 13575.0, Stddev: 309.08283679298665 With lock consolidation (ext4): Average: 16824.4, Median: 16839.0, Stddev: 89.97388510006668 Without lock consolidation (ext4): Average: 16958.0, Median: 16986.0, Stddev: 194.7370021336469 As you can see, we observe a 0.3% regression in the median for btrfs, and a 0.9% regression in the median for ext4. This is a small, barely measurable difference in my opinion. For a more realistic scenario, we also tried building the kernel on zram. Here is the time it takes (in seconds): With lock consolidation (btrfs): real Average: 319.6, Median: 320.0, Stddev: 0.8944271909999159 user Average: 6894.2, Median: 6895.0, Stddev: 25.528415540334656 sys Average: 521.4, Median: 522.0, Stddev: 1.51657508881031 Without lock consolidation (btrfs): real Average: 319.8, Median: 320.0, Stddev: 0.8366600265340756 user Average: 6896.6, Median: 6899.0, Stddev: 16.04057355583023 sys Average: 520.6, Median: 521.0, Stddev: 1.140175425099138 With lock consolidation (ext4): real Average: 320.0, Median: 319.0, Stddev: 1.4142135623730951 user Average: 6896.8, Median: 6878.0, Stddev: 28.621670111997307 sys Average: 521.2, Median: 521.0, Stddev: 1.7888543819998317 Without lock consolidation (ext4): real Average: 319.6, Median: 319.0, Stddev: 0.8944271909999159 user Average: 6886.2, Median: 6887.0, Stddev: 16.93221781102523 sys Average: 520.4, Median: 520.0, Stddev: 1.140175425099138 The difference is entirely within the noise of a typical run on zram. This hardly justifies the complexity of maintaining both the pool lock and the class lock. In fact, for writeback, we would need to introduce yet another lock to prevent data races on the pool's LRU, further complicating the lock handling logic. IMHO, it is just better to collapse all of these into a single pool-level lock.
Suggested-by: Johannes Weiner Signed-off-by: Nhat Pham Acked-by: Minchan Kim --- mm/zsmalloc.c | 87 ++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d03941cace2c..326faa751f0a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -33,8 +33,7 @@ /* * lock ordering: * page_lock - * pool->migrate_lock - * class->lock + * pool->lock * zspage->lock */ @@ -192,7 +191,6 @@ static const int fullness_threshold_frac =3D 4; static size_t huge_class_size; struct size_class { - spinlock_t lock; struct list_head fullness_list[NR_ZS_FULLNESS]; /* * Size of objects stored in this class. Must be multiple @@ -247,8 +245,7 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct work_struct free_work; #endif - /* protect page/zspage migration */ - rwlock_t migrate_lock; + spinlock_t lock; }; struct zspage { @@ -355,7 +352,7 @@ static void cache_free_zspage(struct zs_pool *pool, str= uct zspage *zspage) kmem_cache_free(pool->zspage_cachep, zspage); } -/* class->lock(which owns the handle) synchronizes races */ +/* pool->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { *(unsigned long *)handle =3D obj; @@ -452,7 +449,7 @@ static __maybe_unused int is_first_page(struct page *pa= ge) return PagePrivate(page); } -/* Protected by class->lock */ +/* Protected by pool->lock */ static inline int get_zspage_inuse(struct zspage *zspage) { return zspage->inuse; @@ -597,13 +594,13 @@ static int zs_stats_size_show(struct seq_file *s, voi= d *v) if (class->index !=3D i) continue; - spin_lock(&class->lock); + spin_lock(&pool->lock); class_almost_full =3D zs_stat_get(class, CLASS_ALMOST_FULL); class_almost_empty =3D zs_stat_get(class, CLASS_ALMOST_EMPTY); obj_allocated =3D zs_stat_get(class, OBJ_ALLOCATED); obj_used =3D zs_stat_get(class, OBJ_USED); freeable =3D zs_can_compact(class); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); 
objs_per_zspage =3D class->objs_per_zspage; pages_used =3D obj_allocated / objs_per_zspage * @@ -916,7 +913,7 @@ static void __free_zspage(struct zs_pool *pool, struct = size_class *class, get_zspage_mapping(zspage, &class_idx, &fg); - assert_spin_locked(&class->lock); + assert_spin_locked(&pool->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(fg !=3D ZS_EMPTY); @@ -1247,19 +1244,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned = long handle, BUG_ON(in_interrupt()); /* It guarantees it can get zspage from handle safely */ - read_lock(&pool->migrate_lock); + spin_lock(&pool->lock); obj =3D handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage =3D get_zspage(page); /* - * migration cannot move any zpages in this zspage. Here, class->lock + * migration cannot move any zpages in this zspage. Here, pool->lock * is too heavy since callers would take some time until they calls * zs_unmap_object API so delegate the locking from class to zspage * which is smaller granularity. 
*/ migrate_read_lock(zspage); - read_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); class =3D zspage_class(pool, zspage); off =3D (class->size * obj_idx) & ~PAGE_MASK; @@ -1412,8 +1409,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t = size, gfp_t gfp) size +=3D ZS_HANDLE_SIZE; class =3D pool->size_class[get_size_class_index(size)]; - /* class->lock effectively protects the zpage migration */ - spin_lock(&class->lock); + /* pool->lock effectively protects the zpage migration */ + spin_lock(&pool->lock); zspage =3D find_get_zspage(class); if (likely(zspage)) { obj =3D obj_malloc(pool, zspage, handle); @@ -1421,12 +1418,12 @@ unsigned long zs_malloc(struct zs_pool *pool, size_= t size, gfp_t gfp) fix_fullness_group(class, zspage); record_obj(handle, obj); class_stat_inc(class, OBJ_USED, 1); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); return handle; } - spin_unlock(&class->lock); + spin_unlock(&pool->lock); zspage =3D alloc_zspage(pool, class, gfp); if (!zspage) { @@ -1434,7 +1431,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t = size, gfp_t gfp) return (unsigned long)ERR_PTR(-ENOMEM); } - spin_lock(&class->lock); + spin_lock(&pool->lock); obj =3D obj_malloc(pool, zspage, handle); newfg =3D get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); @@ -1447,7 +1444,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t = size, gfp_t gfp) /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); return handle; } @@ -1491,16 +1488,14 @@ void zs_free(struct zs_pool *pool, unsigned long ha= ndle) return; /* - * The pool->migrate_lock protects the race with zpage's migration + * The pool->lock protects the race with zpage's migration * so it's safe to get the page from handle. 
*/ - read_lock(&pool->migrate_lock); + spin_lock(&pool->lock); obj =3D handle_to_obj(handle); obj_to_page(obj, &f_page); zspage =3D get_zspage(f_page); class =3D zspage_class(pool, zspage); - spin_lock(&class->lock); - read_unlock(&pool->migrate_lock); obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); @@ -1510,7 +1505,7 @@ void zs_free(struct zs_pool *pool, unsigned long hand= le) free_zspage(pool, class, zspage); out: - spin_unlock(&class->lock); + spin_unlock(&pool->lock); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -1867,16 +1862,12 @@ static int zs_page_migrate(struct page *newpage, st= ruct page *page, pool =3D zspage->pool; /* - * The pool migrate_lock protects the race between zpage migration + * The pool's lock protects the race between zpage migration * and zs_free. */ - write_lock(&pool->migrate_lock); + spin_lock(&pool->lock); class =3D zspage_class(pool, zspage); - /* - * the class lock protects zpage alloc/free in the zspage. - */ - spin_lock(&class->lock); /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); @@ -1906,10 +1897,9 @@ static int zs_page_migrate(struct page *newpage, str= uct page *page, replace_sub_page(class, zspage, newpage, page); /* * Since we complete the data copy and set up new zspage structure, - * it's okay to release migration_lock. + * it's okay to release the pool's lock. 
*/ - write_unlock(&pool->migrate_lock); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); dec_zspage_isolation(zspage); migrate_write_unlock(zspage); @@ -1964,9 +1954,9 @@ static void async_free_zspage(struct work_struct *wor= k) if (class->index !=3D i) continue; - spin_lock(&class->lock); + spin_lock(&pool->lock); list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); } list_for_each_entry_safe(zspage, tmp, &free_pages, list) { @@ -1976,9 +1966,9 @@ static void async_free_zspage(struct work_struct *wor= k) get_zspage_mapping(zspage, &class_idx, &fullness); VM_BUG_ON(fullness !=3D ZS_EMPTY); class =3D pool->size_class[class_idx]; - spin_lock(&class->lock); + spin_lock(&pool->lock); __free_zspage(pool, class, zspage); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); } }; @@ -2039,10 +2029,11 @@ static unsigned long __zs_compact(struct zs_pool *p= ool, struct zspage *dst_zspage =3D NULL; unsigned long pages_freed =3D 0; - /* protect the race between zpage migration and zs_free */ - write_lock(&pool->migrate_lock); - /* protect zpage allocation/free */ - spin_lock(&class->lock); + /* + * protect the race between zpage migration and zs_free + * as well as zpage allocation/free + */ + spin_lock(&pool->lock); while ((src_zspage =3D isolate_zspage(class, true))) { /* protect someone accessing the zspage(i.e., zs_map_object) */ migrate_write_lock(src_zspage); @@ -2067,7 +2058,7 @@ static unsigned long __zs_compact(struct zs_pool *poo= l, putback_zspage(class, dst_zspage); migrate_write_unlock(dst_zspage); dst_zspage =3D NULL; - if (rwlock_is_contended(&pool->migrate_lock)) + if (spin_is_contended(&pool->lock)) break; } @@ -2084,11 +2075,9 @@ static unsigned long __zs_compact(struct zs_pool *po= ol, pages_freed +=3D class->pages_per_zspage; } else migrate_write_unlock(src_zspage); - spin_unlock(&class->lock); - write_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); cond_resched(); - 
write_lock(&pool->migrate_lock); - spin_lock(&class->lock); + spin_lock(&pool->lock); } if (src_zspage) { @@ -2096,8 +2085,7 @@ static unsigned long __zs_compact(struct zs_pool *poo= l, migrate_write_unlock(src_zspage); } - spin_unlock(&class->lock); - write_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); return pages_freed; } @@ -2200,7 +2188,7 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); - rwlock_init(&pool->migrate_lock); + spin_lock_init(&pool->lock); pool->name =3D kstrdup(name, GFP_KERNEL); if (!pool->name) @@ -2271,7 +2259,6 @@ struct zs_pool *zs_create_pool(const char *name) class->index =3D i; class->pages_per_zspage =3D pages_per_zspage; class->objs_per_zspage =3D objs_per_zspage; - spin_lock_init(&class->lock); pool->size_class[i] =3D class; for (fullness =3D ZS_EMPTY; fullness < NR_ZS_FULLNESS; fullness++) -- 2.30.2 From nobody Wed Apr 15 19:25:06 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id DDFD4C433FE for ; Thu, 17 Nov 2022 16:39:32 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S240337AbiKQQjb (ORCPT ); Thu, 17 Nov 2022 11:39:31 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:47176 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S240677AbiKQQi4 (ORCPT ); Thu, 17 Nov 2022 11:38:56 -0500 Received: from mail-pf1-x42d.google.com (mail-pf1-x42d.google.com [IPv6:2607:f8b0:4864:20::42d]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D33635BD5C for ; Thu, 17 Nov 2022 08:38:45 -0800 (PST) Received: by mail-pf1-x42d.google.com with SMTP id 130so2282130pfu.8 for ; Thu, 17 Nov 2022 08:38:45 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; 
h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=+p1hPj61mepMRhXTZUUxeqFE4oggG2yUtQcHGL8WEqI=; b=R3zzWdjc5t/hqkJuKcDtHkIvecRh7pqG32ZDOeKpoiUvm6JWW16cQBdC0opssauqa9 7zeNitEvIVRebPigqX1DOwT3lIXMPvIrYMmLV7WkKYai7bdcJur42doKXuGhgA1fWBYK YFET9iVKGKSnxcLUCzXWNasfQb4KvedIQ9EowX60sMHs/Ld+qan1Mhc9DOB34LoHONqi Wvrl1qxUfanPwueBdkvDOoynmkJBWjVNnj7WNgCC0XQ9hDnWZ7MQ33Ou4FhLpnWCnigG APgiIfw/vUuNDUgu8e0jgjJNvt/dfCUDFpIXhUm+WFMytCiLp9x5nCUsim84H9lqgcf7 xs1w== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=+p1hPj61mepMRhXTZUUxeqFE4oggG2yUtQcHGL8WEqI=; b=qt5IStls3ihDV3zsR0xyvjZodpnXFu7EEC52bEriPnu83cqiI11WQWY4H0xrylYNB+ E5XC2BX3jngrETq+gBFBGy4a8wt9uvC8fKi2cenBByafnUZ7dpw+7RIr4bLIbsguY7G4 W45Fi28x45eIDa+ny4Wc12DL2mL1SUjQFyApXdvz1ej7ilMnzcv9IalDpC4IUQEjF/fq sZYU+9buDnz9e/iOzC1Q3ymYv9oHf0zcNYqDcM5DHqTOjZnXiU8oQFiI9xunFFfCVzm0 uZi8uk8PryuAXwQR/CleQv2Z2r/EZ5WBs12zf7nvAxU7oeLoTYgzVYGj+AyZ7G+/JUCT F2yg== X-Gm-Message-State: ANoB5pm5kSzo1w+Wr4jGUQNez1Ncc/H1hQrWecKeG/aubca4kwqM/xN9 3Wa7NuL1knT2khYYAyC9UDs= X-Google-Smtp-Source: AA0mqf5I9JnPFeNm/aV8+xxE4ZiIxU0CP+AdVIaS0pp0EvR8dF6GS5lQnCasEfg8QfuISDAhtOtnSA== X-Received: by 2002:a05:6a00:2352:b0:572:91c6:9e4e with SMTP id j18-20020a056a00235200b0057291c69e4emr3760746pfj.53.1668703125266; Thu, 17 Nov 2022 08:38:45 -0800 (PST) Received: from localhost (fwdproxy-prn-000.fbsv.net. 
[2a03:2880:ff::face:b00c]) by smtp.gmail.com with ESMTPSA id a23-20020aa79717000000b0056bc31f4f9fsm1373044pfg.65.2022.11.17.08.38.44 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 17 Nov 2022 08:38:44 -0800 (PST) From: Nhat Pham To: akpm@linux-foundation.org Cc: hannes@cmpxchg.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, minchan@kernel.org, ngupta@vflare.org, senozhatsky@chromium.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com Subject: [PATCH v4 3/5] zsmalloc: Add a LRU to zs_pool to keep track of zspages in LRU order Date: Thu, 17 Nov 2022 08:38:37 -0800 Message-Id: <20221117163839.230900-4-nphamcs@gmail.com> X-Mailer: git-send-email 2.30.2 In-Reply-To: <20221117163839.230900-1-nphamcs@gmail.com> References: <20221117163839.230900-1-nphamcs@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" This helps determine the coldest zspages as candidates for writeback.
Signed-off-by: Nhat Pham --- mm/zsmalloc.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 326faa751f0a..2557b55ec767 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -239,6 +239,11 @@ struct zs_pool { /* Compact classes */ struct shrinker shrinker; +#ifdef CONFIG_ZPOOL + /* List tracking the zspages in LRU order by most recently added object */ + struct list_head lru; +#endif + #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif @@ -260,6 +265,12 @@ struct zspage { unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ + +#ifdef CONFIG_ZPOOL + /* links the zspage to the lru list in the pool */ + struct list_head lru; +#endif + struct zs_pool *pool; #ifdef CONFIG_COMPACTION rwlock_t lock; @@ -352,6 +363,18 @@ static void cache_free_zspage(struct zs_pool *pool, st= ruct zspage *zspage) kmem_cache_free(pool->zspage_cachep, zspage); } +#ifdef CONFIG_ZPOOL +/* Moves the zspage to the front of the zspool's LRU */ +static void move_to_front(struct zs_pool *pool, struct zspage *zspage) +{ + assert_spin_locked(&pool->lock); + + if (!list_empty(&zspage->lru)) + list_del(&zspage->lru); + list_add(&zspage->lru, &pool->lru); +} +#endif + /* pool->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { @@ -953,6 +976,9 @@ static void free_zspage(struct zs_pool *pool, struct si= ze_class *class, } remove_zspage(class, zspage, ZS_EMPTY); +#ifdef CONFIG_ZPOOL + list_del(&zspage->lru); +#endif __free_zspage(pool, class, zspage); } @@ -998,6 +1024,10 @@ static void init_zspage(struct size_class *class, str= uct zspage *zspage) off %=3D PAGE_SIZE; } +#ifdef CONFIG_ZPOOL + INIT_LIST_HEAD(&zspage->lru); +#endif + set_freeobj(zspage, 0); } @@ -1418,6 +1448,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t= size, gfp_t gfp) fix_fullness_group(class, zspage); record_obj(handle, obj); 
class_stat_inc(class, OBJ_USED, 1); + +#ifdef CONFIG_ZPOOL + /* Move the zspage to front of pool's LRU */ + move_to_front(pool, zspage); +#endif spin_unlock(&pool->lock); return handle; @@ -1444,6 +1479,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t= size, gfp_t gfp) /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); +#ifdef CONFIG_ZPOOL + /* Move the zspage to front of pool's LRU */ + move_to_front(pool, zspage); +#endif spin_unlock(&pool->lock); return handle; @@ -1967,6 +2006,9 @@ static void async_free_zspage(struct work_struct *wor= k) VM_BUG_ON(fullness !=3D ZS_EMPTY); class =3D pool->size_class[class_idx]; spin_lock(&pool->lock); +#ifdef CONFIG_ZPOOL + list_del(&zspage->lru); +#endif __free_zspage(pool, class, zspage); spin_unlock(&pool->lock); } @@ -2278,6 +2320,10 @@ struct zs_pool *zs_create_pool(const char *name) */ zs_register_shrinker(pool); +#ifdef CONFIG_ZPOOL + INIT_LIST_HEAD(&pool->lru); +#endif + return pool; err: -- 2.30.2 From nobody Wed Apr 15 19:25:06 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 13399C433FE for ; Thu, 17 Nov 2022 16:39:30 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S240584AbiKQQj1 (ORCPT ); Thu, 17 Nov 2022 11:39:27 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:47762 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S233679AbiKQQi4 (ORCPT ); Thu, 17 Nov 2022 11:38:56 -0500 Received: from mail-pj1-x1036.google.com (mail-pj1-x1036.google.com [IPv6:2607:f8b0:4864:20::1036]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 3DF5E7C011 for ; Thu, 17 Nov 2022 08:38:47 -0800 (PST) Received: by mail-pj1-x1036.google.com with SMTP id r61-20020a17090a43c300b00212f4e9cccdso5875119pjg.5 for ; Thu, 17 
Nov 2022 08:38:47 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=8Nqg4Xt8lGKthWB+nE04XmQsPpZUs65jb+qHOq4Cr/k=; b=PRXwfJiPmdnJffoazkBjUBN8ZE3gBH/hkNy/65kEe1ygSQwxKpmoEIjZkvNBkBd3d4 UAcN7kr2yQiixORgoPBvpXxN27IMZ+QOmLa8coD8Fa9cMn2Khspr0MSgul60p0CoPc9S V+hm8qNojAxVS77NWN/UxmzX/5B2ujehMRzWW57FsYBMYCh65oPMx5O2EkKMzdJq8OIs wz9RkshLbCfjuOFy8mofp+9SYbllP0P65SoImEnn29027zUuKntQn2uac4eQxrdzBN7W n57wczZSCztTlEVXxl8d3ExlLHHiz5qAxdv8Y9hE3J/YV9vEgMNrMQKQ3YdiJzgniV/Z VWpw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=8Nqg4Xt8lGKthWB+nE04XmQsPpZUs65jb+qHOq4Cr/k=; b=yPrBJXPszLxGdSpBb3CPSD85IfpiYH2H31FrnITV4zsuRvfwDJjCVH/WQ9Xyp3rz+Q obfV7ot8cLemyiPz2HYMaBdQVSxM8daOlPQxT3B+An7f1ORnmPjElh56emLmb9es//7B qxopPOaNW0zhs9Dkiy83y5twGQ1pBoZ3raovOzzh8pk4hEsSX1Jz7omiRPWtRLnL6Pv9 YGWnGvzgDYc4hER3n+/dBi1R9hAjnZF3JkLA+VXzjmFl2EU8jfbGEQYISU4ydqjG0NTw sPV+C1tMOf/g+Mli38Mgrqyfptbnx2/iVKIxN5MBMxfdc8F5dMhxBG19aFxz9oFy/WWm WkrQ== X-Gm-Message-State: ANoB5pkzoP31++w4J9lQbWW2PXVfcSqBXkXNK2YiX/xafn47hRbHB/ay UpCq5djHN8AbKVvy3IjwlSDHdMMGuSOUlw== X-Google-Smtp-Source: AA0mqf44KzoNad4Ffghvr1rp79PUpUhzA6ssZf3/8UcqNZf9nc9k7+WSrqnu8RxDJ2OxLOuAnvhU0g== X-Received: by 2002:a17:90a:bc4b:b0:212:d796:d30f with SMTP id t11-20020a17090abc4b00b00212d796d30fmr3699709pjv.9.1668703126784; Thu, 17 Nov 2022 08:38:46 -0800 (PST) Received: from localhost (fwdproxy-prn-001.fbsv.net. 
[2a03:2880:ff:1::face:b00c]) by smtp.gmail.com with ESMTPSA id y4-20020a17090322c400b0016f196209c9sm1645420plg.123.2022.11.17.08.38.45 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 17 Nov 2022 08:38:46 -0800 (PST) From: Nhat Pham To: akpm@linux-foundation.org Cc: hannes@cmpxchg.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, minchan@kernel.org, ngupta@vflare.org, senozhatsky@chromium.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com Subject: [PATCH v4 4/5] zsmalloc: Add ops fields to zs_pool to store evict handlers Date: Thu, 17 Nov 2022 08:38:38 -0800 Message-Id: <20221117163839.230900-5-nphamcs@gmail.com> X-Mailer: git-send-email 2.30.2 In-Reply-To: <20221117163839.230900-1-nphamcs@gmail.com> References: <20221117163839.230900-1-nphamcs@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" This adds fields to zs_pool to store evict handlers for writeback, analogous to the zbud allocator. 
Signed-off-by: Nhat Pham --- mm/zsmalloc.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2557b55ec767..776d0e15a401 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -225,6 +225,12 @@ struct link_free { }; }; +struct zs_pool; + +struct zs_ops { + int (*evict)(struct zs_pool *pool, unsigned long handle); +}; + struct zs_pool { const char *name; @@ -242,6 +248,9 @@ struct zs_pool { #ifdef CONFIG_ZPOOL /* List tracking the zspages in LRU order by most recently added object */ struct list_head lru; + const struct zs_ops *ops; + struct zpool *zpool; + const struct zpool_ops *zpool_ops; #endif #ifdef CONFIG_ZSMALLOC_STAT @@ -385,6 +394,18 @@ static void record_obj(unsigned long handle, unsigned = long obj) #ifdef CONFIG_ZPOOL +static int zs_zpool_evict(struct zs_pool *pool, unsigned long handle) +{ + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; +} + +static const struct zs_ops zs_zpool_ops =3D { + .evict =3D zs_zpool_evict +}; + static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) @@ -394,7 +415,19 @@ static void *zs_zpool_create(const char *name, gfp_t g= fp, * different contexts and its caller must provide a valid * gfp mask. 
*/ - return zs_create_pool(name); + struct zs_pool *pool =3D zs_create_pool(name); + + if (pool) { + pool->zpool =3D zpool; + pool->zpool_ops =3D zpool_ops; + + if (zpool_ops) + pool->ops =3D &zs_zpool_ops; + else + pool->ops =3D NULL; + } + + return pool; } static void zs_zpool_destroy(void *pool) -- 2.30.2 From nobody Wed Apr 15 19:25:06 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 1D860C433FE for ; Thu, 17 Nov 2022 16:39:36 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S240277AbiKQQjd (ORCPT ); Thu, 17 Nov 2022 11:39:33 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:49678 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234810AbiKQQi5 (ORCPT ); Thu, 17 Nov 2022 11:38:57 -0500 Received: from mail-pl1-x630.google.com (mail-pl1-x630.google.com [IPv6:2607:f8b0:4864:20::630]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A6C267819D for ; Thu, 17 Nov 2022 08:38:48 -0800 (PST) Received: by mail-pl1-x630.google.com with SMTP id p21so2114842plr.7 for ; Thu, 17 Nov 2022 08:38:48 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=ZYWORxAy79T9WSWxF/CzcXujG+zLBtFc0XTbXyObrLk=; b=cxoxzA1yR0JaEcjtU7/sl5d2AWPjbyN5aOS8lYZ4bai+Z+XDdRAtGrSYARtZzd+1cY ZWH39uulSUzkCQGhLtKeairHXCRc6rkUM09SU6GptTcoXjZxBuM+SMbWbzyin4tVEzf/ zJ3Pji1h80X7w9GZG8QKUW/EPJDpAJLrjUTbj1QJ5N3Mkz49EBn41LC/YiovSlPHCxhb mu6MrNkCYoYlTIwbAh0OU0DSpNzizg5v874pHUqkIRKLnwVgkMpiUA8x+kEtPSxOqXtN 7yVsFRxSEgER6zooRBZfUO1wolw9tCxG82Z3TTRGZgYpBMrB0xMLRrJRuIa6i7Hq/TO8 3kZg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; 
s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=ZYWORxAy79T9WSWxF/CzcXujG+zLBtFc0XTbXyObrLk=; b=lZarqQ9GEdnJy8jXMBRJYYs/SuWrpwkbLjSwhD660QE9FMFKknSZqwUrKQU4kXxIDd HX4piVioERt+zjAjyTISIqDrsMplpMtfODlyxX8LEKsJRaTiW1Ksph+I9uAUuuNnjqnN np6cVRrmQaplZyAskumuiwcNsKmETvirXvgwq9Rvpucm3yUYv8zDlXVTr9xTzzn9eEDv JjaJqMKt3/jFKxWNIRjmJJsIaIhwhfsE08GiQws+t9wQF9xwcm8Yv23e+g1IyVTEbOTE uaKtN4B2Wcv3HVn/69nrE8Bh1SpO/HzfSDykSfRkqWt9Cp/vIkxjrTz4yD9w8PIQeciP n4kQ== X-Gm-Message-State: ANoB5pkEfKrp5G731mUI5B4zUykSYocg1N2+mEYheIHCn5lCwy8cUQXC BViTZ4WjkoSH1bkTxHA396Y= X-Google-Smtp-Source: AA0mqf7fn8cu6fs/3fJDiIG2Radf7GNtZoRsfAylYg2J1YU0aHoQ+8MVjWntS4QeVlQ82OqGkAf7cQ== X-Received: by 2002:a17:902:d355:b0:17f:6fee:3334 with SMTP id l21-20020a170902d35500b0017f6fee3334mr3662600plk.10.1668703128072; Thu, 17 Nov 2022 08:38:48 -0800 (PST) Received: from localhost (fwdproxy-prn-012.fbsv.net. 
[2a03:2880:ff:c::face:b00c]) by smtp.gmail.com with ESMTPSA id m10-20020a634c4a000000b00476e84c3530sm1176069pgl.60.2022.11.17.08.38.47 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Thu, 17 Nov 2022 08:38:47 -0800 (PST) From: Nhat Pham To: akpm@linux-foundation.org Cc: hannes@cmpxchg.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, minchan@kernel.org, ngupta@vflare.org, senozhatsky@chromium.org, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com Subject: [PATCH v4 5/5] zsmalloc: Implement writeback mechanism for zsmalloc Date: Thu, 17 Nov 2022 08:38:39 -0800 Message-Id: <20221117163839.230900-6-nphamcs@gmail.com> X-Mailer: git-send-email 2.30.2 In-Reply-To: <20221117163839.230900-1-nphamcs@gmail.com> References: <20221117163839.230900-1-nphamcs@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" This commit adds the writeback mechanism for zsmalloc, analogous to the zbud allocator. Zsmalloc will attempt to determine the coldest zspage (i.e. least recently used) in the pool, and attempt to write back all the stored compressed objects via the pool's evict handler. 
Signed-off-by: Nhat Pham --- mm/zsmalloc.c | 203 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 184 insertions(+), 19 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 776d0e15a401..0ab9f173e964 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -280,10 +280,13 @@ struct zspage { struct list_head lru; #endif + bool under_reclaim; + + /* list of unfreed handles whose objects have been reclaimed */ + unsigned long *deferred_handles; + struct zs_pool *pool; -#ifdef CONFIG_COMPACTION rwlock_t lock; -#endif }; struct mapping_area { @@ -304,10 +307,11 @@ static bool ZsHugePage(struct zspage *zspage) return zspage->huge; } -#ifdef CONFIG_COMPACTION static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); + +#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage); static void migrate_write_lock_nested(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); @@ -315,9 +319,6 @@ static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static void migrate_lock_init(struct zspage *zspage) {} -static void migrate_read_lock(struct zspage *zspage) {} -static void migrate_read_unlock(struct zspage *zspage) {} static void migrate_write_lock(struct zspage *zspage) {} static void migrate_write_lock_nested(struct zspage *zspage) {} static void migrate_write_unlock(struct zspage *zspage) {} @@ -449,6 +450,27 @@ static void zs_zpool_free(void *pool, unsigned long ha= ndle) zs_free(pool, handle); } +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries); + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total =3D 0; + int ret =3D -EINVAL; + + while (total < pages) { + ret =3D zs_reclaim_page(pool, 8); + 
if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed =3D total; + + return ret; +} + static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -487,6 +509,7 @@ static struct zpool_driver zs_zpool_driver =3D { .malloc_support_movable =3D true, .malloc =3D zs_zpool_malloc, .free =3D zs_zpool_free, + .shrink =3D zs_zpool_shrink, .map =3D zs_zpool_map, .unmap =3D zs_zpool_unmap, .total_size =3D zs_zpool_total_size, @@ -960,6 +983,21 @@ static int trylock_zspage(struct zspage *zspage) return 0; } +/* + * Free all the deferred handles whose objects are freed in zs_free. + */ +static void free_handles(struct zs_pool *pool, struct zspage *zspage) +{ + unsigned long handle =3D (unsigned long)zspage->deferred_handles; + + while (handle) { + unsigned long nxt_handle =3D handle_to_obj(handle); + + cache_free_handle(pool, handle); + handle =3D nxt_handle; + } +} + static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { @@ -974,6 +1012,9 @@ static void __free_zspage(struct zs_pool *pool, struct= size_class *class, VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(fg !=3D ZS_EMPTY); + /* Free all deferred handles from zs_free */ + free_handles(pool, zspage); + next =3D page =3D get_first_page(zspage); do { VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1060,6 +1101,8 @@ static void init_zspage(struct size_class *class, str= uct zspage *zspage) #ifdef CONFIG_ZPOOL INIT_LIST_HEAD(&zspage->lru); #endif + zspage->under_reclaim =3D false; + zspage->deferred_handles =3D NULL; set_freeobj(zspage, 0); } @@ -1482,13 +1525,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t= size, gfp_t gfp) record_obj(handle, obj); class_stat_inc(class, OBJ_USED, 1); -#ifdef CONFIG_ZPOOL - /* Move the zspage to front of pool's LRU */ - move_to_front(pool, zspage); -#endif - spin_unlock(&pool->lock); - - return handle; + goto out; } spin_unlock(&pool->lock); @@ -1512,12 +1549,12 @@ unsigned long zs_malloc(struct 
zs_pool *pool, size_= t size, gfp_t gfp) /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); +out: #ifdef CONFIG_ZPOOL /* Move the zspage to front of pool's LRU */ move_to_front(pool, zspage); #endif spin_unlock(&pool->lock); - return handle; } EXPORT_SYMBOL_GPL(zs_malloc); @@ -1571,12 +1608,24 @@ void zs_free(struct zs_pool *pool, unsigned long ha= ndle) obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); + + if (zspage->under_reclaim) { + /* + * Reclaim needs the handles during writeback. It'll free + * them along with the zspage when it's done with them. + * + * Record current deferred handle at the memory location + * whose address is given by handle. + */ + record_obj(handle, (unsigned long)zspage->deferred_handles); + zspage->deferred_handles =3D (unsigned long *)handle; + spin_unlock(&pool->lock); + return; + } fullness =3D fix_fullness_group(class, zspage); - if (fullness !=3D ZS_EMPTY) - goto out; + if (fullness =3D=3D ZS_EMPTY) + free_zspage(pool, class, zspage); - free_zspage(pool, class, zspage); -out: spin_unlock(&pool->lock); cache_free_handle(pool, handle); } @@ -1776,7 +1825,7 @@ static enum fullness_group putback_zspage(struct size= _class *class, return fullness; } -#ifdef CONFIG_COMPACTION +#if defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) /* * To prevent zspage destroy during migration, zspage freeing should * hold locks of all pages in the zspage. @@ -1818,6 +1867,24 @@ static void lock_zspage(struct zspage *zspage) } migrate_read_unlock(zspage); } +#endif /* defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) */ + +#ifdef CONFIG_ZPOOL +/* + * Unlocks all the pages of the zspage. + * + * pool->lock must be held before this function is called + * to prevent the underlying pages from migrating. 
+ */ +static void unlock_zspage(struct zspage *zspage) +{ + struct page *page =3D get_first_page(zspage); + + do { + unlock_page(page); + } while ((page =3D get_next_page(page)) !=3D NULL); +} +#endif /* CONFIG_ZPOOL */ static void migrate_lock_init(struct zspage *zspage) { @@ -1834,6 +1901,7 @@ static void migrate_read_unlock(struct zspage *zspage= ) __releases(&zspage->lock) read_unlock(&zspage->lock); } +#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage) { write_lock(&zspage->lock); @@ -2398,6 +2466,103 @@ void zs_destroy_pool(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_destroy_pool); +#ifdef CONFIG_ZPOOL +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) +{ + int i, obj_idx, ret =3D 0; + unsigned long handle; + struct zspage *zspage; + struct page *page; + enum fullness_group fullness; + + if (retries =3D=3D 0 || !pool->ops || !pool->ops->evict) + return -EINVAL; + + /* Lock LRU and fullness list */ + spin_lock(&pool->lock); + if (list_empty(&pool->lru)) { + spin_unlock(&pool->lock); + return -EINVAL; + } + + for (i =3D 0; i < retries; i++) { + struct size_class *class; + + zspage =3D list_last_entry(&pool->lru, struct zspage, lru); + list_del(&zspage->lru); + + /* zs_free may free objects, but not the zspage and handles */ + zspage->under_reclaim =3D true; + + class =3D zspage_class(pool, zspage); + fullness =3D get_fullness_group(class, zspage); + + /* Lock out object allocations and object compaction */ + remove_zspage(class, zspage, fullness); + + spin_unlock(&pool->lock); + + /* Lock backing pages into place */ + lock_zspage(zspage); + + obj_idx =3D 0; + page =3D zspage->first_page; + while (1) { + handle =3D find_alloced_obj(class, page, &obj_idx); + if (!handle) { + page =3D get_next_page(page); + if (!page) + break; + obj_idx =3D 0; + continue; + } + + /* + * This will write the object and call + * zs_free. 
+ * + * zs_free will free the object, but the + * under_reclaim flag prevents it from freeing + * the zspage altogether. This is necessary so + * that we can continue working with the + * zspage potentially after the last object + * has been freed. + */ + ret =3D pool->ops->evict(pool, handle); + if (ret) + goto next; + + obj_idx++; + } + +next: + /* For freeing the zspage, or putting it back in the pool and LRU list. = */ + spin_lock(&pool->lock); + zspage->under_reclaim =3D false; + + if (!get_zspage_inuse(zspage)) { + /* + * Fullness went stale as zs_free() won't touch it + * while the page is removed from the pool. Fix it + * up for the check in __free_zspage(). + */ + zspage->fullness =3D ZS_EMPTY; + + __free_zspage(pool, class, zspage); + spin_unlock(&pool->lock); + return 0; + } + + putback_zspage(class, zspage); + list_add(&zspage->lru, &pool->lru); + unlock_zspage(zspage); + } + + spin_unlock(&pool->lock); + return -EAGAIN; +} +#endif /* CONFIG_ZPOOL */ + static int __init zs_init(void) { int ret; -- 2.30.2