From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7B3FB30F819; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; cv=none; b=oa/QEuw/e6OCCPoxJmUAwERYXOeiKmX8TrxSIAaiPR0yMHOeUTVoycUglJrpaQoD4zUbwmPVTBmE0Ii6W2iaEvtBXfeAWGgW1j5REy8gC77nfXd18rQ73AAUe0hHjnpjBUaVg7FXt2iRnfiipvQhjUZwpdrx9vx1mPBbhv32Oas= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; c=relaxed/simple; bh=ob6VrZobjIxhJd1syjRnD0Fz6UDbLx7k9+0T35c5Ifk=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=Voazdn/u1uhBX4/cVwPqKuivrr7yOdiCke6mApIIr7yrJLSA4AXvShEAxSnFOckrdfrtx4caSwK8wHsW7llhXE+MKJsKYIx2g8/gqpyqaUQrcdcHtx2Az+AD1ycYQr69o5s9yRJcUOFrgLB6FLuttiF8kxjyEmthK5QtiGRE0Xs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=RX0imueN; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="RX0imueN" Received: by smtp.kernel.org (Postfix) with ESMTPS id 2A7D0C19423; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544527; bh=ob6VrZobjIxhJd1syjRnD0Fz6UDbLx7k9+0T35c5Ifk=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=RX0imueNLxnKo6rUXkF0FvQ104tNzT+RDIgI3/9VpHbGhPl5ylP02acqwECdl9lIi csDtkTmVwrUdMm5ocEnx0KSqrFsH/NreFVdyX+OZz4oV8nTO/fXJ36uuqx+Q90OLqj QtHAv2ozRZVQlO8EJXfZhrov9EzaOU6DtSnvEkEfSoNOFrDO2rl3il4T2iEK1xPhdM 
29LcWXU+gcudZkW0wUMrsTxx4l6WtZfZQ1NcX7b3Hh9Jg0JurM3DDaBiyeaCNZlvq4 MbrubBu3OlOc0PTIHbGKHerkilSmMx8S+PT5KLNpXaFyRD3kgWmbMEqZZtvNd24yTF 3WbmjOcRZVPJg== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 185FBC531EA; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:02 +0800 Subject: [PATCH RFC 01/15] mm: move thp_limit_gfp_mask to header Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-1-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=3548; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=EsaDsTDGqRS39tj8l8nvpBNRy9miEhNym6TC8RjJmDw=; b=wZakvnnlS13do00sOlIWHX8u8pD1P7hONi6OwqAIkTyXNu+BQg7Edjqh/iZMYuGM58ZYbJdRG 6jgP9IrnMLJDt6qpU2w2ZuSjAyEz0NYHpKdYslZ4KSCbENbjAT0krGo X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song No feature change, to be used later. 
Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 24 ++++++++++++++++++++++++ mm/shmem.c | 30 +++--------------------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a4d9f964dfde..d522e798822d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -237,6 +237,30 @@ static inline bool thp_vma_suitable_order(struct vm_ar= ea_struct *vma, return true; } =20 +/* + * Make sure huge_gfp is always more limited than limit_gfp. + * Some of the flags set permissions, while others set limitations. + */ +static inline gfp_t thp_limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + gfp_t allowflags =3D __GFP_IO | __GFP_FS | __GFP_RECLAIM; + gfp_t denyflags =3D __GFP_NOWARN | __GFP_NORETRY; + gfp_t zoneflags =3D limit_gfp & GFP_ZONEMASK; + gfp_t result =3D huge_gfp & ~(allowflags | GFP_ZONEMASK); + + /* Allow allocations only from the originally specified zones. */ + result |=3D zoneflags; + + /* + * Minimize the result gfp by taking the union with the deny flags, + * and the intersection of the allow flags. + */ + result |=3D (limit_gfp & denyflags); + result |=3D (huge_gfp & limit_gfp) & allowflags; + + return result; +} + /* * Filter the bitfield of input orders to the ones suitable for use in the= vma. * See thp_vma_suitable_order(). diff --git a/mm/shmem.c b/mm/shmem.c index b976b40fd442..9f054b5aae8e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1788,30 +1788,6 @@ static struct folio *shmem_swapin_cluster(swp_entry_= t swap, gfp_t gfp, return folio; } =20 -/* - * Make sure huge_gfp is always more limited than limit_gfp. - * Some of the flags set permissions, while others set limitations. 
- */ -static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) -{ - gfp_t allowflags =3D __GFP_IO | __GFP_FS | __GFP_RECLAIM; - gfp_t denyflags =3D __GFP_NOWARN | __GFP_NORETRY; - gfp_t zoneflags =3D limit_gfp & GFP_ZONEMASK; - gfp_t result =3D huge_gfp & ~(allowflags | GFP_ZONEMASK); - - /* Allow allocations only from the originally specified zones. */ - result |=3D zoneflags; - - /* - * Minimize the result gfp by taking the union with the deny flags, - * and the intersection of the allow flags. - */ - result |=3D (limit_gfp & denyflags); - result |=3D (huge_gfp & limit_gfp) & allowflags; - - return result; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void) { @@ -2062,7 +2038,7 @@ static struct folio *shmem_swap_alloc_folio(struct in= ode *inode, non_swapcache_batch(entry, nr_pages) !=3D nr_pages) goto fallback; =20 - alloc_gfp =3D limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + alloc_gfp =3D thp_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); } retry: new =3D shmem_alloc_folio(alloc_gfp, order, info, index); @@ -2138,7 +2114,7 @@ static int shmem_replace_folio(struct folio **foliop,= gfp_t gfp, if (nr_pages > 1) { gfp_t huge_gfp =3D vma_thp_gfp_mask(vma); =20 - gfp =3D limit_gfp_mask(huge_gfp, gfp); + gfp =3D thp_limit_gfp_mask(huge_gfp, gfp); } #endif =20 @@ -2545,7 +2521,7 @@ static int shmem_get_folio_gfp(struct inode *inode, p= goff_t index, gfp_t huge_gfp; =20 huge_gfp =3D vma_thp_gfp_mask(vma); - huge_gfp =3D limit_gfp_mask(huge_gfp, gfp); + huge_gfp =3D thp_limit_gfp_mask(huge_gfp, gfp); folio =3D shmem_alloc_and_add_folio(vmf, huge_gfp, inode, index, fault_mm, orders); if (!IS_ERR(folio)) { --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7B37F30B539; Thu, 19 Feb 2026 23:42:07 +0000 
(UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; cv=none; b=e+knKnCxlqIAOPTTM5gtkscrMEWM/GV5MXK8fprJVt6VWi+50pEcqNKme0Vhoi1BQYEYu1ctFyuhfog/xkSTkd03O8pG66sYp2d4NzCQTUBEqG94Gjc/PR4jA34VXFkGC1j2HPvUURF+R6LIt9lMCF/Ex2HBW6K6jyqyUm7qxL4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; c=relaxed/simple; bh=5uwq0THsq2yE7wG5RBgSijQvYGxW9tqtgr8TxpPxV/0=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=k0bWGQCvZZ31hvAQv1QHdUmzvqfQ0dsRBAevDxbEuR3b1grW3DKW9Ki468mSNjKB4BCLl+R+WqfdzFZl8OhYqqPV4s7xOt3f4QdJvwKJUA1P72RhOtCLfyo0df7FTrwvR2JdeoF9zw6CEV3deqJVwgenP1nvVG1xJcdKImd+o4c= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=nRrdbCjn; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="nRrdbCjn" Received: by smtp.kernel.org (Postfix) with ESMTPS id 3A67FC116C6; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544527; bh=5uwq0THsq2yE7wG5RBgSijQvYGxW9tqtgr8TxpPxV/0=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=nRrdbCjnoXXR6G/oCB6bcdFMKGk6HxINrFXkE18iajX1TwVzHl5qq2wq3BnWKfOEW 5XtTxo33wH97kj1hXsdVAmd47YFeTgNBZVqsh5kC68qIoVKHdAdTIWYV2dvp6Z3ovZ 6G1WC/i4LW9VriWImqilMeMLUtaKaHuJHdn1O7THKxwHagwf04vJU7sFpNMOGgZold iK4u71B3+5IlDi+TeKASl358VbzH2OTTsVkB4TGjpmr0ieNddqmqmFrmIzrw7yAXwu Ai6Zs3mcp5jaJGQvP+8bIh6+j0X9m4oDV7mtwE9tYVnhgjVhP2wSxoMNvZerIDFnXF AqJZqUzUe80OA== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 2C68EC531E3; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) From: Kairui Song via B4 Relay Date: 
Fri, 20 Feb 2026 07:42:03 +0800 Subject: [PATCH RFC 02/15] mm, swap: simplify swap_cache_alloc_folio Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-2-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=13576; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=n1cHUHJ7MXKr4ug4xnaunQlLGRDtG75OYXxYppL+FWk=; b=rG4YFu9U3/Ih89l7XZZzYUxJLVJtLVHuwWYfjQRpWNLAsLy8kWBYQsXB41nE3xM5e97+pg9FG AKJktt5DLY+BSvzlENe0fgk9nrA8g9gZSTEQhmyPPTLIE1BCU+cARSg X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song Instead of trying to return the existing folio if the entry is already cached, just return an error code if the allocation failed. And introduce proper wrappers that handle the allocation failure in different ways. For async swapin and readahead, the caller only wants to issue a swap-in read if the allocation succeeded, and for zswap swap out, the caller will just abort if the allocation failed because the entry is gone or cached already. 
Signed-off-by: Kairui Song --- mm/swap.h | 3 +- mm/swap_state.c | 177 +++++++++++++++++++++++++++++-----------------------= ---- mm/zswap.c | 15 ++--- 3 files changed, 98 insertions(+), 97 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index a77016f2423b..ad8b17a93758 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -281,8 +281,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); void swap_cache_del_folio(struct folio *folio); struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, - struct mempolicy *mpol, pgoff_t ilx, - bool *alloced); + struct mempolicy *mpol, pgoff_t ilx); /* Below helpers require the caller to lock and pass in the swap cluster. = */ void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry); diff --git a/mm/swap_state.c b/mm/swap_state.c index 32d9d877bda8..53fa95059012 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -459,41 +459,24 @@ void swap_update_readahead(struct folio *folio, struc= t vm_area_struct *vma, * All swap slots covered by the folio must have a non-zero swap count. * * Context: Caller must protect the swap device with reference count or lo= cks. - * Return: Returns the folio being added on success. Returns the existing = folio - * if @entry is already cached. Returns NULL if raced with swapin or swapo= ff. + * Return: 0 if success, error code if failed. 
*/ -static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, - struct folio *folio, - gfp_t gfp, bool charged) +static int __swap_cache_prepare_and_add(swp_entry_t entry, + struct folio *folio, + gfp_t gfp, bool charged) { - struct folio *swapcache =3D NULL; void *shadow; int ret; =20 __folio_set_locked(folio); __folio_set_swapbacked(folio); - for (;;) { - ret =3D swap_cache_add_folio(folio, entry, &shadow); - if (!ret) - break; - - /* - * Large order allocation needs special handling on - * race: if a smaller folio exists in cache, swapin needs - * to fallback to order 0, and doing a swap cache lookup - * might return a folio that is irrelevant to the faulting - * entry because @entry is aligned down. Just return NULL. - */ - if (ret !=3D -EEXIST || folio_test_large(folio)) - goto failed; - - swapcache =3D swap_cache_get_folio(entry); - if (swapcache) - goto failed; - } + ret =3D swap_cache_add_folio(folio, entry, &shadow); + if (ret) + goto failed; =20 if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { swap_cache_del_folio(folio); + ret =3D -ENOMEM; goto failed; } =20 @@ -503,11 +486,11 @@ static struct folio *__swap_cache_prepare_and_add(swp= _entry_t entry, =20 /* Caller will initiate read into locked folio */ folio_add_lru(folio); - return folio; + return 0; =20 failed: folio_unlock(folio); - return swapcache; + return ret; } =20 /** @@ -516,7 +499,6 @@ static struct folio *__swap_cache_prepare_and_add(swp_e= ntry_t entry, * @gfp_mask: memory allocation flags * @mpol: NUMA memory allocation policy to be applied * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE - * @new_page_allocated: sets true if allocation happened, false otherwise * * Allocate a folio in the swap cache for one swap slot, typically before * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by @@ -524,18 +506,40 @@ static struct folio *__swap_cache_prepare_and_add(swp= _entry_t entry, * Currently only supports order 0. 
* * Context: Caller must protect the swap device with reference count or lo= cks. - * Return: Returns the existing folio if @entry is cached already. Returns - * NULL if failed due to -ENOMEM or @entry have a swap count < 1. + * Return: Returns the folio if allocation succeeded and folio is added to + * swap cache. Returns an ERR_PTR() error code if allocation failed or raced. */ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated) + struct mempolicy *mpol, pgoff_t ilx) +{ + int ret; + struct folio *folio; + + /* Allocate a new folio to be added into the swap cache. */ + folio =3D folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); + if (!folio) + return ERR_PTR(-ENOMEM); + + /* + * Try to add the new folio; it fails with -EEXIST if a folio already + * exists, since folio is order 0. + */ + ret =3D __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); + if (ret) { + folio_put(folio); + return ERR_PTR(ret); + } + + return folio; +} + +static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp, + struct mempolicy *mpol, pgoff_t ilx, + struct swap_iocb **plug, bool readahead) { struct swap_info_struct *si =3D __swap_entry_to_info(entry); struct folio *folio; - struct folio *result =3D NULL; =20 - *new_page_allocated =3D false; /* Check the swap cache again for readahead path. */ folio =3D swap_cache_get_folio(entry); if (folio) @@ -545,17 +549,24 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entr= y, gfp_t gfp_mask, if (!swap_entry_swapped(si, entry)) return NULL; =20 - /* Allocate a new folio to be added into the swap cache. 
*/ - folio =3D folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); - if (!folio) + do { + folio =3D swap_cache_get_folio(entry); + if (folio) + return folio; + + folio =3D swap_cache_alloc_folio(entry, gfp, mpol, ilx); + } while (PTR_ERR(folio) =3D=3D -EEXIST); + + if (IS_ERR_OR_NULL(folio)) return NULL; - /* Try add the new folio, returns existing folio or NULL on failure. */ - result =3D __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); - if (result =3D=3D folio) - *new_page_allocated =3D true; - else - folio_put(folio); - return result; + + swap_read_folio(folio, plug); + if (readahead) { + folio_set_readahead(folio); + count_vm_event(SWAP_RA); + } + + return folio; } =20 /** @@ -574,15 +585,35 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entr= y, gfp_t gfp_mask, */ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) { + int ret; struct folio *swapcache; pgoff_t offset =3D swp_offset(entry); unsigned long nr_pages =3D folio_nr_pages(folio); =20 entry =3D swp_entry(swp_type(entry), round_down(offset, nr_pages)); - swapcache =3D __swap_cache_prepare_and_add(entry, folio, 0, true); - if (swapcache =3D=3D folio) - swap_read_folio(folio, NULL); - return swapcache; + for (;;) { + ret =3D __swap_cache_prepare_and_add(entry, folio, 0, true); + if (!ret) { + swap_read_folio(folio, NULL); + break; + } + + /* + * Large order allocation needs special handling on + * race: if a smaller folio exists in cache, swapin needs + * to fallback to order 0, and doing a swap cache lookup + * might return a folio that is irrelevant to the faulting + * entry because @entry is aligned down. Just return NULL. 
+ */ + if (ret !=3D -EEXIST || nr_pages > 1) + return NULL; + + swapcache =3D swap_cache_get_folio(entry); + if (swapcache) + return swapcache; + } + + return folio; } =20 /* @@ -596,7 +627,6 @@ struct folio *read_swap_cache_async(swp_entry_t entry, = gfp_t gfp_mask, struct swap_iocb **plug) { struct swap_info_struct *si; - bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; @@ -606,13 +636,9 @@ struct folio *read_swap_cache_async(swp_entry_t entry,= gfp_t gfp_mask, return NULL; =20 mpol =3D get_vma_policy(vma, addr, 0, &ilx); - folio =3D swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, - &page_allocated); + folio =3D swap_cache_read_folio(entry, gfp_mask, mpol, ilx, plug, false); mpol_cond_put(mpol); =20 - if (page_allocated) - swap_read_folio(folio, plug); - put_swap_device(si); return folio; } @@ -697,7 +723,7 @@ static unsigned long swapin_nr_pages(unsigned long offs= et) * are fairly likely to have been swapped out from the same node. */ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx) + struct mempolicy *mpol, pgoff_t ilx) { struct folio *folio; unsigned long entry_offset =3D swp_offset(entry); @@ -707,7 +733,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry,= gfp_t gfp_mask, struct swap_info_struct *si =3D __swap_entry_to_info(entry); struct blk_plug plug; struct swap_iocb *splug =3D NULL; - bool page_allocated; + swp_entry_t ra_entry; =20 mask =3D swapin_nr_pages(offset) - 1; if (!mask) @@ -724,18 +750,11 @@ struct folio *swap_cluster_readahead(swp_entry_t entr= y, gfp_t gfp_mask, blk_start_plug(&plug); for (offset =3D start_offset; offset <=3D end_offset ; offset++) { /* Ok, do the async read-ahead now */ - folio =3D swap_cache_alloc_folio( - swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, - &page_allocated); + ra_entry =3D swp_entry(swp_type(entry), offset); + folio =3D swap_cache_read_folio(ra_entry, gfp_mask, mpol, ilx, + &splug, offset !=3D 
entry_offset); if (!folio) continue; - if (page_allocated) { - swap_read_folio(folio, &splug); - if (offset !=3D entry_offset) { - folio_set_readahead(folio); - count_vm_event(SWAP_RA); - } - } folio_put(folio); } blk_finish_plug(&plug); @@ -743,11 +762,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry= , gfp_t gfp_mask, lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - folio =3D swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, - &page_allocated); - if (unlikely(page_allocated)) - swap_read_folio(folio, NULL); - return folio; + return swap_cache_read_folio(entry, gfp_mask, mpol, ilx, NULL, false); } =20 static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, @@ -813,8 +828,7 @@ static struct folio *swap_vma_readahead(swp_entry_t tar= g_entry, gfp_t gfp_mask, pte_t *pte =3D NULL, pentry; int win; unsigned long start, end, addr; - pgoff_t ilx; - bool page_allocated; + pgoff_t ilx =3D targ_ilx; =20 win =3D swap_vma_ra_win(vmf, &start, &end); if (win =3D=3D 1) @@ -848,19 +862,12 @@ static struct folio *swap_vma_readahead(swp_entry_t t= arg_entry, gfp_t gfp_mask, if (!si) continue; } - folio =3D swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, - &page_allocated); + folio =3D swap_cache_read_folio(entry, gfp_mask, mpol, ilx, + &splug, addr !=3D vmf->address); if (si) put_swap_device(si); if (!folio) continue; - if (page_allocated) { - swap_read_folio(folio, &splug); - if (addr !=3D vmf->address) { - folio_set_readahead(folio); - count_vm_event(SWAP_RA); - } - } folio_put(folio); } if (pte) @@ -870,10 +877,8 @@ static struct folio *swap_vma_readahead(swp_entry_t ta= rg_entry, gfp_t gfp_mask, lru_add_drain(); skip: /* The folio was likely read above, so no need for plugging here */ - folio =3D swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx, - &page_allocated); - if (unlikely(page_allocated)) - swap_read_folio(folio, NULL); + folio =3D 
swap_cache_read_folio(targ_entry, gfp_mask, mpol, targ_ilx, + NULL, false); return folio; } =20 diff --git a/mm/zswap.c b/mm/zswap.c index af3f0fbb0558..f3aa83a99636 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -992,7 +992,6 @@ static int zswap_writeback_entry(struct zswap_entry *en= try, pgoff_t offset =3D swp_offset(swpentry); struct folio *folio; struct mempolicy *mpol; - bool folio_was_allocated; struct swap_info_struct *si; int ret =3D 0; =20 @@ -1003,22 +1002,20 @@ static int zswap_writeback_entry(struct zswap_entry= *entry, =20 mpol =3D get_task_policy(current); folio =3D swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated); + NO_INTERLEAVE_INDEX); put_swap_device(si); - if (!folio) - return -ENOMEM; =20 /* + * Swap cache allocation might fail due to OOM, a race with free, + * or an existing folio due to concurrent swapin or shrinker. * Found an existing folio, we raced with swapin or concurrent * shrinker. We generally writeback cold folios from zswap, and * swapin means the folio just became hot, so skip this folio. * For unlikely concurrent shrinker case, it will be unlinked * and freed when invalidated by the concurrent shrinker anyway. 
*/ - if (!folio_was_allocated) { - ret =3D -EEXIST; - goto out; - } + if (IS_ERR(folio)) + return PTR_ERR(folio); =20 /* * folio is locked, and the swapcache is now secured against @@ -1058,7 +1055,7 @@ static int zswap_writeback_entry(struct zswap_entry *= entry, __swap_writepage(folio, NULL); =20 out: - if (ret && ret !=3D -EEXIST) { + if (ret) { swap_cache_del_folio(folio); folio_unlock(folio); } --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7F039318BAC; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; cv=none; b=Fw1w/0ju0qYaBGN8egrqkyTh95pUnt+mt/b+abgBYbj54srFN7D4gGnMRohWIil0DEATz4wYHue7kaMWUSuPMaS2s5MMHUSV3gjz3RUjLIp3pwcKb2WKL50YJsfZHycTprzkpSPbvHefv2vIIDuF18Eer26PRrP85ftnRLVl4IQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; c=relaxed/simple; bh=WSo3skmLSrebPHHPF6qNxghI6awr0lM5qgwbnMJhTJI=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=rQlv6VVqTyumqNQnMfBs43FfVA4JqudMZS4HTyEC6syJIBCmAmG6tvL8auxO3hKZNk2svFmi6qgmGva/Nzz9+xdqoMQHXFc6vCyljMLJ7dgShMwC63BsX1JDnN1WWpmIJIIrFbYyvA7N/84eKMw8q0PmDkwAoXouZAjEru1hiW0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=VFXvPCuF; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="VFXvPCuF" Received: by smtp.kernel.org (Postfix) with ESMTPS id 55FA9C19424; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; 
c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544527; bh=WSo3skmLSrebPHHPF6qNxghI6awr0lM5qgwbnMJhTJI=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=VFXvPCuF/nFHG4CnQmBgxAzX0WavcFlKpayccYoFpNvlmlr/wDxM390ScxIPDw5cE G4eIy5QNLNoncH59IZhtYkzpyDAep36Ox4c8lbgWjVIAh/otAcZp3MleL8iI4BPd5f uPURcb37dfXpMKGvM8non1GZ5Ntww1seUbgE11QXxJh4qGASLGDlO4Y5TktKwoHz/0 JR1m8gvalduWPdKkJivQnvXxtgdgMuuQEtXNSUiyfIy+gQ3ijhtMgtpy9vkmKwXxgy D2CGhiUe//3Z2fPXChgIpzEsdE3B/ZmQcDf5YhuhIwnGSGbc4deN2xkHW09XWcpClv XZbTUmznSrKUQ== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 447A5C531EC; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:04 +0800 Subject: [PATCH RFC 03/15] mm, swap: move conflict checking logic out of swap cache adding Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-3-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=2479; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=rqW0brukQP1nLveDT5lSrI7q5RqloSsXptUI4qjvbG0=; b=baKnG1/Gi7wFMKTzje7SnEgg9EDLUa44x9RAZMPl9v/vf3/tXIL//YuczgbEESSIT40UR2HUU 
O82G6iejjPfADkB7eXmmU+PTBDK4/16jjf74zjEuJmZ1qNRnyE7B6fM X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song No feature change, make later commits easier to review. Signed-off-by: Kairui Song --- mm/swap_state.c | 55 ++++++++++++++++++++++++++++++-----------------------= -- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 53fa95059012..1e340faea9ac 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -137,6 +137,28 @@ void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } =20 +static int __swap_cache_add_check(struct swap_cluster_info *ci, + unsigned int ci_off, unsigned int nr, + void **shadow) +{ + unsigned int ci_end =3D ci_off + nr; + unsigned long old_tb; + + if (unlikely(!ci->table)) + return -ENOENT; + do { + old_tb =3D __swap_table_get(ci, ci_off); + if (unlikely(swp_tb_is_folio(old_tb))) + return -EEXIST; + if (unlikely(!__swp_tb_get_count(old_tb))) + return -ENOENT; + if (swp_tb_is_shadow(old_tb)) + *shadow =3D swp_tb_to_shadow(old_tb); + } while (++ci_off < ci_end); + + return 0; +} + void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry) { @@ -179,43 +201,26 @@ static int swap_cache_add_folio(struct folio *folio, = swp_entry_t entry, { int err; void *shadow =3D NULL; - unsigned long old_tb; + unsigned int ci_off; struct swap_info_struct *si; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages =3D folio_nr_pages(folio); =20 si =3D __swap_entry_to_info(entry); - ci_start =3D swp_cluster_offset(entry); - ci_end =3D ci_start + nr_pages; - ci_off =3D ci_start; ci =3D swap_cluster_lock(si, swp_offset(entry)); - if (unlikely(!ci->table)) { - err =3D -ENOENT; - goto failed; + ci_off =3D 
swp_cluster_offset(entry); + err =3D __swap_cache_add_check(ci, ci_off, nr_pages, &shadow); + if (err) { + swap_cluster_unlock(ci); + return err; } - do { - old_tb =3D __swap_table_get(ci, ci_off); - if (unlikely(swp_tb_is_folio(old_tb))) { - err =3D -EEXIST; - goto failed; - } - if (unlikely(!__swp_tb_get_count(old_tb))) { - err =3D -ENOENT; - goto failed; - } - if (swp_tb_is_shadow(old_tb)) - shadow =3D swp_tb_to_shadow(old_tb); - } while (++ci_off < ci_end); + __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); if (shadowp) *shadowp =3D shadow; - return 0; =20 -failed: - swap_cluster_unlock(ci); - return err; + return 0; } =20 /** --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8BFDA3191D8; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; cv=none; b=NsazowwOXNcjEzZhzFSWaf4353ctvXtHjKoz9N5DUiSgTa4I+rg8bn5T8P3hWnRXXBPHxqaAo3znjnvdb0c3wp5j+vy5SeAZ/kLLj/cu5goBTay+gApo+7uoIeVGTNzcBOoWTot+jYMvj4NOvHmPnr1uZA4zVlGpI3Cr+QDWOqA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; c=relaxed/simple; bh=xxmdaXbJhm2fa/xOaTiqJLbiS6Wf5af2Y0X+Fu5RtUo=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=XsmKOf6nVbjwKPmMBh/XtN4tqXIlRjabnwVlQQs7VLjgxnngeEQFM8WoyB+G9u24l5iIzkSjv6L+WevFZ38bSAQjEGrWSrZIj3wAAX2Tb0sNeJMBJL7pKnCz1iPuR0HlQSi7+WwyG9GJ1MnlaZ+K1jM5rgPXqfgglXSqGZicqtk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ozQVrw3f; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: 
smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ozQVrw3f" Received: by smtp.kernel.org (Postfix) with ESMTPS id 6754CC2BC87; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544527; bh=xxmdaXbJhm2fa/xOaTiqJLbiS6Wf5af2Y0X+Fu5RtUo=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=ozQVrw3fR2WH1Iv9/eXne8azFHDLeuJC+Zmj3eo9zHJOTf7MFMkqLTNoJeNzZfxb7 vEaRwJtqDMv1v9alomEUEVe/Uv5UyeiUkw3ymYUqhQMxOAs1KMzgenkc5TlqC1LiDJ s1xLaiUR2ElFdmCQV4Oth0ukHW/uZ5YKfvbYK+8SiuFqFx4FwasYlyRtKu1h7U/20V PiNFBzl43elGgUTMbxHYIm0tHAHAgiwUr0q9jUsYQC5YuIJ4F7LTB7y3cXSWOamHWv gBZT/twWXLTH88IZlGn+3IITFPt/EHO9R2ORANTYa/8CUuhTSjZXaF61S3W0cx5L0S 9CbXqjfvLUAZg== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 5D584E9A04F; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:05 +0800 Subject: [PATCH RFC 04/15] mm, swap: add support for large order folios in swap cache directly Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-4-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; 
l=10217; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=iRaA16zPocun7MNZRSreX0nvZb3ZKPTMy/X1q06icVE=; b=zwpwc54oeGo2FV9brWSj7Ua9BWGbgsk7pitnp3rs3n/80DvSbA5U89WYopprBd/WDxiPl9kUJ 8kbvnF3WNGkBhVzE4ooNnEcc9N7F5qKH7pOt9uS3DzM4hdGngRJzAty X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song To make it possible to allocate large folios directly in swap cache, let swap_cache_alloc_folio handle larger orders too. This slightly changes how allocation is synchronized. Now, whoever first successfully allocates a folio in the swap cache will be the one who charges it and performs the swap-in. Raced swapin now should avoid a redundant charge and just wait for the swapin to finish. Large order fallback is also moved to the swap cache layer. This should make the fallback process less racy, too. Signed-off-by: Kairui Song --- mm/swap.h | 3 +- mm/swap_state.c | 193 +++++++++++++++++++++++++++++++++++++++++-----------= ---- mm/zswap.c | 2 +- 3 files changed, 145 insertions(+), 53 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index ad8b17a93758..6774af10a943 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -280,7 +280,8 @@ bool swap_cache_has_folio(swp_entry_t entry); struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); void swap_cache_del_folio(struct folio *folio); -struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, +struct folio *swap_cache_alloc_folio(swp_entry_t target_entry, gfp_t gfp_m= ask, + unsigned long orders, struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx); /* Below helpers require the caller to lock and pass in the swap cluster. 
= */ void __swap_cache_add_folio(struct swap_cluster_info *ci, diff --git a/mm/swap_state.c b/mm/swap_state.c index 1e340faea9ac..e32b06a1f229 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -137,26 +137,39 @@ void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } =20 -static int __swap_cache_add_check(struct swap_cluster_info *ci, - unsigned int ci_off, unsigned int nr, - void **shadow) +static int __swap_cache_check_batch(struct swap_cluster_info *ci, + unsigned int ci_off, unsigned int ci_targ, + unsigned int nr, void **shadowp) { unsigned int ci_end =3D ci_off + nr; unsigned long old_tb; =20 if (unlikely(!ci->table)) return -ENOENT; + do { old_tb =3D __swap_table_get(ci, ci_off); - if (unlikely(swp_tb_is_folio(old_tb))) - return -EEXIST; - if (unlikely(!__swp_tb_get_count(old_tb))) - return -ENOENT; + if (unlikely(swp_tb_is_folio(old_tb)) || + unlikely(!__swp_tb_get_count(old_tb))) + break; if (swp_tb_is_shadow(old_tb)) - *shadow =3D swp_tb_to_shadow(old_tb); + *shadowp =3D swp_tb_to_shadow(old_tb); } while (++ci_off < ci_end); =20 - return 0; + if (likely(ci_off =3D=3D ci_end)) + return 0; + + /* + * If the target slot is not suitable for adding swap cache, return + * -EEXIST or -ENOENT. If the batch is not suitable, could be a + * race with concurrent free or cache add, return -EBUSY. 
+ */ + old_tb =3D __swap_table_get(ci, ci_targ); + if (swp_tb_is_folio(old_tb)) + return -EEXIST; + if (!__swp_tb_get_count(old_tb)) + return -ENOENT; + return -EBUSY; } =20 void __swap_cache_add_folio(struct swap_cluster_info *ci, @@ -209,7 +222,7 @@ static int swap_cache_add_folio(struct folio *folio, sw= p_entry_t entry, si =3D __swap_entry_to_info(entry); ci =3D swap_cluster_lock(si, swp_offset(entry)); ci_off =3D swp_cluster_offset(entry); - err =3D __swap_cache_add_check(ci, ci_off, nr_pages, &shadow); + err =3D __swap_cache_check_batch(ci, ci_off, ci_off, nr_pages, &shadow); if (err) { swap_cluster_unlock(ci); return err; @@ -223,6 +236,124 @@ static int swap_cache_add_folio(struct folio *folio, = swp_entry_t entry, return 0; } =20 +static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, + swp_entry_t targ_entry, gfp_t gfp, + unsigned int order, struct vm_fault *vmf, + struct mempolicy *mpol, pgoff_t ilx) +{ + int err; + swp_entry_t entry; + struct folio *folio; + void *shadow =3D NULL, *shadow_check =3D NULL; + unsigned long address, nr_pages =3D 1 << order; + unsigned int ci_off, ci_targ =3D swp_cluster_offset(targ_entry); + + entry.val =3D round_down(targ_entry.val, nr_pages); + ci_off =3D round_down(ci_targ, nr_pages); + + /* First check if the range is available */ + spin_lock(&ci->lock); + err =3D __swap_cache_check_batch(ci, ci_off, ci_targ, nr_pages, &shadow); + spin_unlock(&ci->lock); + if (unlikely(err)) + return ERR_PTR(err); + + if (vmf) { + if (order) + gfp =3D thp_limit_gfp_mask(vma_thp_gfp_mask(vmf->vma), gfp); + address =3D round_down(vmf->address, PAGE_SIZE << order); + folio =3D vma_alloc_folio(gfp, order, vmf->vma, address); + } else { + folio =3D folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); + } + if (unlikely(!folio)) + return ERR_PTR(-ENOMEM); + + /* Double check the range is still not in conflict */ + spin_lock(&ci->lock); + err =3D __swap_cache_check_batch(ci, ci_off, ci_targ, nr_pages, &shadow_c= heck); + if 
(unlikely(err) || shadow_check !=3D shadow) { + spin_unlock(&ci->lock); + folio_put(folio); + + /* If shadow changed, just try again */ + return ERR_PTR(err ? err : -EAGAIN); + } + + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + __swap_cache_add_folio(ci, folio, entry); + spin_unlock(&ci->lock); + + if (mem_cgroup_swapin_charge_folio(folio, vmf ? vmf->vma->vm_mm : NULL, + gfp, entry)) { + spin_lock(&ci->lock); + __swap_cache_del_folio(ci, folio, shadow); + spin_unlock(&ci->lock); + folio_unlock(folio); + folio_put(folio); + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); + return ERR_PTR(-ENOMEM); + } + + /* For memsw accounting, swap is uncharged when folio is added to swap cac= he */ + memcg1_swapin(entry, 1 << order); + if (shadow) + workingset_refault(folio, shadow); + + /* Caller will initiate read into locked folio */ + folio_add_lru(folio); + + return folio; +} + +/** + * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap ca= che. + * @targ_entry: swap entry indicating the target slot + * @orders: allocation orders + * @vmf: fault information + * @gfp_mask: memory allocation flags + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE + * + * Allocate a folio in the swap cache for one swap slot, typically before + * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by + * @targ_entry must have a non-zero swap count (swapped out). + * + * Context: Caller must protect the swap device with reference count or lo= cks. + * Return: Returns the folio if allocation succeeded and folio is added to + * swap cache. Returns error code if allocation failed due to race. 
+ */ +struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp_mas= k, + unsigned long orders, struct vm_fault *vmf, + struct mempolicy *mpol, pgoff_t ilx) +{ + int order; + struct folio *folio; + struct swap_cluster_info *ci; + + ci =3D __swap_entry_to_cluster(targ_entry); + order =3D orders ? highest_order(orders) : 0; + for (;;) { + folio =3D __swap_cache_alloc(ci, targ_entry, gfp_mask, order, + vmf, mpol, ilx); + if (!IS_ERR(folio)) + return folio; + if (PTR_ERR(folio) =3D=3D -EAGAIN) + continue; + /* Only -EBUSY means we should fallback and retry. */ + if (PTR_ERR(folio) !=3D -EBUSY) + return folio; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); + order =3D next_order(&orders, order); + if (!orders) + break; + } + /* Should never reach here, order 0 should not fail with -EBUSY. */ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} + /** * __swap_cache_del_folio - Removes a folio from the swap cache. * @ci: The locked swap cluster. @@ -498,46 +629,6 @@ static int __swap_cache_prepare_and_add(swp_entry_t en= try, return ret; } =20 -/** - * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap ca= che. - * @entry: the swapped out swap entry to be binded to the folio. - * @gfp_mask: memory allocation flags - * @mpol: NUMA memory allocation policy to be applied - * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE - * - * Allocate a folio in the swap cache for one swap slot, typically before - * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by - * @entry must have a non-zero swap count (swapped out). - * Currently only supports order 0. - * - * Context: Caller must protect the swap device with reference count or lo= cks. - * Return: Returns the folio if allocation succeeded and folio is added to - * swap cache. Returns error code if allocation failed due to race. 
- */ -struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx) -{ - int ret; - struct folio *folio; - - /* Allocate a new folio to be added into the swap cache. */ - folio =3D folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); - if (!folio) - return ERR_PTR(-ENOMEM); - - /* - * Try add the new folio, it returns NULL if already exist, - * since folio is order 0. - */ - ret =3D __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); - if (ret) { - folio_put(folio); - return ERR_PTR(ret); - } - - return folio; -} - static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp, struct mempolicy *mpol, pgoff_t ilx, struct swap_iocb **plug, bool readahead) @@ -559,7 +650,7 @@ static struct folio *swap_cache_read_folio(swp_entry_t = entry, gfp_t gfp, if (folio) return folio; =20 - folio =3D swap_cache_alloc_folio(entry, gfp, mpol, ilx); + folio =3D swap_cache_alloc_folio(entry, gfp, 0, NULL, mpol, ilx); } while (PTR_ERR(folio) =3D=3D -EEXIST); =20 if (IS_ERR_OR_NULL(folio)) diff --git a/mm/zswap.c b/mm/zswap.c index f3aa83a99636..5d83539a8bba 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1001,7 +1001,7 @@ static int zswap_writeback_entry(struct zswap_entry *= entry, return -EEXIST; =20 mpol =3D get_task_policy(current); - folio =3D swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol, + folio =3D swap_cache_alloc_folio(swpentry, GFP_KERNEL, 0, NULL, mpol, NO_INTERLEAVE_INDEX); put_swap_device(si); =20 --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A0D29324B1F; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; 
t=1771544527; cv=none; b=Z5iYAISCH5rWWoiOEMGSNnBPHYZefIsiMPh6kxUZIsbheuzZ6LIgw/eo284v9QBpFLsmiAXS+Va6gyTJg5bRBfXGjeSWR32MkukDK7syzOd/yLHireEh8zN+anchzLyeze4bhfzYMRwKTFkhWYDHL5oAjIciHQIlyFA0DvkpOlg= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; c=relaxed/simple; bh=pjJUa1fFgyrsyPyrCrROjkC/rrvPsGfIR4mvGiZIF7g=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=VMmP99GKwcZ/5Na2JqsLN6KHWwVsgZmPEYn5WJV9vcBgz95nBBQmV6dRBfxV97pZRxWqzsRrOavDSAPjmh4sryLTnS2K9fgbDr4xcAgCWTpkW/NaNDNY7RZHAYloAwR5GEq3rGb7anAJNp7i+vjelL8wNnK49ab9oJHT/HxeNmk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=TS1V7COW; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="TS1V7COW" Received: by smtp.kernel.org (Postfix) with ESMTPS id 7D5EDC2BC86; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544527; bh=pjJUa1fFgyrsyPyrCrROjkC/rrvPsGfIR4mvGiZIF7g=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=TS1V7COWhmNelMcnOj04+7XD92TCDA7yhx9HvwDuofIcEQ80TMyO6/xJAHYc87tMB AROmFVzJHpPrusJmAy+qS4PJYmU9BxcxyOxDCsm4ytscHPWKcW+aPUyZ7JRz3Pbswd LZSsS5q02J0+m8ZfDGUI7MelY8cah/qt2h9nLyqj8ZXYWGWwL/Fo57p+K4x2m2dRe7 RupvSECyeo4u8Kyybzsw774xgasmiUdfhubbCbh+q6TsabPZyDK6492mK+p0PdCh6E ZZmvhs6pb5B8GlT+MkDiADIbICYSBjz1g18XdjLubPF4D1w3Vd/+uCmA/zFM5/eF5E GQsSnnMG8pXrw== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 71208C531EA; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:06 +0800 Subject: [PATCH RFC 05/15] mm, swap: unify large folio allocation Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org 
List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-5-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=19505; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=zaL8E2c3BSd0dUGVRVd03OjIP9BByn+whCdDpg5s+H4=; b=uf0UfWfusmCfdSvsmKFiWhO5/Si66GCuWlnzS5Feoymfb9ZlBpLHWNuzNs+2rzeoeNbJFkp0+ KY+1cqqyWbsBVWoCMt1omqzZJjKfy7VzEN6mMusRquUPs/Jq7ZDBLB4 X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song Now the large order allocation is supported in swap cache, making both anon and shmem use this instead of implementing their own different method for doing so. 
Signed-off-by: Kairui Song --- mm/memory.c | 77 +++++--------------------- mm/shmem.c | 94 ++++++++------------------------ mm/swap.h | 30 ++--------- mm/swap_state.c | 163 ++++++++++++----------------------------------------= ---- mm/swapfile.c | 3 +- 5 files changed, 76 insertions(+), 291 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 21bf2517fbce..e58f976508b3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4520,26 +4520,6 @@ static vm_fault_t handle_pte_marker(struct vm_fault = *vmf) return VM_FAULT_SIGBUS; } =20 -static struct folio *__alloc_swap_folio(struct vm_fault *vmf) -{ - struct vm_area_struct *vma =3D vmf->vma; - struct folio *folio; - softleaf_t entry; - - folio =3D vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); - if (!folio) - return NULL; - - entry =3D softleaf_from_pte(vmf->orig_pte); - if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, - GFP_KERNEL, entry)) { - folio_put(folio); - return NULL; - } - - return folio; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * Check if the PTEs within a range are contiguous swap entries @@ -4569,8 +4549,6 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_= t *ptep, int nr_pages) */ if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) !=3D nr_pages)) return false; - if (unlikely(non_swapcache_batch(entry, nr_pages) !=3D nr_pages)) - return false; =20 return true; } @@ -4598,16 +4576,14 @@ static inline unsigned long thp_swap_suitable_order= s(pgoff_t swp_offset, return orders; } =20 -static struct folio *alloc_swap_folio(struct vm_fault *vmf) +static unsigned long thp_swapin_suiltable_orders(struct vm_fault *vmf) { struct vm_area_struct *vma =3D vmf->vma; unsigned long orders; - struct folio *folio; unsigned long addr; softleaf_t entry; spinlock_t *ptl; pte_t *pte; - gfp_t gfp; int order; =20 /* @@ -4615,7 +4591,7 @@ static struct folio *alloc_swap_folio(struct vm_fault= *vmf) * maintain the uffd semantics. 
*/ if (unlikely(userfaultfd_armed(vma))) - goto fallback; + return 0; =20 /* * A large swapped out folio could be partially or fully in zswap. We @@ -4623,7 +4599,7 @@ static struct folio *alloc_swap_folio(struct vm_fault= *vmf) * folio. */ if (!zswap_never_enabled()) - goto fallback; + return 0; =20 entry =3D softleaf_from_pte(vmf->orig_pte); /* @@ -4637,12 +4613,12 @@ static struct folio *alloc_swap_folio(struct vm_fau= lt *vmf) vmf->address, orders); =20 if (!orders) - goto fallback; + return 0; =20 pte =3D pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl); if (unlikely(!pte)) - goto fallback; + return 0; =20 /* * For do_swap_page, find the highest order where the aligned range is @@ -4658,29 +4634,12 @@ static struct folio *alloc_swap_folio(struct vm_fau= lt *vmf) =20 pte_unmap_unlock(pte, ptl); =20 - /* Try allocating the highest of the remaining orders. */ - gfp =3D vma_thp_gfp_mask(vma); - while (orders) { - addr =3D ALIGN_DOWN(vmf->address, PAGE_SIZE << order); - folio =3D vma_alloc_folio(gfp, order, vma, addr); - if (folio) { - if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, - gfp, entry)) - return folio; - count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); - folio_put(folio); - } - count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); - order =3D next_order(&orders, order); - } - -fallback: - return __alloc_swap_folio(vmf); + return orders; } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static struct folio *alloc_swap_folio(struct vm_fault *vmf) +static unsigned long thp_swapin_suiltable_orders(struct vm_fault *vmf) { - return __alloc_swap_folio(vmf); + return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ =20 @@ -4785,21 +4744,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (folio) swap_update_readahead(folio, vma, vmf->address); if (!folio) { - if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { - folio =3D alloc_swap_folio(vmf); - if (folio) { - /* - * folio is charged, so swapin can only fail due - * to raced 
swapin and return NULL. - */ - swapcache =3D swapin_folio(entry, folio); - if (swapcache !=3D folio) - folio_put(folio); - folio =3D swapcache; - } - } else { + /* Swapin bypass readahead for SWP_SYNCHRONOUS_IO devices */ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) + folio =3D swapin_entry(entry, GFP_HIGHUSER_MOVABLE, + thp_swapin_suiltable_orders(vmf), + vmf, NULL, 0); + else folio =3D swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - } =20 if (!folio) { /* diff --git a/mm/shmem.c b/mm/shmem.c index 9f054b5aae8e..0a19ac82ec77 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -159,7 +159,7 @@ static unsigned long shmem_default_max_inodes(void) =20 static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, vm_fault_t *fault_type); + struct vm_fault *vmf, vm_fault_t *fault_type); =20 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -2014,68 +2014,24 @@ static struct folio *shmem_alloc_and_add_folio(stru= ct vm_fault *vmf, } =20 static struct folio *shmem_swap_alloc_folio(struct inode *inode, - struct vm_area_struct *vma, pgoff_t index, + struct vm_fault *vmf, pgoff_t index, swp_entry_t entry, int order, gfp_t gfp) { + pgoff_t ilx; + struct folio *folio; + struct mempolicy *mpol; + unsigned long orders =3D BIT(order); struct shmem_inode_info *info =3D SHMEM_I(inode); - struct folio *new, *swapcache; - int nr_pages =3D 1 << order; - gfp_t alloc_gfp =3D gfp; - - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - if (WARN_ON_ONCE(order)) - return ERR_PTR(-EINVAL); - } else if (order) { - /* - * If uffd is active for the vma, we need per-page fault - * fidelity to maintain the uffd semantics, then fallback - * to swapin order-0 folio, as well as for zswap case. - * Any existing sub folio in the swap cache also blocks - * mTHP swapin. 
- */ - if ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled() || - non_swapcache_batch(entry, nr_pages) !=3D nr_pages) - goto fallback; =20 - alloc_gfp =3D thp_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); - } -retry: - new =3D shmem_alloc_folio(alloc_gfp, order, info, index); - if (!new) { - new =3D ERR_PTR(-ENOMEM); - goto fallback; - } + if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) || + !zswap_never_enabled()) + orders =3D 0; =20 - if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, - alloc_gfp, entry)) { - folio_put(new); - new =3D ERR_PTR(-ENOMEM); - goto fallback; - } + mpol =3D shmem_get_pgoff_policy(info, index, order, &ilx); + folio =3D swapin_entry(entry, gfp, orders, vmf, mpol, ilx); + mpol_cond_put(mpol); =20 - swapcache =3D swapin_folio(entry, new); - if (swapcache !=3D new) { - folio_put(new); - if (!swapcache) { - /* - * The new folio is charged already, swapin can - * only fail due to another raced swapin. - */ - new =3D ERR_PTR(-EEXIST); - goto fallback; - } - } - return swapcache; -fallback: - /* Order 0 swapin failed, nothing to fallback to, abort */ - if (!order) - return new; - entry.val +=3D index - round_down(index, nr_pages); - alloc_gfp =3D gfp; - nr_pages =3D 1; - order =3D 0; - goto retry; + return folio; } =20 /* @@ -2262,11 +2218,12 @@ static int shmem_split_large_entry(struct inode *in= ode, pgoff_t index, */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, + gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct address_space *mapping =3D inode->i_mapping; - struct mm_struct *fault_mm =3D vma ? vma->vm_mm : NULL; + struct vm_area_struct *vma =3D vmf ? vmf->vma : NULL; + struct mm_struct *fault_mm =3D vmf ? 
vmf->vma->vm_mm : NULL; struct shmem_inode_info *info =3D SHMEM_I(inode); swp_entry_t swap; softleaf_t index_entry; @@ -2307,20 +2264,15 @@ static int shmem_swapin_folio(struct inode *inode, = pgoff_t index, if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ - folio =3D shmem_swap_alloc_folio(inode, vma, index, - index_entry, order, gfp); - if (IS_ERR(folio)) { - error =3D PTR_ERR(folio); - folio =3D NULL; - goto failed; - } + folio =3D shmem_swap_alloc_folio(inode, vmf, index, + swap, order, gfp); } else { /* Cached swapin only supports order 0 folio */ folio =3D shmem_swapin_cluster(swap, gfp, info, index); - if (!folio) { - error =3D -ENOMEM; - goto failed; - } + } + if (!folio) { + error =3D -ENOMEM; + goto failed; } if (fault_type) { *fault_type |=3D VM_FAULT_MAJOR; @@ -2468,7 +2420,7 @@ static int shmem_get_folio_gfp(struct inode *inode, p= goff_t index, =20 if (xa_is_value(folio)) { error =3D shmem_swapin_folio(inode, index, &folio, - sgp, gfp, vma, fault_type); + sgp, gfp, vmf, fault_type); if (error =3D=3D -EEXIST) goto repeat; =20 diff --git a/mm/swap.h b/mm/swap.h index 6774af10a943..80c2f1bf7a57 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -300,7 +300,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry,= gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); -struct folio *swapin_folio(swp_entry_t entry, struct folio *folio); +struct folio *swapin_entry(swp_entry_t entry, gfp_t flag, unsigned long or= ders, + struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx); void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); =20 @@ -334,24 +335,6 @@ static inline int swap_zeromap_batch(swp_entry_t entry= , int max_nr, return find_next_bit(sis->zeromap, end, start) - start; } =20 -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - int i; - - /* - 
* While allocating a large folio and doing mTHP swapin, we need to - * ensure all entries are not cached, otherwise, the mTHP folio will - * be in conflict with the folio in swap cache. - */ - for (i =3D 0; i < max_nr; i++) { - if (swap_cache_has_folio(entry)) - return i; - entry.val++; - } - - return i; -} - #else /* CONFIG_SWAP */ struct swap_iocb; static inline struct swap_cluster_info *swap_cluster_lock( @@ -433,7 +416,9 @@ static inline struct folio *swapin_readahead(swp_entry_= t swp, gfp_t gfp_mask, return NULL; } =20 -static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *= folio) +static inline struct folio *swapin_entry( + swp_entry_t entry, gfp_t flag, unsigned long orders, + struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx) { return NULL; } @@ -493,10 +478,5 @@ static inline int swap_zeromap_batch(swp_entry_t entry= , int max_nr, { return 0; } - -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - return 0; -} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index e32b06a1f229..0a2a4e084cf2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -199,43 +199,6 @@ void __swap_cache_add_folio(struct swap_cluster_info *= ci, lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); } =20 -/** - * swap_cache_add_folio - Add a folio into the swap cache. - * @folio: The folio to be added. - * @entry: The swap entry corresponding to the folio. - * @gfp: gfp_mask for XArray node allocation. - * @shadowp: If a shadow is found, return the shadow. - * - * Context: Caller must ensure @entry is valid and protect the swap device - * with reference count or locks. 
- */ -static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - void **shadowp) -{ - int err; - void *shadow =3D NULL; - unsigned int ci_off; - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long nr_pages =3D folio_nr_pages(folio); - - si =3D __swap_entry_to_info(entry); - ci =3D swap_cluster_lock(si, swp_offset(entry)); - ci_off =3D swp_cluster_offset(entry); - err =3D __swap_cache_check_batch(ci, ci_off, ci_off, nr_pages, &shadow); - if (err) { - swap_cluster_unlock(ci); - return err; - } - - __swap_cache_add_folio(ci, folio, entry); - swap_cluster_unlock(ci); - if (shadowp) - *shadowp =3D shadow; - - return 0; -} - static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, swp_entry_t targ_entry, gfp_t gfp, unsigned int order, struct vm_fault *vmf, @@ -328,30 +291,28 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ= _entry, gfp_t gfp_mask, unsigned long orders, struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx) { - int order; + int order, err; struct folio *folio; struct swap_cluster_info *ci; =20 + /* Always allow order 0 so swap won't fail under pressure. */ + order =3D orders ? highest_order(orders |=3D BIT(0)) : 0; ci =3D __swap_entry_to_cluster(targ_entry); - order =3D orders ? highest_order(orders) : 0; for (;;) { folio =3D __swap_cache_alloc(ci, targ_entry, gfp_mask, order, vmf, mpol, ilx); if (!IS_ERR(folio)) return folio; - if (PTR_ERR(folio) =3D=3D -EAGAIN) + err =3D PTR_ERR(folio); + if (err =3D=3D -EAGAIN) continue; - /* Only -EBUSY means we should fallback and retry. */ - if (PTR_ERR(folio) !=3D -EBUSY) - return folio; + if (!order || (err !=3D -EBUSY && err !=3D -ENOMEM)) + break; count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order =3D next_order(&orders, order); - if (!orders) - break; } - /* Should never reach here, order 0 should not fail with -EBUSY. 
*/ - WARN_ON_ONCE(1); - return ERR_PTR(-EINVAL); + + return ERR_PTR(err); } =20 /** @@ -584,51 +545,6 @@ void swap_update_readahead(struct folio *folio, struct= vm_area_struct *vma, } } =20 -/** - * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cac= he. - * @entry: swap entry to be bound to the folio. - * @folio: folio to be added. - * @gfp: memory allocation flags for charge, can be 0 if @charged if true. - * @charged: if the folio is already charged. - * - * Update the swap_map and add folio as swap cache, typically before swapi= n. - * All swap slots covered by the folio must have a non-zero swap count. - * - * Context: Caller must protect the swap device with reference count or lo= cks. - * Return: 0 if success, error code if failed. - */ -static int __swap_cache_prepare_and_add(swp_entry_t entry, - struct folio *folio, - gfp_t gfp, bool charged) -{ - void *shadow; - int ret; - - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - ret =3D swap_cache_add_folio(folio, entry, &shadow); - if (ret) - goto failed; - - if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { - swap_cache_del_folio(folio); - ret =3D -ENOMEM; - goto failed; - } - - memcg1_swapin(entry, folio_nr_pages(folio)); - if (shadow) - workingset_refault(folio, shadow); - - /* Caller will initiate read into locked folio */ - folio_add_lru(folio); - return 0; - -failed: - folio_unlock(folio); - return ret; -} - static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp, struct mempolicy *mpol, pgoff_t ilx, struct swap_iocb **plug, bool readahead) @@ -649,7 +565,6 @@ static struct folio *swap_cache_read_folio(swp_entry_t = entry, gfp_t gfp, folio =3D swap_cache_get_folio(entry); if (folio) return folio; - folio =3D swap_cache_alloc_folio(entry, gfp, 0, NULL, mpol, ilx); } while (PTR_ERR(folio) =3D=3D -EEXIST); =20 @@ -666,49 +581,37 @@ static struct folio *swap_cache_read_folio(swp_entry_= t entry, gfp_t gfp, } =20 /** - * swapin_folio - 
swap-in one or multiple entries skipping readahead. - * @entry: starting swap entry to swap in - * @folio: a new allocated and charged folio + * swapin_entry - swap-in one or multiple entries skipping readahead. + * @entry: swap entry indicating the target slot + * @gfp: memory allocation flags + * @orders: allocation orders + * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * - * Reads @entry into @folio, @folio will be added to the swap cache. - * If @folio is a large folio, the @entry will be rounded down to align - * with the folio size. + * This allocates a folio suitable for the given @orders, or returns the existing + * folio in the swap cache for @entry. This initiates the IO, too, if need= ed. + * @entry could be rounded down if @orders allows large allocation. * - * Return: returns pointer to @folio on success. If folio is a large folio - * and this raced with another swapin, NULL will be returned to allow fall= back - * to order 0. Else, if another folio was already added to the swap cache, - * return that swap cache folio instead. + * Context: Caller must ensure @entry is valid and pin the swap device wit= h refcount. + * Return: Returns the folio on success, or NULL on failure. 
*/ -struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) +struct folio *swapin_entry(swp_entry_t entry, gfp_t gfp, unsigned long ord= ers, + struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx) { - int ret; - struct folio *swapcache; - pgoff_t offset =3D swp_offset(entry); - unsigned long nr_pages =3D folio_nr_pages(folio); - - entry =3D swp_entry(swp_type(entry), round_down(offset, nr_pages)); - for (;;) { - ret =3D __swap_cache_prepare_and_add(entry, folio, 0, true); - if (!ret) { - swap_read_folio(folio, NULL); - break; - } + struct folio *folio; =20 - /* - * Large order allocation needs special handling on - * race: if a smaller folio exists in cache, swapin needs - * to fallback to order 0, and doing a swap cache lookup - * might return a folio that is irrelevant to the faulting - * entry because @entry is aligned down. Just return NULL. - */ - if (ret !=3D -EEXIST || nr_pages > 1) - return NULL; + do { + folio =3D swap_cache_get_folio(entry); + if (folio) + return folio; + folio =3D swap_cache_alloc_folio(entry, gfp, orders, vmf, mpol, ilx); + } while (PTR_ERR(folio) =3D=3D -EEXIST); =20 - swapcache =3D swap_cache_get_folio(entry); - if (swapcache) - return swapcache; - } + if (IS_ERR(folio)) + return NULL; =20 + swap_read_folio(folio, NULL); return folio; } =20 diff --git a/mm/swapfile.c b/mm/swapfile.c index 06b37efad2bd..7e7614a5181a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1833,8 +1833,7 @@ void folio_put_swap(struct folio *folio, struct page = *subpage) * do_swap_page() * ... 
swapoff+swapon * swap_cache_alloc_folio() - * swap_cache_add_folio() - * // check swap_map + * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BD52E32AAC0; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; cv=none; b=TRDuFQhnHJdywL6vQ5ALISefz0sMrX/0Bhuh5o4P2hdgRD6bBUI+6AUS4+Ud5Ta8vymmzkyufgMNiyurObKfR5TqTC1oBYWMW45p/yAnBtpNavkRpBNhhubyuBbmuVAHXuZLnGhM76hCHd28EBxIGIg7ze6mQzFVJlT7tULKr54= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; c=relaxed/simple; bh=4iTCLXI0G718VK6Ca+BP/6nuHLoyDqG29VXYYyUPsLE=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=BP8ctD0eV0UieUlz4GGUiUuLem65ScuefSYF7JuzguqjgJFioI4z7Q/8ff+CT6GH68Bjd5TtB4aaYXkkqmJ9BIc/qm+lx83I+Nd+BpBXxzostQFpdlYQlVIUYSbUkCsu17X/iEct+EsGG+ve8dEFZr7bGP38w4gP7oUxPE7wx70= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=bFjDBJol; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="bFjDBJol" Received: by smtp.kernel.org (Postfix) with ESMTPS id 9E549C4CEF7; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544527; bh=4iTCLXI0G718VK6Ca+BP/6nuHLoyDqG29VXYYyUPsLE=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; 
b=bFjDBJollDR0O+0bka0eoDtbc6D06nyu9hdmjQGBOI6slfe50chmcxflzdT62fvv4 nKNTfKzaGo61ba8mDZfbmS/lJLuIkGRm64M8PIX2QvdWYEuHBqNvxqXskSBQTgFLlh KQHvUq5ZT4yu/xvVa8/721Q76mPVkzCopbjG8xrIX+gRNhS7NXpxsCckjA73uTlfoZ G7eCGkbROvGJ7gKkF89eg0e4yuk9OwrDXx7jA6Zm+DRHRgOuQSmzqhxHztvtQ3FQT5 GrThfv7MrpJTrbJ4/N8dppdT3KiONe5MMV9ApYiYY9j8oaw51wiIQlqNlChkRofvla juYBVUtREH9Rg== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 92EFBC531EB; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:07 +0800 Subject: [PATCH RFC 06/15] memcg, swap: reparent the swap entry on swapin if swapout cgroup is dead Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-6-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=5359; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=9vO+dDP6+je7+jClnWWZNq1Qut+lGRZYCcUUD2nWDoQ=; b=IJcUr2SA72vqNqyIF134yy4jJ7YYtmhGsNrcSmy0Q9+wfm8Xie+xv3UHp/3cAaYgf2KVDPluh fEz1zi/4rLRAHAdpMH0ccI3u2DKM8Z0eLSBfg87HH3CD83K36kEjcv1 X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for 
kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song As a result this will always charge the swapin folio into the dead cgroup's parent cgroup, and ensure folio->swap belongs to folio_memcg. This only affects some uncommon behavior when a process is moved between memcgs. When a process that previously swapped some memory is moved to another cgroup, and the cgroup where the swap occurred is dead, folios swapped in from old swap entries will be charged into the new cgroup. Combined with the lazy freeing of swap cache, this leads to a strange situation where the folio->swap entry belongs to a cgroup that is not folio->memcg. Swapin from a dead (zombie) memcg should be rare in practice: a cgroup is offlined only after the workload in it is gone, which requires zapping the page tables first, and that releases all swap entries. Shmem is a bit different, but shmem always has swap count =3D=3D 1, and force releases the swap cache. So, for shmem, charging into the new memcg and releasing the entry does look more sensible. However, to make things easier to understand for an RFC, let's just always charge to the parent cgroup if the leaf cgroup is dead. This may not be the best design, but it makes the following work much easier to demonstrate. For a better solution, we can later: - Dynamically allocate a swap cluster trampoline cgroup table (ci->memcg_table) and use that for zombie swapin only. This is actually OK and may not cause a mess at the code level, since the incoming swap table compaction will require table expansion on swap-in as well. - Just tolerate a 2-byte per slot overhead all the time, which is also acceptable. - Limit the charge-to-parent behavior to only one situation: entries whose swap count > 2 and whose process was migrated to another cgroup after swapout. This is even rarer in practice, I think. 
For reference, the memory ownership model of cgroup v2: """ A memory area is charged to the cgroup which instantiated it and stays charged to the cgroup until the area is released. Migrating a process to a different cgroup doesn't move the memory usages that it instantiated while in the previous cgroup to the new cgroup. A memory area may be used by processes belonging to different cgroups. To which cgroup the area will be charged is in-deterministic; however, over time, the memory area is likely to end up in a cgroup which has enough memory allowance to avoid high reclaim pressure. If a cgroup sweeps a considerable amount of memory which is expected to be accessed repeatedly by other cgroups, it may make sense to use POSIX_FADV_DONTNEED to relinquish the ownership of memory areas belonging to the affected files to ensure correct memory ownership. """ So I think all of the solutions mentioned above, including this commit, are not wrong. Signed-off-by: Kairui Song --- mm/memcontrol.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 73f622f7a72b..b2898719e935 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4803,22 +4803,67 @@ int mem_cgroup_charge_hugetlb(struct folio *folio, = gfp_t gfp) int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *= mm, gfp_t gfp, swp_entry_t entry) { - struct mem_cgroup *memcg; - unsigned short id; + struct mem_cgroup *memcg, *swap_memcg; + unsigned short id, parent_id; + unsigned int nr_pages; int ret; =20 if (mem_cgroup_disabled()) return 0; =20 id =3D lookup_swap_cgroup_id(entry); + nr_pages =3D folio_nr_pages(folio); + rcu_read_lock(); - memcg =3D mem_cgroup_from_private_id(id); - if (!memcg || !css_tryget_online(&memcg->css)) + swap_memcg =3D mem_cgroup_from_private_id(id); + if (!swap_memcg) { + WARN_ON_ONCE(id); memcg =3D get_mem_cgroup_from_mm(mm); + } else { + memcg =3D swap_memcg; + /* Find 
the nearest online ancestor if dead, for reparent */ + while (!css_tryget_online(&memcg->css)) + memcg =3D parent_mem_cgroup(memcg); + } rcu_read_unlock(); =20 ret =3D charge_memcg(folio, memcg, gfp); + if (ret) + goto out; + + /* + * If the swap entry's memcg is dead, reparent the swap charge + * from swap_memcg to memcg. + * + * If memcg is also being offlined, the charge will be moved to + * its parent again. + */ + if (swap_memcg && memcg !=3D swap_memcg) { + struct mem_cgroup *parent_memcg; =20 + parent_memcg =3D mem_cgroup_private_id_get_online(memcg, nr_pages); + parent_id =3D mem_cgroup_private_id(parent_memcg); + + WARN_ON(id !=3D swap_cgroup_clear(entry, nr_pages)); + swap_cgroup_record(folio, parent_id, entry); + + if (do_memsw_account()) { + if (!mem_cgroup_is_root(parent_memcg)) + page_counter_charge(&parent_memcg->memsw, nr_pages); + page_counter_uncharge(&swap_memcg->memsw, nr_pages); + } else { + if (!mem_cgroup_is_root(parent_memcg)) + page_counter_charge(&parent_memcg->swap, nr_pages); + page_counter_uncharge(&swap_memcg->swap, nr_pages); + } + + mod_memcg_state(parent_memcg, MEMCG_SWAP, nr_pages); + mod_memcg_state(swap_memcg, MEMCG_SWAP, -nr_pages); + + /* Release the dead cgroup after reparent */ + mem_cgroup_private_id_put(swap_memcg, nr_pages); + } +out: css_put(&memcg->css); return ret; } --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id DBF4132BF32; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; cv=none; 
b=QLjUxgAJUrB8O5XF7UUtGK9lp2Rg7VwKOScuqx90/a2wK5dVyfAB3inJA8QmVRnsVAYh0wcvnFyitavYXYK3LytKlayYhDpCzVJNmH51KlnlhBADUCm68Le1jarzottJqQyR0RyI+XA2cTBRM3HIWXSHDAoGubix0kgqIk9gzDM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544527; c=relaxed/simple; bh=IyPbPri7CX9GB9ElQqKtYupl/DO1oV0ntplpsuoO26M=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=laH0IPZ/2neamSWgyvoVTePbBbkLrx9Kf3ADDqYLuPrH+aftkUywtjKnD+ofxnEPox8oApOCxC3iwdM+OXMSMs6do0sefW9lgSbWQtnmUkRLHlYjjvDHHhb4Z3A17QC0Sxx86NemTr4/5zZijPC5y2izoy2jWrSmQg3QRz3WxuE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=rGlOZxrj; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="rGlOZxrj" Received: by smtp.kernel.org (Postfix) with ESMTPS id BBD16C2BC9E; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544527; bh=IyPbPri7CX9GB9ElQqKtYupl/DO1oV0ntplpsuoO26M=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=rGlOZxrjUUbEbocvIWQJL5OUnMKQ82iiddmBFy7bqyEsx2vHK6VcxkxgoIxPW6Tq4 Hef4l34HceK3MkuSYA+62O3ejlrl1CXpHLq0TlTWuLctL2s7vlpZJZFDAD8WHMLeJy 4trW5/RJYMIHylk0U6rnohnL5AhOIpCzmIcAoGfujAnNPB7KCCsfipuvmSTbC53DUx eZFjpy5zu3Zd5t0x6ZJ2QJ167AKPT+Lg4/505fUlAYyw1YEft+h69gMhY3uKa/VRFb +svFQj7PNnDqhTsWiw4+8M/pqWEPBbF9e8abEhUK2vtSqlhwNoVfk4/+GkpUdPXIdT Q0O0CA0Epi52Q== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id B0974C531EC; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:08 +0800 Subject: [PATCH RFC 07/15] memcg, swap: defer the recording of memcg info and reparent flexibly Precedence: bulk X-Mailing-List: 
linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-7-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=21986; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=N4eG7UwpOV73KkZjg7SfKoYbB1/U5wsQN6FM7xZMnN0=; b=L/TS1LIkjpq+Vev1o2AIE5orrNSx1hnsyAXzgwsEd0IOlMphL9qo415nR5XllR+SY+cty9Op7 OolC2HsaQP5Bo/NuKjTGkfgpmh8T1WZtjS9xwmynQELKXWIbAPfpwKr X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song To make sure folio->swap always belongs to folio->memcg, when doing the charge, charge against folio->memcg. Defer the recording of swap cgroup info, do a reparent, and record the nearest online ancestor on swap cache removal only. Then, while a folio is in the swap cache, the folio itself is owned by the memcg; hence, through the folio, the memcg also owns folio->swap. The extra pinning of the swap cgroup info record is not needed and can be released. This should be fine for both cgroup v2 and v1. There should be no userspace-observable behavior change. 
Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 8 ++-- include/linux/swap.h | 24 +++++++++-- mm/memcontrol-v1.c | 77 ++++++++++++++++----------------- mm/memcontrol.c | 104 ++++++++++++++++++++++++++++++++---------= ---- mm/swap.h | 6 ++- mm/swap_cgroup.c | 5 +-- mm/swap_state.c | 26 +++++++++--- mm/swapfile.c | 15 +++++-- mm/vmscan.c | 3 +- 9 files changed, 173 insertions(+), 95 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 70b685a85bf4..0b37d4faf785 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1896,8 +1896,8 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault =3D 0; } =20 -void memcg1_swapout(struct folio *folio, swp_entry_t entry); -void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages); +void memcg1_swapout(struct folio *folio, struct mem_cgroup *swap_memcg); +void memcg1_swapin(struct folio *folio); =20 #else /* CONFIG_MEMCG_V1 */ static inline @@ -1926,11 +1926,11 @@ static inline void mem_cgroup_exit_user_fault(void) { } =20 -static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry) +static inline void memcg1_swapout(struct folio *folio, struct mem_cgroup *= _memcg) { } =20 -static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +static inline void memcg1_swapin(struct folio *folio) { } =20 diff --git a/include/linux/swap.h b/include/linux/swap.h index 0effe3cc50f5..66cf657a1f35 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -580,12 +580,22 @@ static inline int mem_cgroup_try_charge_swap(struct f= olio *folio, return __mem_cgroup_try_charge_swap(folio, entry); } =20 -extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_= pages); -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned in= t nr_pages) +extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_= pages); +static inline void mem_cgroup_uncharge_swap(unsigned short id, 
unsigned in= t nr_pages) { if (mem_cgroup_disabled()) return; - __mem_cgroup_uncharge_swap(entry, nr_pages); + __mem_cgroup_uncharge_swap(id, nr_pages); +} + +struct mem_cgroup *__mem_cgroup_swap_free_folio(struct folio *folio, + bool reclaim); +static inline struct mem_cgroup *mem_cgroup_swap_free_folio(struct folio *= folio, + bool reclaim) +{ + if (mem_cgroup_disabled()) + return NULL; + return __mem_cgroup_swap_free_folio(folio, reclaim); } =20 extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); @@ -597,11 +607,17 @@ static inline int mem_cgroup_try_charge_swap(struct f= olio *folio, return 0; } =20 -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, +static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages) { } =20 +static inline struct mem_cgroup *mem_cgroup_swap_free_folio(struct folio *= folio, + bool reclaim) +{ + return NULL; +} + static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { return get_nr_swap_pages(); diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index a7c78b0987df..038e630dc7e1 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -606,29 +606,21 @@ void memcg1_commit_charge(struct folio *folio, struct= mem_cgroup *memcg) /** * memcg1_swapout - transfer a memsw charge to swap * @folio: folio whose memsw charge to transfer - * @entry: swap entry to move the charge to - * - * Transfer the memsw charge of @folio to @entry. + * @swap_memcg: cgroup that will be charged, must be online ancestor + * of folio's memcg. 
*/ -void memcg1_swapout(struct folio *folio, swp_entry_t entry) +void memcg1_swapout(struct folio *folio, struct mem_cgroup *swap_memcg) { - struct mem_cgroup *memcg, *swap_memcg; + struct mem_cgroup *memcg; unsigned int nr_entries; + unsigned long flags; =20 - VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); - - if (mem_cgroup_disabled()) - return; - - if (!do_memsw_account()) - return; + /* The folio must be getting reclaimed. */ + VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); =20 memcg =3D folio_memcg(folio); =20 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) - return; =20 /* * In case the memcg owning these pages has been offlined and doesn't @@ -636,14 +628,15 @@ void memcg1_swapout(struct folio *folio, swp_entry_t = entry) * ancestor for the swap instead and transfer the memory+swap charge. */ nr_entries =3D folio_nr_pages(folio); - swap_memcg =3D mem_cgroup_private_id_get_online(memcg, nr_entries); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); =20 - swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry); - folio_unqueue_deferred_split(folio); - folio->memcg_data =3D 0; =20 + /* + * Free the folio charge now so memsw won't be double uncharged: + * memsw is now charged by the swap record. + */ + folio->memcg_data =3D 0; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); =20 @@ -653,33 +646,34 @@ void memcg1_swapout(struct folio *folio, swp_entry_t = entry) page_counter_uncharge(&memcg->memsw, nr_entries); } =20 - /* - * Interrupts should be disabled here because the caller holds the - * i_pages lock which is taken with interrupts-off. It is - * important here to have the interrupts disabled because it is the - * only synchronisation we have for updating the per-CPU variables. 
- */ + local_irq_save(flags); preempt_disable_nested(); - VM_WARN_ON_IRQS_ENABLED(); memcg1_charge_statistics(memcg, -folio_nr_pages(folio)); preempt_enable_nested(); + local_irq_restore(flags); memcg1_check_events(memcg, folio_nid(folio)); =20 css_put(&memcg->css); } =20 /* - * memcg1_swapin - uncharge swap slot - * @entry: the first swap entry for which the pages are charged - * @nr_pages: number of pages which will be uncharged + * memcg1_swapin - uncharge memsw for the swap slot on swapin + * @folio: the folio being swapped in, already charged to memory * * Call this function after successfully adding the charged page to swapca= che. - * - * Note: This function assumes the page for which swap slot is being uncha= rged - * is order 0 page. + * The swap cgroup tracking has already been released by + * mem_cgroup_swapin_charge_folio(), so we only need to drop the duplicate + * memsw charge that was placed on the swap entry during swapout. */ -void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +void memcg1_swapin(struct folio *folio) { + struct mem_cgroup *memcg; + unsigned int nr_pages; + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_memcg_charged(folio), folio); + /* * Cgroup1's unified memory+swap counter has been charged with the * new swapcache page, finish the transfer by uncharging the swap @@ -692,14 +686,15 @@ void memcg1_swapin(swp_entry_t entry, unsigned int nr= _pages) * correspond 1:1 to page and swap slot lifetimes: we charge the * page to memory here, and uncharge swap when the slot is freed. */ - if (do_memsw_account()) { - /* - * The swap entry might not get freed for a long time, - * let's not wait for it. The page already received a - * memory+swap charge, drop the swap entry duplicate. 
- */ - mem_cgroup_uncharge_swap(entry, nr_pages); - } + if (!do_memsw_account()) + return; + + memcg =3D folio_memcg(folio); + nr_pages =3D folio_nr_pages(folio); + + if (!mem_cgroup_is_root(memcg)) + page_counter_uncharge(&memcg->memsw, nr_pages); + mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); } =20 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2898719e935..d9ff44b77409 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4804,8 +4804,8 @@ int mem_cgroup_swapin_charge_folio(struct folio *foli= o, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; - unsigned short id, parent_id; unsigned int nr_pages; + unsigned short id; int ret; =20 if (mem_cgroup_disabled()) @@ -4831,37 +4831,31 @@ int mem_cgroup_swapin_charge_folio(struct folio *fo= lio, struct mm_struct *mm, if (ret) goto out; =20 + /* + * On successful charge, the folio itself now belongs to the memcg, + * so is folio->swap. So we can release the swap cgroup table's + * pinning of the private id. + */ + swap_cgroup_clear(folio->swap, nr_pages); + mem_cgroup_private_id_put(swap_memcg, nr_pages); + /* * If the swap entry's memcg is dead, reparent the swap charge * from swap_memcg to memcg. - * - * If memcg is also being offlined, the charge will be moved to - * its parent again. 
*/ if (swap_memcg && memcg !=3D swap_memcg) { - struct mem_cgroup *parent_memcg; - - parent_memcg =3D mem_cgroup_private_id_get_online(memcg, nr_pages); - parent_id =3D mem_cgroup_private_id(parent_memcg); - - WARN_ON(id !=3D swap_cgroup_clear(entry, nr_pages)); - swap_cgroup_record(folio, parent_id, entry); - if (do_memsw_account()) { - if (!mem_cgroup_is_root(parent_memcg)) - page_counter_charge(&parent_memcg->memsw, nr_pages); + if (!mem_cgroup_is_root(memcg)) + page_counter_charge(&memcg->memsw, nr_pages); page_counter_uncharge(&swap_memcg->memsw, nr_pages); } else { - if (!mem_cgroup_is_root(parent_memcg)) - page_counter_charge(&parent_memcg->swap, nr_pages); + if (!mem_cgroup_is_root(memcg)) + page_counter_charge(&memcg->swap, nr_pages); page_counter_uncharge(&swap_memcg->swap, nr_pages); } =20 - mod_memcg_state(parent_memcg, MEMCG_SWAP, nr_pages); + mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); mod_memcg_state(swap_memcg, MEMCG_SWAP, -nr_pages); - - /* Release the dead cgroup after reparent */ - mem_cgroup_private_id_put(swap_memcg, nr_pages); } out: css_put(&memcg->css); @@ -5260,33 +5254,32 @@ int __mem_cgroup_try_charge_swap(struct folio *foli= o, swp_entry_t entry) return 0; } =20 - memcg =3D mem_cgroup_private_id_get_online(memcg, nr_pages); - + /* + * Charge the swap counter against the folio's memcg directly. + * The private id pinning and swap cgroup recording are deferred + * to __mem_cgroup_swap_free_folio() when the folio leaves the + * swap cache. No _id_get_online here means no _id_put on error. 
+ */ if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); - mem_cgroup_private_id_put(memcg, nr_pages); return -ENOMEM; } mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); =20 - swap_cgroup_record(folio, mem_cgroup_private_id(memcg), entry); - return 0; } =20 /** * __mem_cgroup_uncharge_swap - uncharge swap space - * @entry: swap entry to uncharge + * @id: private id of the mem_cgroup to uncharge * @nr_pages: the amount of swap space to uncharge */ -void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages) { struct mem_cgroup *memcg; - unsigned short id; =20 - id =3D swap_cgroup_clear(entry, nr_pages); rcu_read_lock(); memcg =3D mem_cgroup_from_private_id(id); if (memcg) { @@ -5302,6 +5295,59 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, u= nsigned int nr_pages) rcu_read_unlock(); } =20 +/** + * __mem_cgroup_swap_free_folio - Folio is being freed from swap cache. + * @folio: folio being freed. + * @reclaim: true if the folio is being reclaimed. + * + * For cgroup V2, swap entries are charged to folio's memcg by the time + * swap allocator adds it into the swap cache by mem_cgroup_try_charge_swa= p. + * The ownership of folio->swap to folio->memcg is constrained by the folio + * in swap cache. If the folio is being removed from swap cache, the + * constraint will be gone so need to grab the memcg's private id for long + * term tracking. + * + * For cgroup V1, the memory-to-swap charge transfer is also performed on + * the folio reclaim path. + * + * It's unlikely but possible that the folio's memcg is dead, in that case + * we reparent and recharge the parent. Recorded cgroup is changed to + * parent too. + * + * Return: Pointer to the mem cgroup being pinned by the charge. 
+ */ +struct mem_cgroup *__mem_cgroup_swap_free_folio(struct folio *folio, + bool reclaim) +{ + unsigned int nr_pages =3D folio_nr_pages(folio); + struct mem_cgroup *memcg, *swap_memcg; + swp_entry_t entry =3D folio->swap; + unsigned short id; + + VM_WARN_ON_ONCE_FOLIO(!folio_memcg_charged(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + + /* + * Pin the nearest online ancestor's private id for long term + * swap cgroup tracking. If memcg is still alive, swap_memcg + * will be the same as memcg. Else, it's reparented. + */ + memcg =3D folio_memcg(folio); + swap_memcg =3D mem_cgroup_private_id_get_online(memcg, nr_pages); + id =3D mem_cgroup_private_id(swap_memcg); + swap_cgroup_record(folio, id, entry); + + if (reclaim && do_memsw_account()) { + memcg1_swapout(folio, swap_memcg); + } else if (memcg !=3D swap_memcg) { + if (!mem_cgroup_is_root(swap_memcg)) + page_counter_charge(&swap_memcg->swap, nr_pages); + page_counter_uncharge(&memcg->swap, nr_pages); + } + + return swap_memcg; +} + long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages =3D get_nr_swap_pages(); diff --git a/mm/swap.h b/mm/swap.h index 80c2f1bf7a57..da41e9cea46d 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -287,7 +287,8 @@ struct folio *swap_cache_alloc_folio(swp_entry_t target= _entry, gfp_t gfp_mask, void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry); void __swap_cache_del_folio(struct swap_cluster_info *ci, - struct folio *folio, swp_entry_t entry, void *shadow); + struct folio *folio, void *shadow, + bool charged, bool reclaim); void __swap_cache_replace_folio(struct swap_cluster_info *ci, struct folio *old, struct folio *new); =20 @@ -459,7 +460,8 @@ static inline void swap_cache_del_folio(struct folio *f= olio) } =20 static inline void __swap_cache_del_folio(struct swap_cluster_info *ci, - struct folio *folio, swp_entry_t entry, void *shadow) + struct folio *folio, void *shadow, + bool 
charged, bool reclaim) { } =20 diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index de779fed8c21..b5a7f21c3afe 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -54,8 +54,7 @@ static unsigned short __swap_cgroup_id_xchg(struct swap_c= group *map, /** * swap_cgroup_record - record mem_cgroup for a set of swap entries. * These entries must belong to one single folio, and that folio - * must be being charged for swap space (swap out), and these - * entries must not have been charged + * must be being charged for swap space (swap out). * * @folio: the folio that the swap entry belongs to * @id: mem_cgroup ID to be recorded @@ -75,7 +74,7 @@ void swap_cgroup_record(struct folio *folio, unsigned sho= rt id, =20 do { old =3D __swap_cgroup_id_xchg(map, offset, id); - VM_BUG_ON(old); + VM_WARN_ON_ONCE(old); } while (++offset !=3D end); } =20 diff --git a/mm/swap_state.c b/mm/swap_state.c index 0a2a4e084cf2..40f037576c5f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -251,7 +251,7 @@ static struct folio *__swap_cache_alloc(struct swap_clu= ster_info *ci, if (mem_cgroup_swapin_charge_folio(folio, vmf ? vmf->vma->vm_mm : NULL, gfp, entry)) { spin_lock(&ci->lock); - __swap_cache_del_folio(ci, folio, shadow); + __swap_cache_del_folio(ci, folio, shadow, false, false); spin_unlock(&ci->lock); folio_unlock(folio); folio_put(folio); @@ -260,7 +260,7 @@ static struct folio *__swap_cache_alloc(struct swap_clu= ster_info *ci, } =20 /* For memsw accouting, swap is uncharged when folio is added to swap cac= he */ - memcg1_swapin(entry, 1 << order); + memcg1_swapin(folio); if (shadow) workingset_refault(folio, shadow); =20 @@ -319,21 +319,24 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ= _entry, gfp_t gfp_mask, * __swap_cache_del_folio - Removes a folio from the swap cache. * @ci: The locked swap cluster. * @folio: The folio. - * @entry: The first swap entry that the folio corresponds to. * @shadow: shadow value to be filled in the swap cache. 
+ * @charged: If folio->swap is charged to folio->memcg. + * @reclaim: If the folio is being reclaimed. When true on cgroup v1, + * the memory charge is transferred from memory to swap. * * Removes a folio from the swap cache and fills a shadow in place. * This won't put the folio's refcount. The caller has to do that. * - * Context: Caller must ensure the folio is locked and in the swap cache - * using the index of @entry, and lock the cluster that holds the entries. + * Context: Caller must ensure the folio is locked and in the swap cache, + * and lock the cluster that holds the entries. */ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *fo= lio, - swp_entry_t entry, void *shadow) + void *shadow, bool charged, bool reclaim) { int count; unsigned long old_tb; struct swap_info_struct *si; + swp_entry_t entry =3D folio->swap; unsigned int ci_start, ci_off, ci_end; bool folio_swapped =3D false, need_free =3D false; unsigned long nr_pages =3D folio_nr_pages(folio); @@ -343,6 +346,15 @@ void __swap_cache_del_folio(struct swap_cluster_info *= ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); =20 + /* + * If the folio's swap entry is charged to its memcg, record the + * swap cgroup for long-term tracking before the folio leaves the + * swap cache. Not charged when the folio never completed memcg + * charging (e.g. swapin charge failure, or swap alloc charge failure). 
+ */ + if (charged) + mem_cgroup_swap_free_folio(folio, reclaim); + si =3D __swap_entry_to_info(entry); ci_start =3D swp_cluster_offset(entry); ci_end =3D ci_start + nr_pages; @@ -392,7 +404,7 @@ void swap_cache_del_folio(struct folio *folio) swp_entry_t entry =3D folio->swap; =20 ci =3D swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); - __swap_cache_del_folio(ci, folio, entry, NULL); + __swap_cache_del_folio(ci, folio, NULL, true, false); swap_cluster_unlock(ci); =20 folio_ref_sub(folio, folio_nr_pages(folio)); diff --git a/mm/swapfile.c b/mm/swapfile.c index 7e7614a5181a..c0169bce46c9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1703,6 +1703,7 @@ int folio_alloc_swap(struct folio *folio) { unsigned int order =3D folio_order(folio); unsigned int size =3D 1 << order; + struct swap_cluster_info *ci; =20 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); @@ -1737,8 +1738,12 @@ int folio_alloc_swap(struct folio *folio) } =20 /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. 
*/ - if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap))) - swap_cache_del_folio(folio); + if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap))) { + ci =3D swap_cluster_lock(__swap_entry_to_info(folio->swap), + swp_offset(folio->swap)); + __swap_cache_del_folio(ci, folio, NULL, false, false); + swap_cluster_unlock(ci); + } =20 if (unlikely(!folio_test_swapcache(folio))) return -ENOMEM; @@ -1879,6 +1884,7 @@ void __swap_cluster_free_entries(struct swap_info_str= uct *si, unsigned int ci_start, unsigned int nr_pages) { unsigned long old_tb; + unsigned short id; unsigned int ci_off =3D ci_start, ci_end =3D ci_start + nr_pages; unsigned long offset =3D cluster_offset(si, ci) + ci_start; =20 @@ -1892,7 +1898,10 @@ void __swap_cluster_free_entries(struct swap_info_st= ruct *si, __swap_table_set(ci, ci_off, null_to_swp_tb()); } while (++ci_off < ci_end); =20 - mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); + id =3D swap_cgroup_clear(swp_entry(si->type, offset), nr_pages); + if (id) + mem_cgroup_uncharge_swap(id, nr_pages); + swap_range_free(si, offset, nr_pages); swap_cluster_assert_empty(ci, ci_start, nr_pages, false); =20 diff --git a/mm/vmscan.c b/mm/vmscan.c index 44e4fcd6463c..5112f81cf875 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -759,8 +759,7 @@ static int __remove_mapping(struct address_space *mappi= ng, struct folio *folio, =20 if (reclaimed && !mapping_exiting(mapping)) shadow =3D workingset_eviction(folio, target_memcg); - memcg1_swapout(folio, swap); - __swap_cache_del_folio(ci, folio, swap, shadow); + __swap_cache_del_folio(ci, folio, shadow, true, true); swap_cluster_unlock_irq(ci); } else { void (*free_folio)(struct folio *); --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS 
id E950332D43C; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; cv=none; b=TxIqEtz7EOtE59GnDr0FNlNbKQMdniM+0+rM1f3VtTQkkACGkRXvwuseA6dkJtK7Jt8yqfXQMlsX8q3jYTYRU532w5gIouqj0DiMnbryW5zyYzmTwd6anIi1jkRtpJipB91hfQ+aMoh/+KMBBf0fKb7H6DVDYpV2crHJDQunZjM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; c=relaxed/simple; bh=7fVNl2OWSZn5yIAaZMdbJy8esxpVNqxwiw/ChyI7RI8=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=irYGUlDIUgLXivF2vs/ZuVftuCfqSSGE0n89wPKqBHhZcjVRHKZOXabPVywVN39SWX3gFh3hQ1NYZ4bhiNAQc6dyjmLx/L1Ns9YN0nk8z8fIxEP+aGm/zNfUs/Nz9ThkpsPXPlzA0WnZKTiEuNBTQpzmqAMMNiv+GZ7s8y8CC/c= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=gutj444u; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="gutj444u" Received: by smtp.kernel.org (Postfix) with ESMTPS id CC63EC116C6; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544527; bh=7fVNl2OWSZn5yIAaZMdbJy8esxpVNqxwiw/ChyI7RI8=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=gutj444uK2RjiSDXuWpNITUZSFe22ZDCg5BOQxexBE5QKnd/jwXsdUge+dqFwKryQ 5vCzE9A/AmchcdBBvU9G5g1UxNIDXHadQOZeellY7WZ0GHK4CNU+clLYjjoEs5ccp6 Yf4ltSnll7hIlZ/BjxAUVnlRvhWE3PpWpkxvB1He4qUyMyXmAq3PrBv9O45AOWoEJY 5i+j4s6B2K244MbMZQZSQX51cTw7ZH3sRwicL1Y2kkMsbBshdGMdlaZXVKMw+aoyiM 2MERmy08ck+5sVPOG9po56/zI95lxP733oXgxuGBRSck0VnykXAu8fn4soqhDdu6BV qMjo3zu08GvOg== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id C47E2C531E3; Thu, 19 Feb 2026 23:42:07 
+0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:09 +0800 Subject: [PATCH RFC 08/15] mm, swap: store and check memcg info in the swap table Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-8-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=11918; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=T46p8i3lmd7nNPEr9N3RzFlb/rLjAdfkkrw3HoHcgOA=; b=ACLCqf4sQhtOMVUw7uIzQypZq7bA8GsNSCfaFYF7EJAQ3irEZYoDIiP6FtvzUwRYrJH13tShm Hs2UKl47dqHD/sSc/bYJeKsgR98o16GR9f3V+K0HcrnsXdNGKtXBqBL X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song To prepare for merging the swap_cgroup_ctrl into the swap table, store the memcg info in the swap table on swapout. This is done by using the existing shadow format. Note this also changes the refault counting at the nearest online memcg level: Unlike file folios, anon folios are mostly exclusive to one mem cgroup, and each cgroup is likely to have different characteristics. 
When commit b910718a948a ("mm: vmscan: detect file thrashing at the reclaim root") moved the refault accounting to the reclaim root level, anon shadows didn't even exist, and it was explicitly for file pages. Later commit aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU") added anon shadows following a similar design. And in shrink_lruvec, an active LRU's shrinking is done regardless when it's low. For MGLRU, it's a bit different, but with the PID refault control, it's more accurate to let the nearest online memcg take the refault feedback too. Signed-off-by: Kairui Song --- mm/internal.h | 20 ++++++++++++++++++++ mm/swap.h | 7 ++++--- mm/swap_state.c | 50 +++++++++++++++++++++++++++++++++----------------- mm/swapfile.c | 4 +++- mm/vmscan.c | 6 +----- mm/workingset.c | 16 +++++++++++----- 6 files changed, 72 insertions(+), 31 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index cb0af847d7d9..5bbe081c9048 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1714,6 +1714,7 @@ static inline void shrinker_debugfs_remove(struct den= try *debugfs_entry, #endif /* CONFIG_SHRINKER_DEBUG */ =20 /* Only track the nodes of mappings with shadow entries */ +#define WORKINGSET_SHIFT 1 void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; #define mapping_set_update(xas, mapping) do { \ @@ -1722,6 +1723,25 @@ extern struct list_lru shadow_nodes; xas_set_lru(xas, &shadow_nodes); \ } \ } while (0) +static inline unsigned short shadow_to_memcgid(void *shadow) +{ + unsigned long entry =3D xa_to_value(shadow); + unsigned short memcgid; + + entry >>=3D (WORKINGSET_SHIFT + NODES_SHIFT); + memcgid =3D entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); + + return memcgid; +} +static inline void *memcgid_to_shadow(unsigned short memcgid) +{ + unsigned long val; + + val =3D memcgid; + val <<=3D (NODES_SHIFT + WORKINGSET_SHIFT); + + return xa_mk_value(val); +} =20 /* mremap.c */ unsigned long move_page_tables(struct
pagetable_move_control *pmc); diff --git a/mm/swap.h b/mm/swap.h index da41e9cea46d..c95f5fafea42 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -265,6 +265,8 @@ static inline bool folio_matches_swap_entry(const struc= t folio *folio, return folio_entry.val =3D=3D round_down(entry.val, nr_pages); } =20 +bool folio_maybe_swapped(struct folio *folio); + /* * All swap cache helpers below require the caller to ensure the swap entr= ies * used are valid and stabilize the device by any of the following ways: @@ -286,9 +288,8 @@ struct folio *swap_cache_alloc_folio(swp_entry_t target= _entry, gfp_t gfp_mask, /* Below helpers require the caller to lock and pass in the swap cluster. = */ void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry); -void __swap_cache_del_folio(struct swap_cluster_info *ci, - struct folio *folio, void *shadow, - bool charged, bool reclaim); +void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *fo= lio, + void *shadow, bool charged, bool reclaim); void __swap_cache_replace_folio(struct swap_cluster_info *ci, struct folio *old, struct folio *new); =20 diff --git a/mm/swap_state.c b/mm/swap_state.c index 40f037576c5f..cc4bf40320ef 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -143,22 +143,11 @@ static int __swap_cache_check_batch(struct swap_clust= er_info *ci, { unsigned int ci_end =3D ci_off + nr; unsigned long old_tb; + unsigned int memcgid; =20 if (unlikely(!ci->table)) return -ENOENT; =20 - do { - old_tb =3D __swap_table_get(ci, ci_off); - if (unlikely(swp_tb_is_folio(old_tb)) || - unlikely(!__swp_tb_get_count(old_tb))) - break; - if (swp_tb_is_shadow(old_tb)) - *shadowp =3D swp_tb_to_shadow(old_tb); - } while (++ci_off < ci_end); - - if (likely(ci_off =3D=3D ci_end)) - return 0; - /* * If the target slot is not suitable for adding swap cache, return * -EEXIST or -ENOENT. 
If the batch is not suitable, could be a @@ -169,7 +158,21 @@ static int __swap_cache_check_batch(struct swap_cluste= r_info *ci, return -EEXIST; if (!__swp_tb_get_count(old_tb)) return -ENOENT; - return -EBUSY; + if (WARN_ON_ONCE(!swp_tb_is_shadow(old_tb))) + return -ENOENT; + *shadowp =3D swp_tb_to_shadow(old_tb); + memcgid =3D shadow_to_memcgid(*shadowp); + + WARN_ON_ONCE(!mem_cgroup_disabled() && !memcgid); + do { + old_tb =3D __swap_table_get(ci, ci_off); + if (unlikely(swp_tb_is_folio(old_tb)) || + unlikely(!__swp_tb_get_count(old_tb)) || + memcgid !=3D shadow_to_memcgid(swp_tb_to_shadow(old_tb))) + return -EBUSY; + } while (++ci_off < ci_end); + + return 0; } =20 void __swap_cache_add_folio(struct swap_cluster_info *ci, @@ -261,8 +264,7 @@ static struct folio *__swap_cache_alloc(struct swap_clu= ster_info *ci, =20 /* For memsw accouting, swap is uncharged when folio is added to swap cac= he */ memcg1_swapin(folio); - if (shadow) - workingset_refault(folio, shadow); + workingset_refault(folio, shadow); =20 /* Caller will initiate read into locked new_folio */ folio_add_lru(folio); @@ -319,7 +321,8 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_e= ntry, gfp_t gfp_mask, * __swap_cache_del_folio - Removes a folio from the swap cache. * @ci: The locked swap cluster. * @folio: The folio. - * @shadow: shadow value to be filled in the swap cache. + * @shadow: Shadow to restore when the folio is not charged. Ignored when + * @charged is true, as the shadow is computed internally. * @charged: If folio->swap is charged to folio->memcg. * @reclaim: If the folio is being reclaimed. When true on cgroup v1, * the memory charge is transferred from memory to swap. 
@@ -336,6 +339,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *c= i, struct folio *folio, int count; unsigned long old_tb; struct swap_info_struct *si; + struct mem_cgroup *memcg =3D NULL; swp_entry_t entry =3D folio->swap; unsigned int ci_start, ci_off, ci_end; bool folio_swapped =3D false, need_free =3D false; @@ -353,7 +357,13 @@ void __swap_cache_del_folio(struct swap_cluster_info *= ci, struct folio *folio, * charging (e.g. swapin charge failure, or swap alloc charge failure). */ if (charged) - mem_cgroup_swap_free_folio(folio, reclaim); + memcg =3D mem_cgroup_swap_free_folio(folio, reclaim); + if (reclaim) { + WARN_ON(!charged); + shadow =3D workingset_eviction(folio, memcg); + } else if (memcg) { + shadow =3D memcgid_to_shadow(mem_cgroup_private_id(memcg)); + } =20 si =3D __swap_entry_to_info(entry); ci_start =3D swp_cluster_offset(entry); @@ -392,6 +402,11 @@ void __swap_cache_del_folio(struct swap_cluster_info *= ci, struct folio *folio, * swap_cache_del_folio - Removes a folio from the swap cache. * @folio: The folio. * + * Force delete a folio from the swap cache. This is only safe to use for + * folios that are not swapped out (swap count =3D=3D 0) to release the sw= ap + * space from being pinned by swap cache, or remove a clean and charged + * folio that no one modified or is still using. + * * Same as __swap_cache_del_folio, but handles lock and refcount. The * caller must ensure the folio is either clean or has a swap count * equal to zero, or it may cause data loss. 
@@ -404,6 +419,7 @@ void swap_cache_del_folio(struct folio *folio) swp_entry_t entry =3D folio->swap; =20 ci =3D swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + VM_WARN_ON_ONCE(folio_test_dirty(folio) && folio_maybe_swapped(folio)); __swap_cache_del_folio(ci, folio, NULL, true, false); swap_cluster_unlock(ci); =20 diff --git a/mm/swapfile.c b/mm/swapfile.c index c0169bce46c9..2cd3e260f1bf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1972,9 +1972,11 @@ int swp_swapcount(swp_entry_t entry) * decrease of swap count is possible through swap_put_entries_direct, so = this * may return a false positive. * + * Caller can hold the ci lock to get a stable result. + * * Context: Caller must ensure the folio is locked and in the swap cache. */ -static bool folio_maybe_swapped(struct folio *folio) +bool folio_maybe_swapped(struct folio *folio) { swp_entry_t entry =3D folio->swap; struct swap_cluster_info *ci; diff --git a/mm/vmscan.c b/mm/vmscan.c index 5112f81cf875..4565c9c3ac60 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -755,11 +755,7 @@ static int __remove_mapping(struct address_space *mapp= ing, struct folio *folio, } =20 if (folio_test_swapcache(folio)) { - swp_entry_t swap =3D folio->swap; - - if (reclaimed && !mapping_exiting(mapping)) - shadow =3D workingset_eviction(folio, target_memcg); - __swap_cache_del_folio(ci, folio, shadow, true, true); + __swap_cache_del_folio(ci, folio, NULL, true, true); swap_cluster_unlock_irq(ci); } else { void (*free_folio)(struct folio *); diff --git a/mm/workingset.c b/mm/workingset.c index 37a94979900f..765a954baefa 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -202,12 +202,18 @@ static unsigned int bucket_order[ANON_AND_FILE] __rea= d_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long evic= tion, bool workingset, bool file) { + void *shadow; + eviction &=3D file ? 
EVICTION_MASK : EVICTION_MASK_ANON; eviction =3D (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction =3D (eviction << NODES_SHIFT) | pgdat->node_id; eviction =3D (eviction << WORKINGSET_SHIFT) | workingset; =20 - return xa_mk_value(eviction); + shadow =3D xa_mk_value(eviction); + /* Sanity check for retrieving memcgid from anon shadow. */ + VM_WARN_ON_ONCE(shadow_to_memcgid(shadow) !=3D memcgid); + + return shadow; } =20 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, @@ -232,7 +238,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, = pg_data_t **pgdat, =20 #ifdef CONFIG_LRU_GEN =20 -static void *lru_gen_eviction(struct folio *folio) +static void *lru_gen_eviction(struct folio *folio, struct mem_cgroup *memc= g) { int hist; unsigned long token; @@ -244,7 +250,6 @@ static void *lru_gen_eviction(struct folio *folio) int refs =3D folio_lru_refs(folio); bool workingset =3D folio_test_workingset(folio); int tier =3D lru_tier_from_refs(refs, workingset); - struct mem_cgroup *memcg =3D folio_memcg(folio); struct pglist_data *pgdat =3D folio_pgdat(folio); =20 BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > @@ -252,6 +257,7 @@ static void *lru_gen_eviction(struct folio *folio) =20 lruvec =3D mem_cgroup_lruvec(memcg, pgdat); lrugen =3D &lruvec->lrugen; + memcg =3D lruvec_memcg(lruvec); min_seq =3D READ_ONCE(lrugen->min_seq[type]); token =3D (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); =20 @@ -329,7 +335,7 @@ static void lru_gen_refault(struct folio *folio, void *= shadow) =20 #else /* !CONFIG_LRU_GEN */ =20 -static void *lru_gen_eviction(struct folio *folio) +static void *lru_gen_eviction(struct folio *folio, struct mem_cgroup *targ= et_memcg) { return NULL; } @@ -396,7 +402,7 @@ void *workingset_eviction(struct folio *folio, struct m= em_cgroup *target_memcg) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); =20 if (lru_gen_enabled()) - return lru_gen_eviction(folio); + return lru_gen_eviction(folio, target_memcg); =20 lruvec =3D 
mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0B8AE32D45E; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; cv=none; b=qJnOI9m5XVhi66jbDXX8rlUYvEr95zPCZ2QjI2GKZnhLqFHviF5e2rjm6AgcAtKAnCbZ8wAxLd6XLujfd3mUwqCC8TL4hGgKutARHC5aQnTQB4oRty1FmuPTDto5FB+V/Lr0KfsPGi4FLZojNEvLb/OrSnBfejVTg4eAy0PxFtA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; c=relaxed/simple; bh=76c3QaEd2jCZParrzQ0wwU1XNJCt26Ty2dEVwvDZzmc=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=IJG5IBvTItE/CugVLch5JJl3PpM5/ChCtaYRPsFj2vFo5AsVy89Dus7siK5lI+ywwagc4FHChGwouX2MolrpnoErHJkQiwoHCgSgEzWYpk6LnxdqkHPA9aSpKqpvRmCkWcsa7oFghKYkgBuZkpexzxs2P4S8SzZ4F3oFWQCp5+o= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=qzAcBPC7; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="qzAcBPC7" Received: by smtp.kernel.org (Postfix) with ESMTPS id DDA9BC2BCB1; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544527; bh=76c3QaEd2jCZParrzQ0wwU1XNJCt26Ty2dEVwvDZzmc=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=qzAcBPC7NnS1YB7AxoXXGQTEF+1VDmP1NZlh4N8sjbsSoa7C6LgqlzvqTcjn/WuxW KtztDgSt2q8JQVsHG5I+2cJzQ2pKy430ZswbZKAFiWBrEOjQ7Gb4aFjwzB4iluGWQL 
I56w4gD1EQXOqzXhzWq73aVOSsz1fLjDb+rW3WnxBcHcpm+ymHPY9hs0qhh3YyhZZz cVGSm7gut/44j3fXJsvasLLay9O9KcMZ0vsWPCBb8JdTU9NrJuad1STkQypRM6w2wV RDMkxDUItVOBAwDslVKgplbY8Xgg7UzQGqIl9cx52zOtJUm0BMCfxjKX3JTBH6oYD9 jFA9bDQ7HfhwA== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id D4FF3C531EA; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:10 +0800 Subject: [PATCH RFC 09/15] mm, swap: support flexible batch freeing of slots in different memcg Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-9-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=2190; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=Nr97MkylL+DHtVgOfh389lZsI/jvwd7cCWUvrCs+ZdE=; b=VNT+6pjfkVvYVkF9R+AU+XtVKlcJWeED6XPNHYULO17gQljRJAaYmkZ02T9dibmkLNBBvXLp7 wp61I+XYYaoBXuYFFCdCriXsl76W8j1LTC2XS24nk5qzY39pQZFZfDh X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song Instead of letting the caller ensure
all slots are in the same memcg, make it able to handle different memcgs at once. Signed-off-by: Kairui Song --- mm/swapfile.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 2cd3e260f1bf..cd2d3b2ca6f0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1883,10 +1883,13 @@ void __swap_cluster_free_entries(struct swap_info_s= truct *si, struct swap_cluster_info *ci, unsigned int ci_start, unsigned int nr_pages) { + void *shadow; unsigned long old_tb; - unsigned short id; + unsigned int type =3D si->type; + unsigned int id =3D 0, id_iter, id_check; unsigned int ci_off =3D ci_start, ci_end =3D ci_start + nr_pages; - unsigned long offset =3D cluster_offset(si, ci) + ci_start; + unsigned long offset =3D cluster_offset(si, ci); + unsigned int ci_batch =3D ci_off; =20 VM_WARN_ON(ci->count < nr_pages); =20 @@ -1896,13 +1899,29 @@ void __swap_cluster_free_entries(struct swap_info_s= truct *si, /* Release the last ref, or after swap cache is dropped */ VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); __swap_table_set(ci, ci_off, null_to_swp_tb()); + + shadow =3D swp_tb_to_shadow(old_tb); + id_iter =3D shadow_to_memcgid(shadow); + if (id !=3D id_iter) { + if (id) { + id_check =3D swap_cgroup_clear(swp_entry(type, offset + ci_batch), + ci_off - ci_batch); + WARN_ON(id !=3D id_check); + mem_cgroup_uncharge_swap(id, ci_off - ci_batch); + } + id =3D id_iter; + ci_batch =3D ci_off; + } } while (++ci_off < ci_end); =20 - id =3D swap_cgroup_clear(swp_entry(si->type, offset), nr_pages); - if (id) - mem_cgroup_uncharge_swap(id, nr_pages); + if (id) { + id_check =3D swap_cgroup_clear(swp_entry(type, offset + ci_batch), + ci_off - ci_batch); + WARN_ON(id !=3D id_check); + mem_cgroup_uncharge_swap(id, ci_off - ci_batch); + } =20 - swap_range_free(si, offset, nr_pages); + swap_range_free(si, offset + ci_start, nr_pages); swap_cluster_assert_empty(ci, ci_start, nr_pages,
false); =20 if (!ci->count) --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1715932D7FF; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; cv=none; b=M/yq4GcI5KL/jlwQyrdDgEshusvaVE1CcChsHEQ6AWVLrXO4nx1hNAwM9Vj1c5rWsZzmQb2gwsWatOJ39NN6VsN61+dmPC34bR3/4B8MIUoDXtUr0ctdTRhK/Oq5/1pWlYaORSOOsU4poQVRFKzhY5WQBlYcLSWGMN1svGjRSpc= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; c=relaxed/simple; bh=XphCFHReTLerRTmQ62E7k3tj+ws8vNIKhOrrRBsM1to=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=fp7V+KPb03M3dO6DJ/NhbMz4EU8gKddmkrojwZJiYGtoqTK3F7vV8c9Ffe78qw4UTouig7oWoI0SaNS4zIwxFtX0PuD24aGoW0tN0eys5zRGO4cR7DWjv0FD4SgUcMr1DGlPw3Lntn13M4lZr3Yp2WhAU+drvDTvksYEQCsUXpk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=mdqwLoRA; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="mdqwLoRA" Received: by smtp.kernel.org (Postfix) with ESMTPS id EE002C2BCAF; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544528; bh=XphCFHReTLerRTmQ62E7k3tj+ws8vNIKhOrrRBsM1to=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=mdqwLoRAkSEFaKH5nJYW4cVLC+sLaPjgd7tmmPIXVPooYnwsbygV1gv6tomQodj9N jO714FjHDyMkiCkBQZLpEz0D809nXyLNDbtNZUhd99XA587oEMu1OIZ7IWBztDpWjg 7l/OUQxVr2UVAbaoj7nEvQawvxGJwkt/M6Y6LYN/g1Kn0FIAeGlToAdnsjp6f/q7L+ 
Ug+CD9rx2u9idWo3v3o4hSIvx+3K4cYtqKbTkXdc7c4h5USVZI77vlNDo/v5kKlAX6 utsmjLEUdr9zGPsEx7K/dj+dbgOBaY5lsR31hO2mcoJqN4vhs48c+BkHmHsgiZqRzu yqm7kL5/WB+ig== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id E568BC531EB; Thu, 19 Feb 2026 23:42:07 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:11 +0800 Subject: [PATCH RFC 10/15] mm, swap: always retrieve memcg id from swap table Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-10-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=4319; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=WsZXml04ASUgTsoFPs4VOCkE3+G9dotFosGt0KDSdnI=; b=8PUGQy+iLDj47T0eAJnEPkBbFEpbjpdotOGd9HFhFodXjiH7rSlOJhvBygk9dElK56junzS5k sewBlHU75avAo5d1frQoohXdWvj9c85coOf+rZoeyzNWabyjMYaOOeT X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song Transition mem_cgroup_swapin_charge_folio() to receive the memcg id from the caller via the swap table shadow entry, 
demoting the old swap cgroup array lookup to a sanity check. Also removes the per-PTE cgroup id batching break from swap_pte_batch() since now swap is able to free slots across mem cgroups. Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 6 ++++-- mm/internal.h | 4 ---- mm/memcontrol.c | 9 ++++++--- mm/swap_state.c | 5 ++++- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0b37d4faf785..8fc794baf736 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -667,7 +667,8 @@ static inline int mem_cgroup_charge(struct folio *folio= , struct mm_struct *mm, int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); =20 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *= mm, - gfp_t gfp, swp_entry_t entry); + gfp_t gfp, swp_entry_t entry, + unsigned short id); =20 void __mem_cgroup_uncharge(struct folio *folio); =20 @@ -1145,7 +1146,8 @@ static inline int mem_cgroup_charge_hugetlb(struct fo= lio* folio, gfp_t gfp) } =20 static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, - struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) + struct mm_struct *mm, gfp_t gfp, swp_entry_t entry, + unsigned short id) { return 0; } diff --git a/mm/internal.h b/mm/internal.h index 5bbe081c9048..416d3401aa17 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -452,12 +452,10 @@ static inline int swap_pte_batch(pte_t *start_ptep, i= nt max_nr, pte_t pte) const pte_t *end_ptep =3D start_ptep + max_nr; const softleaf_t entry =3D softleaf_from_pte(pte); pte_t *ptep =3D start_ptep + 1; - unsigned short cgroup_id; =20 VM_WARN_ON(max_nr < 1); VM_WARN_ON(!softleaf_is_swap(entry)); =20 - cgroup_id =3D lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { softleaf_t entry; =20 @@ -466,8 +464,6 @@ static inline int swap_pte_batch(pte_t *start_ptep, int= max_nr, pte_t pte) if (!pte_same(pte, expected_pte)) break; entry =3D softleaf_from_pte(pte); - if 
(lookup_swap_cgroup_id(entry) !=3D cgroup_id) - break; expected_pte =3D pte_next_swp_offset(expected_pte); ptep++; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d9ff44b77409..d0f50019d733 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4794,6 +4794,7 @@ int mem_cgroup_charge_hugetlb(struct folio *folio, gf= p_t gfp) * @mm: mm context of the victim * @gfp: reclaim mode * @entry: swap entry for which the folio is allocated + * @id: the mem cgroup id * * This function charges a folio allocated for swapin. Please call this be= fore * adding the folio to the swapcache. @@ -4801,19 +4802,21 @@ int mem_cgroup_charge_hugetlb(struct folio *folio, = gfp_t gfp) * Returns 0 on success. Otherwise, an error code is returned. */ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *= mm, - gfp_t gfp, swp_entry_t entry) + gfp_t gfp, swp_entry_t entry, unsigned short id) { struct mem_cgroup *memcg, *swap_memcg; + unsigned short memcg_id; unsigned int nr_pages; - unsigned short id; int ret; =20 if (mem_cgroup_disabled()) return 0; =20 - id =3D lookup_swap_cgroup_id(entry); + memcg_id =3D lookup_swap_cgroup_id(entry); nr_pages =3D folio_nr_pages(folio); =20 + WARN_ON_ONCE(id !=3D memcg_id); + rcu_read_lock(); swap_memcg =3D mem_cgroup_from_private_id(id); if (!swap_memcg) { diff --git a/mm/swap_state.c b/mm/swap_state.c index cc4bf40320ef..5ab3a41fe42c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -251,8 +251,11 @@ static struct folio *__swap_cache_alloc(struct swap_cl= uster_info *ci, __swap_cache_add_folio(ci, folio, entry); spin_unlock(&ci->lock); =20 + /* With swap table, we must have a shadow, for memcg tracking */ + WARN_ON(!shadow); + if (mem_cgroup_swapin_charge_folio(folio, vmf ? 
vmf->vma->vm_mm : NULL, - gfp, entry)) { + gfp, entry, shadow_to_memcgid(shadow))) { spin_lock(&ci->lock); __swap_cache_del_folio(ci, folio, shadow, false, false); spin_unlock(&ci->lock); --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4599433032A; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; cv=none; b=hl9R7smX227XInQ/0llZmVfOYTv/QEdLReUYo5Oo+x95QR5UnqEgdyyTre39LKF2s63w7J4np37LlNDPsnkA13W5cwbj5OI6XjtQhCc3KXZeNXVB54Y94va94nTjryLB3GN4jxIhrvVJmAtXUXIF+MhtyO1bhh33t9bvUtyZKl8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; c=relaxed/simple; bh=SVkjz87H+v7qEvTRagEPaFIS/kko6XF9kiNxmbSXkpQ=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=Uead0N7gIDwP+WGqichRnJHD/fCdhM51L7aNxlUkWV32yrfobiyREcBtKHC0G8wBQTrJ7j4IvunpMewkS4zlZ6x5ja71CdpHj+a2Y1iIC1pLtoERSShBaQ7ANH8QOnMBmV3lFvwpLZ/qi6Qm0SNbcq8wOTF7bU5JT+P39LvWtzE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=crl8C4zW; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="crl8C4zW" Received: by smtp.kernel.org (Postfix) with ESMTPS id 0CCDBC2BCB2; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544528; bh=SVkjz87H+v7qEvTRagEPaFIS/kko6XF9kiNxmbSXkpQ=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; 
b=crl8C4zWRKZwRLaByTmREkcM4IQWEXl33WnmqGuSEWKF00sOUTp2eCuGZ6HoK5YLL lBfdSZFAdLR1JLfFLM9UZ7I/p6Aui7ZLzvRv3jIe+deFP7DGQ8ksmmJxOP13Jte0zP MLB9ojJ0d4E9jBhKTKOK1OovK9I7vKOjjgSRa0PGTQJheSTQJ+okRwVYuvU1MKGXyi W2paG5sr3Jb9EXXrt0nHhgR0DJflFDx5xnAunvXQePHA2DcJn3j3EFtWpeuKnKQthn M9rXU9RyPxeZoCMz1QsUJOwOQ5t2+fDwRUd1GqAXSAtnTt4zOoxiL5LR3uyYqVb7Nq rF3dvGMth+mpw== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 03203C531EA; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:12 +0800 Subject: [PATCH RFC 11/15] mm/swap, memcg: remove swap cgroup array Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-11-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=15582; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=ab1DMIR8yqVLL69CexiaEYLTLIUHukDgwvK/gc3LcsM=; b=W5W0v6jgjF0XBwiRd/XQqgsIQe0I03NglUh7sJSKbsTpBIHlTYOWBv8Bup+fBAi5446LgGVyV eqfznMFyEv+Cy0zY0FiaHNTajlorvDD1ujYZO7cjxI/YmbNz9mMM5g5 X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 
X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song Now that the swap table contains the swap cgroup info all the time, the swap cgroup array can be dropped. Signed-off-by: Kairui Song --- MAINTAINERS | 1 - include/linux/memcontrol.h | 6 +- include/linux/swap_cgroup.h | 47 ------------ mm/Makefile | 3 - mm/internal.h | 1 - mm/memcontrol-v1.c | 1 - mm/memcontrol.c | 19 ++--- mm/swap_cgroup.c | 171 ----------------------------------------= ---- mm/swap_state.c | 3 +- mm/swapfile.c | 23 +----- 10 files changed, 11 insertions(+), 264 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index aa1734a12887..05e633611e0b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6571,7 +6571,6 @@ F: mm/memcontrol.c F: mm/memcontrol-v1.c F: mm/memcontrol-v1.h F: mm/page_counter.c -F: mm/swap_cgroup.c F: samples/cgroup/* F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8fc794baf736..4bfe905bffb0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -667,8 +667,7 @@ static inline int mem_cgroup_charge(struct folio *folio= , struct mm_struct *mm, int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); =20 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *= mm, - gfp_t gfp, swp_entry_t entry, - unsigned short id); + gfp_t gfp, unsigned short id); =20 void __mem_cgroup_uncharge(struct folio *folio); =20 @@ -1146,8 +1145,7 @@ static inline int mem_cgroup_charge_hugetlb(struct fo= lio* folio, gfp_t gfp) } =20 static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, - struct mm_struct *mm, gfp_t gfp, swp_entry_t entry, - unsigned short id) + struct mm_struct *mm, gfp_t gfp, unsigned short id) { return 0; } diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h deleted file mode 100644 index 91cdf12190a0..000000000000 --- a/include/linux/swap_cgroup.h +++ /dev/null @@ 
-1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_SWAP_CGROUP_H -#define __LINUX_SWAP_CGROUP_H - -#include - -#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) - -extern void swap_cgroup_record(struct folio *folio, unsigned short id, swp= _entry_t ent); -extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_e= nts); -extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); -extern int swap_cgroup_swapon(int type, unsigned long max_pages); -extern void swap_cgroup_swapoff(int type); - -#else - -static inline -void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_= t ent) -{ -} - -static inline -unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) -{ - return 0; -} - -static inline -unsigned short lookup_swap_cgroup_id(swp_entry_t ent) -{ - return 0; -} - -static inline int -swap_cgroup_swapon(int type, unsigned long max_pages) -{ - return 0; -} - -static inline void swap_cgroup_swapoff(int type) -{ - return; -} - -#endif - -#endif /* __LINUX_SWAP_CGROUP_H */ diff --git a/mm/Makefile b/mm/Makefile index 8ad2ab08244e..eff9f9e7e061 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -103,9 +103,6 @@ obj-$(CONFIG_PAGE_COUNTER) +=3D page_counter.o obj-$(CONFIG_LIVEUPDATE_MEMFD) +=3D memfd_luo.o obj-$(CONFIG_MEMCG_V1) +=3D memcontrol-v1.o obj-$(CONFIG_MEMCG) +=3D memcontrol.o vmpressure.o -ifdef CONFIG_SWAP -obj-$(CONFIG_MEMCG) +=3D swap_cgroup.o -endif ifdef CONFIG_BPF_SYSCALL obj-$(CONFIG_MEMCG) +=3D bpf_memcontrol.o endif diff --git a/mm/internal.h b/mm/internal.h index 416d3401aa17..26691885d75f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -16,7 +16,6 @@ #include #include #include -#include #include =20 /* Internal core VMA manipulation functions. 
*/ diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 038e630dc7e1..eff18eda0707 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d0f50019d733..8d0c9f3a011e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include @@ -4793,7 +4792,6 @@ int mem_cgroup_charge_hugetlb(struct folio *folio, gf= p_t gfp) * @folio: folio to charge. * @mm: mm context of the victim * @gfp: reclaim mode - * @entry: swap entry for which the folio is allocated * @id: the mem cgroup id * * This function charges a folio allocated for swapin. Please call this be= fore @@ -4802,21 +4800,17 @@ int mem_cgroup_charge_hugetlb(struct folio *folio, = gfp_t gfp) * Returns 0 on success. Otherwise, an error code is returned. */ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *= mm, - gfp_t gfp, swp_entry_t entry, unsigned short id) + gfp_t gfp, unsigned short id) { struct mem_cgroup *memcg, *swap_memcg; - unsigned short memcg_id; unsigned int nr_pages; int ret; =20 if (mem_cgroup_disabled()) return 0; =20 - memcg_id =3D lookup_swap_cgroup_id(entry); nr_pages =3D folio_nr_pages(folio); =20 - WARN_ON_ONCE(id !=3D memcg_id); - rcu_read_lock(); swap_memcg =3D mem_cgroup_from_private_id(id); if (!swap_memcg) { @@ -4836,10 +4830,11 @@ int mem_cgroup_swapin_charge_folio(struct folio *fo= lio, struct mm_struct *mm, =20 /* * On successful charge, the folio itself now belongs to the memcg, - * so is folio->swap. So we can release the swap cgroup table's - * pinning of the private id. + * so is folio->swap. And the folio takes place of the shadow in + * the swap table so we can release the shadow's pinning of the + * private id. 
*/ - swap_cgroup_clear(folio->swap, nr_pages); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); mem_cgroup_private_id_put(swap_memcg, nr_pages); =20 /* @@ -5324,8 +5319,6 @@ struct mem_cgroup *__mem_cgroup_swap_free_folio(struc= t folio *folio, { unsigned int nr_pages =3D folio_nr_pages(folio); struct mem_cgroup *memcg, *swap_memcg; - swp_entry_t entry =3D folio->swap; - unsigned short id; =20 VM_WARN_ON_ONCE_FOLIO(!folio_memcg_charged(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); @@ -5337,8 +5330,6 @@ struct mem_cgroup *__mem_cgroup_swap_free_folio(struc= t folio *folio, */ memcg =3D folio_memcg(folio); swap_memcg =3D mem_cgroup_private_id_get_online(memcg, nr_pages); - id =3D mem_cgroup_private_id(swap_memcg); - swap_cgroup_record(folio, id, entry); =20 if (reclaim && do_memsw_account()) { memcg1_swapout(folio, swap_memcg); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c deleted file mode 100644 index b5a7f21c3afe..000000000000 --- a/mm/swap_cgroup.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include - -#include /* depends on mm.h include */ - -static DEFINE_MUTEX(swap_cgroup_mutex); - -/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t)= */ -#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short)) -#define ID_SHIFT (BITS_PER_TYPE(unsigned short)) -#define ID_MASK (BIT(ID_SHIFT) - 1) -struct swap_cgroup { - atomic_t ids; -}; - -struct swap_cgroup_ctrl { - struct swap_cgroup *map; -}; - -static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; - -static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map, - pgoff_t offset) -{ - unsigned int shift =3D (offset % ID_PER_SC) * ID_SHIFT; - unsigned int old_ids =3D atomic_read(&map[offset / ID_PER_SC].ids); - - BUILD_BUG_ON(!is_power_of_2(ID_PER_SC)); - BUILD_BUG_ON(sizeof(struct swap_cgroup) !=3D sizeof(atomic_t)); - - return (old_ids >> shift) & ID_MASK; -} - -static 
unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, - pgoff_t offset, - unsigned short new_id) -{ - unsigned short old_id; - struct swap_cgroup *sc =3D &map[offset / ID_PER_SC]; - unsigned int shift =3D (offset % ID_PER_SC) * ID_SHIFT; - unsigned int new_ids, old_ids =3D atomic_read(&sc->ids); - - do { - old_id =3D (old_ids >> shift) & ID_MASK; - new_ids =3D (old_ids & ~(ID_MASK << shift)); - new_ids |=3D ((unsigned int)new_id) << shift; - } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids)); - - return old_id; -} - -/** - * swap_cgroup_record - record mem_cgroup for a set of swap entries. - * These entries must belong to one single folio, and that folio - * must be being charged for swap space (swap out). - * - * @folio: the folio that the swap entry belongs to - * @id: mem_cgroup ID to be recorded - * @ent: the first swap entry to be recorded - */ -void swap_cgroup_record(struct folio *folio, unsigned short id, - swp_entry_t ent) -{ - unsigned int nr_ents =3D folio_nr_pages(folio); - struct swap_cgroup *map; - pgoff_t offset, end; - unsigned short old; - - offset =3D swp_offset(ent); - end =3D offset + nr_ents; - map =3D swap_cgroup_ctrl[swp_type(ent)].map; - - do { - old =3D __swap_cgroup_id_xchg(map, offset, id); - VM_WARN_ON_ONCE(old); - } while (++offset !=3D end); -} - -/** - * swap_cgroup_clear - clear mem_cgroup for a set of swap entries. - * These entries must be being uncharged from swap. They either - * belongs to one single folio in the swap cache (swap in for - * cgroup v1), or no longer have any users (slot freeing). - * - * @ent: the first swap entry to be recorded into - * @nr_ents: number of swap entries to be recorded - * - * Returns the existing old value. 
- */ -unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) -{ - pgoff_t offset, end; - struct swap_cgroup *map; - unsigned short old, iter =3D 0; - - offset =3D swp_offset(ent); - end =3D offset + nr_ents; - map =3D swap_cgroup_ctrl[swp_type(ent)].map; - - do { - old =3D __swap_cgroup_id_xchg(map, offset, 0); - if (!iter) - iter =3D old; - VM_BUG_ON(iter !=3D old); - } while (++offset !=3D end); - - return old; -} - -/** - * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry - * @ent: swap entry to be looked up. - * - * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) - */ -unsigned short lookup_swap_cgroup_id(swp_entry_t ent) -{ - struct swap_cgroup_ctrl *ctrl; - - if (mem_cgroup_disabled()) - return 0; - - ctrl =3D &swap_cgroup_ctrl[swp_type(ent)]; - return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent)); -} - -int swap_cgroup_swapon(int type, unsigned long max_pages) -{ - struct swap_cgroup *map; - struct swap_cgroup_ctrl *ctrl; - - if (mem_cgroup_disabled()) - return 0; - - BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC !=3D - sizeof(struct swap_cgroup)); - map =3D vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) * - sizeof(struct swap_cgroup)); - if (!map) - goto nomem; - - ctrl =3D &swap_cgroup_ctrl[type]; - mutex_lock(&swap_cgroup_mutex); - ctrl->map =3D map; - mutex_unlock(&swap_cgroup_mutex); - - return 0; -nomem: - pr_info("couldn't allocate enough memory for swap_cgroup\n"); - pr_info("swap_cgroup can be disabled by swapaccount=3D0 boot option\n"); - return -ENOMEM; -} - -void swap_cgroup_swapoff(int type) -{ - struct swap_cgroup *map; - struct swap_cgroup_ctrl *ctrl; - - if (mem_cgroup_disabled()) - return; - - mutex_lock(&swap_cgroup_mutex); - ctrl =3D &swap_cgroup_ctrl[type]; - map =3D ctrl->map; - ctrl->map =3D NULL; - mutex_unlock(&swap_cgroup_mutex); - - vfree(map); -} diff --git a/mm/swap_state.c b/mm/swap_state.c index 5ab3a41fe42c..c6ba15de4094 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c 
@@ -163,7 +163,6 @@ static int __swap_cache_check_batch(struct swap_cluster= _info *ci, *shadowp =3D swp_tb_to_shadow(old_tb); memcgid =3D shadow_to_memcgid(*shadowp); =20 - WARN_ON_ONCE(!mem_cgroup_disabled() && !memcgid); do { old_tb =3D __swap_table_get(ci, ci_off); if (unlikely(swp_tb_is_folio(old_tb)) || @@ -255,7 +254,7 @@ static struct folio *__swap_cache_alloc(struct swap_clu= ster_info *ci, WARN_ON(!shadow); =20 if (mem_cgroup_swapin_charge_folio(folio, vmf ? vmf->vma->vm_mm : NULL, - gfp, entry, shadow_to_memcgid(shadow))) { + gfp, shadow_to_memcgid(shadow))) { spin_lock(&ci->lock); __swap_cache_del_folio(ci, folio, shadow, false, false); spin_unlock(&ci->lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index cd2d3b2ca6f0..de34f1990209 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -45,7 +45,6 @@ =20 #include #include -#include #include "swap_table.h" #include "internal.h" #include "swap_table.h" @@ -1885,8 +1884,7 @@ void __swap_cluster_free_entries(struct swap_info_str= uct *si, { void *shadow; unsigned long old_tb; - unsigned int type =3D si->type; - unsigned int id =3D 0, id_iter, id_check; + unsigned int id =3D 0, id_iter; unsigned int ci_off =3D ci_start, ci_end =3D ci_start + nr_pages; unsigned long offset =3D cluster_offset(si, ci); unsigned int ci_batch =3D ci_off; @@ -1903,23 +1901,15 @@ void __swap_cluster_free_entries(struct swap_info_s= truct *si, shadow =3D swp_tb_to_shadow(old_tb); id_iter =3D shadow_to_memcgid(shadow); if (id !=3D id_iter) { - if (id) { - id_check =3D swap_cgroup_clear(swp_entry(type, offset + ci_batch), - ci_off - ci_batch); - WARN_ON(id !=3D id_check); + if (id) mem_cgroup_uncharge_swap(id, ci_off - ci_batch); - } id =3D id_iter; ci_batch =3D ci_off; } } while (++ci_off < ci_end); =20 - if (id) { - id_check =3D swap_cgroup_clear(swp_entry(type, offset + ci_batch), - ci_off - ci_batch); - WARN_ON(id !=3D id_check); + if (id) mem_cgroup_uncharge_swap(id, ci_off - ci_batch); - } =20 swap_range_free(si, offset + 
ci_start, nr_pages); swap_cluster_assert_empty(ci, ci_start, nr_pages, false); @@ -3034,8 +3024,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, special= file) p->global_cluster =3D NULL; kvfree(zeromap); free_swap_cluster_info(cluster_info, maxpages); - /* Destroy swap account information */ - swap_cgroup_swapoff(p->type); =20 inode =3D mapping->host; =20 @@ -3567,10 +3555,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, special= file, int, swap_flags) if (error) goto bad_swap_unlock_inode; =20 - error =3D swap_cgroup_swapon(si->type, maxpages); - if (error) - goto bad_swap_unlock_inode; - /* * Use kvmalloc_array instead of bitmap_zalloc as the allocation order mi= ght * be above MAX_PAGE_ORDER incase of a large swap file. @@ -3681,7 +3665,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialf= ile, int, swap_flags) si->global_cluster =3D NULL; inode =3D NULL; destroy_swap_extents(si, swap_file); - swap_cgroup_swapoff(si->type); free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info =3D NULL; kvfree(si->zeromap); --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4529A330320; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; cv=none; b=pCf6pf4L5VrxEEKWFi//EggPj5fvPOoEoc9ZUcchjuKIlf+oapFjcbK2j4UG2QhnBeWnEB+tx2qpFb5qn1TdRFvFXsL9cDdOsNGNCCMmZJ9AdBijwW7D8jQvl0RblkxE0M53ZNfMsVh6l9WEFqsF6mu4Z0Kgs6BJyxgWmq73PEc= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; c=relaxed/simple; bh=qyfGzo8v12j44PlTzKr7Yv77BTV9X7d51BLIJWbEEYs=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: 
In-Reply-To:To:Cc; b=YIiU/m8yIJxToEWbcsK0ZXpm+a7nyqfj/7ZaYXm//IPs+o6Xr3kRjuvszH+n5yCcezNpdnyP3pWHPxFv+fV9+7QvnODqRaRTuMCkkBwqCnAP2xfgmCSkl6SaiC6XqOL5McrhMOmLlIgff2f1TZ0X6KUTkUN/T3TlfJ/3pmTnhE4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=DJjaYhBf; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="DJjaYhBf" Received: by smtp.kernel.org (Postfix) with ESMTPS id 1BC3FC2BC86; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544528; bh=qyfGzo8v12j44PlTzKr7Yv77BTV9X7d51BLIJWbEEYs=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=DJjaYhBfKRXMs29IBBoQ0MONmBWpt1kh+++rK9KIYLK9IusRvsbMp10yotOgXty8v qAw3EvI5b/ktnkkwlD2vl/b8p2W0ZwqUD8Cyo3U22fJ7+5X8MgHhW/hfTHeU2xgPqR U3kJO+f+Gr5LC2cY1tFOKuLDn3BoHUowjmA7fXQfp91cBn6YWHfbFnGET80BVoRE8B b/lNB5Cryp/UaiT3NUOs3/0Cqj1HhyQTfwRIMWvqzTN5GdBh5rmVH5UCdQQiHX2ckZ mdyN/x6N9mUoWheaTSfyw8wRE4cOWV8+onXuqEQg3rbsjIcEsN77mPEk4zvKpufLqq dPFW+hRiwtpLQ== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 14527C531EB; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:13 +0800 Subject: [PATCH RFC 12/15] mm, swap: merge zeromap into swap table Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-12-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , 
Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=20509; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=DS+YZ8JcJF+Jri1i7Q3A9LxB6Fr+z8Z5vJM23M1Iw2Y=; b=L6nM7BZpj5iPh5g8XNyIi9cgSB9rftBwNt5gSZgFcNupU7deDrzFyHCMKvPN9uyvnWLvfnZzp 3VaSVVaiR4TDG8LO1zZp6SlzlF4rEuUrv1U37XEzBCJIyka3n8HvIdn X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song By reserving one bit for the counting part, we can easily merge the zeromap into the swap table. Signed-off-by: Kairui Song --- include/linux/swap.h | 1 - mm/memory.c | 12 ++---- mm/page_io.c | 28 ++++++++++---- mm/swap.h | 31 ---------------- mm/swap_state.c | 23 ++++++++---- mm/swap_table.h | 103 +++++++++++++++++++++++++++++++++++++++++++----= ---- mm/swapfile.c | 27 +------------- 7 files changed, 127 insertions(+), 98 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 66cf657a1f35..bc871d8a1e99 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -254,7 +254,6 @@ struct swap_info_struct { struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ unsigned int max; /* size of this swap device */ - unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ struct list_head free_clusters; /* free clusters list */ struct list_head full_clusters; /* full clusters list */ diff --git a/mm/memory.c b/mm/memory.c index e58f976508b3..8df169fced0d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -88,6 +88,7 @@ =20 #include "pgalloc-track.h" #include "internal.h" +#include "swap_table.h" #include "swap.h" =20 #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) @@ -4522,13 +4523,11 @@ static vm_fault_t handle_pte_marker(struct vm_fault= *vmf) =20 #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * Check if the PTEs within a range are contiguous swap entries - * and have consistent swapcache, zeromap. + * Check if the PTEs within a range are contiguous swap entries. */ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; - softleaf_t entry; int idx; pte_t pte; =20 @@ -4538,18 +4537,13 @@ static bool can_swapin_thp(struct vm_fault *vmf, pt= e_t *ptep, int nr_pages) =20 if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; - entry =3D softleaf_from_pte(pte); - if (swap_pte_batch(ptep, nr_pages, pte) !=3D nr_pages) - return false; - /* * swap_read_folio() can't handle the case a large folio is hybridly * from different backends. And they are likely corner cases. Similar * things might be added once zswap support large folios. 
*/ - if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) !=3D nr_pages)) + if (swap_pte_batch(ptep, nr_pages, pte) !=3D nr_pages) return false; - return true; } =20 diff --git a/mm/page_io.c b/mm/page_io.c index a2c034660c80..5a0b5034489b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -26,6 +26,7 @@ #include #include #include "swap.h" +#include "swap_table.h" =20 static void __end_swap_bio_write(struct bio *bio) { @@ -204,15 +205,20 @@ static bool is_folio_zero_filled(struct folio *folio) static void swap_zeromap_folio_set(struct folio *folio) { struct obj_cgroup *objcg =3D get_obj_cgroup_from_folio(folio); - struct swap_info_struct *sis =3D __swap_entry_to_info(folio->swap); int nr_pages =3D folio_nr_pages(folio); + struct swap_cluster_info *ci; swp_entry_t entry; unsigned int i; =20 + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + + ci =3D swap_cluster_get_and_lock(folio); for (i =3D 0; i < folio_nr_pages(folio); i++) { entry =3D page_swap_entry(folio_page(folio, i)); - set_bit(swp_offset(entry), sis->zeromap); + __swap_table_set_zero(ci, swp_cluster_offset(entry)); } + swap_cluster_unlock(ci); =20 count_vm_events(SWPOUT_ZERO, nr_pages); if (objcg) { @@ -223,14 +229,19 @@ static void swap_zeromap_folio_set(struct folio *foli= o) =20 static void swap_zeromap_folio_clear(struct folio *folio) { - struct swap_info_struct *sis =3D __swap_entry_to_info(folio->swap); + struct swap_cluster_info *ci; swp_entry_t entry; unsigned int i; =20 + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + + ci =3D swap_cluster_get_and_lock(folio); for (i =3D 0; i < folio_nr_pages(folio); i++) { entry =3D page_swap_entry(folio_page(folio, i)); - clear_bit(swp_offset(entry), sis->zeromap); + __swap_table_clear_zero(ci, swp_cluster_offset(entry)); } + swap_cluster_unlock(ci); } =20 /* @@ -255,10 +266,9 @@ int swap_writeout(struct folio *folio, 
struct swap_ioc= b **swap_plug) } =20 /* - * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages. - * The bits in zeromap are protected by the locked swapcache folio - * and atomic updates are used to protect against read-modify-write - * corruption due to other zero swap entries seeing concurrent updates. + * Use the swap table zero mark to avoid doing IO for zero-filled + * pages. The zero mark is protected by the cluster lock, which is + * acquired internally by swap_zeromap_folio_set/clear. */ if (is_folio_zero_filled(folio)) { swap_zeromap_folio_set(folio); @@ -511,6 +521,8 @@ static bool swap_read_folio_zeromap(struct folio *folio) struct obj_cgroup *objcg; bool is_zeromap; =20 + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + /* * Swapping in a large folio that is partially in the zeromap is not * currently handled. Return true without marking the folio uptodate so diff --git a/mm/swap.h b/mm/swap.h index c95f5fafea42..cb1ab20d83d5 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -312,31 +312,6 @@ static inline unsigned int folio_swap_flags(struct fol= io *folio) return __swap_entry_to_info(folio->swap)->flags; } =20 -/* - * Return the count of contiguous swap entries that share the same - * zeromap status as the starting entry. If is_zeromap is not NULL, - * it will return the zeromap status of the starting entry. 
- */ -static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, - bool *is_zeromap) -{ - struct swap_info_struct *sis =3D __swap_entry_to_info(entry); - unsigned long start =3D swp_offset(entry); - unsigned long end =3D start + max_nr; - bool first_bit; - - first_bit =3D test_bit(start, sis->zeromap); - if (is_zeromap) - *is_zeromap =3D first_bit; - - if (max_nr <=3D 1) - return max_nr; - if (first_bit) - return find_next_zero_bit(sis->zeromap, end, start) - start; - else - return find_next_bit(sis->zeromap, end, start) - start; -} - #else /* CONFIG_SWAP */ struct swap_iocb; static inline struct swap_cluster_info *swap_cluster_lock( @@ -475,11 +450,5 @@ static inline unsigned int folio_swap_flags(struct fol= io *folio) { return 0; } - -static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, - bool *has_zeromap) -{ - return 0; -} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index c6ba15de4094..419419e18a47 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -138,6 +138,7 @@ void *swap_cache_get_shadow(swp_entry_t entry) } =20 static int __swap_cache_check_batch(struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int ci_off, unsigned int ci_targ, unsigned int nr, void **shadowp) { @@ -148,6 +149,13 @@ static int __swap_cache_check_batch(struct swap_cluste= r_info *ci, if (unlikely(!ci->table)) return -ENOENT; =20 + /* + * TODO: Swap of large folio that is partially in the zeromap + * is not supported. + */ + if (nr > 1 && swap_zeromap_batch(entry, nr, NULL) !=3D nr) + return -EBUSY; + /* * If the target slot is not suitable for adding swap cache, return * -EEXIST or -ENOENT. 
If the batch is not suitable, could be a @@ -190,7 +198,7 @@ void __swap_cache_add_folio(struct swap_cluster_info *c= i, do { old_tb =3D __swap_table_get(ci, ci_off); VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb)); - __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_t= b))); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_flags(old_t= b))); } while (++ci_off < ci_end); =20 folio_ref_add(folio, nr_pages); @@ -218,7 +226,7 @@ static struct folio *__swap_cache_alloc(struct swap_clu= ster_info *ci, =20 /* First check if the range is available */ spin_lock(&ci->lock); - err =3D __swap_cache_check_batch(ci, ci_off, ci_targ, nr_pages, &shadow); + err =3D __swap_cache_check_batch(ci, entry, ci_off, ci_targ, nr_pages, &s= hadow); spin_unlock(&ci->lock); if (unlikely(err)) return ERR_PTR(err); @@ -236,7 +244,7 @@ static struct folio *__swap_cache_alloc(struct swap_clu= ster_info *ci, =20 /* Double check the range is still not in conflict */ spin_lock(&ci->lock); - err =3D __swap_cache_check_batch(ci, ci_off, ci_targ, nr_pages, &shadow_c= heck); + err =3D __swap_cache_check_batch(ci, entry, ci_off, ci_targ, nr_pages, &s= hadow_check); if (unlikely(err) || shadow_check !=3D shadow) { spin_unlock(&ci->lock); folio_put(folio); @@ -338,7 +346,6 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_e= ntry, gfp_t gfp_mask, void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *fo= lio, void *shadow, bool charged, bool reclaim) { - int count; unsigned long old_tb; struct swap_info_struct *si; struct mem_cgroup *memcg =3D NULL; @@ -375,13 +382,13 @@ void __swap_cache_del_folio(struct swap_cluster_info = *ci, struct folio *folio, old_tb =3D __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) !=3D folio); - count =3D __swp_tb_get_count(old_tb); - if (count) + if (__swp_tb_get_count(old_tb)) folio_swapped =3D true; else need_free =3D true; /* If shadow is NULL, we sets an empty shadow. 
*/ - __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); + __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, + __swp_tb_get_flags(old_tb))); } while (++ci_off < ci_end); =20 folio->swap.val =3D 0; @@ -460,7 +467,7 @@ void __swap_cache_replace_folio(struct swap_cluster_inf= o *ci, do { old_tb =3D __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) !=3D ol= d); - __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_t= b))); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_flags(old_t= b))); } while (++ci_off < ci_end); =20 /* diff --git a/mm/swap_table.h b/mm/swap_table.h index 8415ffbe2b9c..6d3d773e1908 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -21,12 +21,14 @@ struct swap_table { * Swap table entry type and bits layouts: * * NULL: |---------------- 0 ---------------| - Free slot - * Shadow: | SWAP_COUNT |---- SHADOW_VAL ---|1| - Swapped out slot - * PFN: | SWAP_COUNT |------ PFN -------|10| - Cached slot + * Shadow: |SWAP_COUNT|Z|---- SHADOW_VAL ---|1| - Swapped out slot + * PFN: |SWAP_COUNT|Z|------ PFN -------|10| - Cached slot * Pointer: |----------- Pointer ----------|100| - (Unused) * Bad: |------------- 1 -------------|1000| - Bad slot * - * SWAP_COUNT is `SWP_TB_COUNT_BITS` long, each entry is an atomic long. + * COUNT is `SWP_TB_COUNT_BITS` long, Z is the `SWP_TB_ZERO_MARK` bit, + * and together they form the `SWP_TB_FLAGS_BITS` wide flags field. + * Each entry is an atomic long. 
* * Usages: * @@ -70,16 +72,21 @@ struct swap_table { #define SWP_TB_PFN_MARK_MASK (BIT(SWP_TB_PFN_MARK_BITS) - 1) =20 /* SWAP_COUNT part for PFN or shadow, the width can be shrunk or extended = */ -#define SWP_TB_COUNT_BITS min(4, BITS_PER_LONG - SWP_TB_PFN_BITS) +#define SWP_TB_FLAGS_BITS min(5, BITS_PER_LONG - SWP_TB_PFN_BITS) +#define SWP_TB_COUNT_BITS (SWP_TB_FLAGS_BITS - 1) +#define SWP_TB_FLAGS_MASK (~((~0UL) >> SWP_TB_FLAGS_BITS)) #define SWP_TB_COUNT_MASK (~((~0UL) >> SWP_TB_COUNT_BITS)) +#define SWP_TB_FLAGS_SHIFT (BITS_PER_LONG - SWP_TB_FLAGS_BITS) #define SWP_TB_COUNT_SHIFT (BITS_PER_LONG - SWP_TB_COUNT_BITS) #define SWP_TB_COUNT_MAX ((1 << SWP_TB_COUNT_BITS) - 1) =20 +#define SWP_TB_ZERO_MARK BIT(BITS_PER_LONG - SWP_TB_COUNT_BITS - 1) + /* Bad slot: ends with 0b1000 and rests of bits are all 1 */ #define SWP_TB_BAD ((~0UL) << 3) =20 /* Macro for shadow offset calculation */ -#define SWAP_COUNT_SHIFT SWP_TB_COUNT_BITS +#define SWAP_COUNT_SHIFT SWP_TB_FLAGS_BITS =20 /* * Helpers for casting one type of info into a swap table entry. 
@@ -102,35 +109,43 @@ static inline unsigned long __count_to_swp_tb(unsigne= d char count) return ((unsigned long)count) << SWP_TB_COUNT_SHIFT; } =20 -static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned int = count) +static inline unsigned long __flags_to_swp_tb(unsigned char flags) +{ + BUILD_BUG_ON(SWP_TB_FLAGS_BITS > BITS_PER_BYTE); + VM_WARN_ON((flags >> 1) > SWP_TB_COUNT_MAX); + return ((unsigned long)flags) << SWP_TB_FLAGS_SHIFT; +} + + +static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned char= flags) { unsigned long swp_tb; =20 BUILD_BUG_ON(sizeof(unsigned long) !=3D sizeof(void *)); BUILD_BUG_ON(SWAP_CACHE_PFN_BITS > - (BITS_PER_LONG - SWP_TB_PFN_MARK_BITS - SWP_TB_COUNT_BITS)); + (BITS_PER_LONG - SWP_TB_PFN_MARK_BITS - SWP_TB_FLAGS_BITS)); =20 swp_tb =3D (pfn << SWP_TB_PFN_MARK_BITS) | SWP_TB_PFN_MARK; - VM_WARN_ON_ONCE(swp_tb & SWP_TB_COUNT_MASK); + VM_WARN_ON_ONCE(swp_tb & SWP_TB_FLAGS_MASK); =20 - return swp_tb | __count_to_swp_tb(count); + return swp_tb | __flags_to_swp_tb(flags); } =20 -static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned = int count) +static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned = char flags) { - return pfn_to_swp_tb(folio_pfn(folio), count); + return pfn_to_swp_tb(folio_pfn(folio), flags); } =20 -static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned int co= unt) +static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned char f= lags) { BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) !=3D BITS_PER_BYTE * sizeof(unsigned long)); BUILD_BUG_ON((unsigned long)xa_mk_value(0) !=3D SWP_TB_SHADOW_MARK); =20 VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); - VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_COUNT_MASK)); + VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_FLAGS_MASK)); =20 - return (unsigned long)shadow | __count_to_swp_tb(count) | SWP_TB_SHADOW_M= ARK; + return (unsigned long)shadow | SWP_TB_SHADOW_MARK | 
__flags_to_swp_tb(fla= gs); } =20 /* @@ -168,14 +183,14 @@ static inline bool swp_tb_is_countable(unsigned long = swp_tb) static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_folio(swp_tb)); - return pfn_folio((swp_tb & ~SWP_TB_COUNT_MASK) >> SWP_TB_PFN_MARK_BITS); + return pfn_folio((swp_tb & ~SWP_TB_FLAGS_MASK) >> SWP_TB_PFN_MARK_BITS); } =20 static inline void *swp_tb_to_shadow(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); /* No shift needed, xa_value is stored as it is in the lower bits. */ - return (void *)(swp_tb & ~SWP_TB_COUNT_MASK); + return (void *)(swp_tb & ~SWP_TB_FLAGS_MASK); } =20 static inline unsigned char __swp_tb_get_count(unsigned long swp_tb) @@ -184,6 +199,12 @@ static inline unsigned char __swp_tb_get_count(unsigne= d long swp_tb) return ((swp_tb & SWP_TB_COUNT_MASK) >> SWP_TB_COUNT_SHIFT); } =20 +static inline unsigned char __swp_tb_get_flags(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + return ((swp_tb & SWP_TB_FLAGS_MASK) >> SWP_TB_FLAGS_SHIFT); +} + static inline int swp_tb_get_count(unsigned long swp_tb) { if (swp_tb_is_countable(swp_tb)) @@ -247,4 +268,54 @@ static inline unsigned long swap_table_get(struct swap= _cluster_info *ci, =20 return swp_tb; } + +static inline void __swap_table_set_zero(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + unsigned long swp_tb =3D __swap_table_get(ci, ci_off); + + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + swp_tb |=3D SWP_TB_ZERO_MARK; + __swap_table_set(ci, ci_off, swp_tb); +} + +static inline void __swap_table_clear_zero(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + unsigned long swp_tb =3D __swap_table_get(ci, ci_off); + + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + swp_tb &=3D ~SWP_TB_ZERO_MARK; + __swap_table_set(ci, ci_off, swp_tb); +} + +/** + * Return the count of contiguous swap entries that share the same + * zeromap status as the starting entry. 
If is_zerop is not NULL, + * it will return the zeromap status of the starting entry. + * + * Context: Caller must ensure the cluster containing the entries + * that will be checked won't be freed. + */ +static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, + bool *is_zerop) +{ + bool is_zero; + unsigned long swp_tb; + struct swap_cluster_info *ci =3D __swap_entry_to_cluster(entry); + unsigned int ci_start =3D swp_cluster_offset(entry), ci_off, ci_end; + + ci_off =3D ci_start; + ci_end =3D ci_off + max_nr; + swp_tb =3D swap_table_get(ci, ci_off); + is_zero =3D !!(swp_tb & SWP_TB_ZERO_MARK); + if (is_zerop) + *is_zerop =3D is_zero; + while (++ci_off < ci_end) { + swp_tb =3D swap_table_get(ci, ci_off); + if (is_zero !=3D !!(swp_tb & SWP_TB_ZERO_MARK)) + break; + } + return ci_off - ci_start; +} #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index de34f1990209..4018e8694b72 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -918,7 +918,7 @@ static bool __swap_cluster_alloc_entries(struct swap_in= fo_struct *si, nr_pages =3D 1; swap_cluster_assert_empty(ci, ci_off, 1, false); /* Sets a fake shadow as placeholder */ - __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1)); + __swap_table_set(ci, ci_off, __swp_tb_mk_count(shadow_to_swp_tb(NULL, 0)= , 1)); } else { /* Allocation without folio is only possible with hibernation */ WARN_ON_ONCE(1); @@ -1308,14 +1308,8 @@ static void swap_range_free(struct swap_info_struct = *si, unsigned long offset, void (*swap_slot_free_notify)(struct block_device *, unsigned long); unsigned int i; =20 - /* - * Use atomic clear_bit operations only on zeromap instead of non-atomic - * bitmap_clear to prevent adjacent bits corruption due to simultaneous w= rites. 
- */ - for (i =3D 0; i < nr_entries; i++) { - clear_bit(offset + i, si->zeromap); + for (i =3D 0; i < nr_entries; i++) zswap_invalidate(swp_entry(si->type, offset + i)); - } =20 if (si->flags & SWP_BLKDEV) swap_slot_free_notify =3D @@ -2921,7 +2915,6 @@ static void flush_percpu_swap_cluster(struct swap_inf= o_struct *si) SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p =3D NULL; - unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; struct address_space *mapping; @@ -3009,8 +3002,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, special= file) =20 swap_file =3D p->swap_file; p->swap_file =3D NULL; - zeromap =3D p->zeromap; - p->zeromap =3D NULL; maxpages =3D p->max; cluster_info =3D p->cluster_info; p->max =3D 0; @@ -3022,7 +3013,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, special= file) mutex_unlock(&swapon_mutex); kfree(p->global_cluster); p->global_cluster =3D NULL; - kvfree(zeromap); free_swap_cluster_info(cluster_info, maxpages); =20 inode =3D mapping->host; @@ -3555,17 +3545,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, special= file, int, swap_flags) if (error) goto bad_swap_unlock_inode; =20 - /* - * Use kvmalloc_array instead of bitmap_zalloc as the allocation order mi= ght - * be above MAX_PAGE_ORDER incase of a large swap file. - */ - si->zeromap =3D kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), - GFP_KERNEL | __GFP_ZERO); - if (!si->zeromap) { - error =3D -ENOMEM; - goto bad_swap_unlock_inode; - } - if (si->bdev && bdev_stable_writes(si->bdev)) si->flags |=3D SWP_STABLE_WRITES; =20 @@ -3667,8 +3646,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialf= ile, int, swap_flags) destroy_swap_extents(si, swap_file); free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info =3D NULL; - kvfree(si->zeromap); - si->zeromap =3D NULL; /* * Clear the SWP_USED flag after all resources are freed so * alloc_swap_info can reuse this si safely. 
--=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5F6AD33121F; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; cv=none; b=cWuu8QI2eHCY0eDspiIw+58XmWxrkSet9JLXiVP707IiXxlx9s3/sDsyIFGgi0bEe1HiT3qWr9C8Pdpc5kOTDfHoYem27a7wI/MU4nDt5qRww0/XlkptxruHvPsSScXH14UCSzgbnFSXcxq4z5kTv9rAB3tsf0LLJj8FNsFBMFs= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; c=relaxed/simple; bh=o5/yQb2bx9F4jYBwj0NlqpQdPtP0c9FDJlMawMBLCJU=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=AiKb1sjz5VJNUfbjCNnZBI0AFltmF6A+7JAzEqzOfexYQiXBafMb60MBVUUTUOPL8oeVTXOS9oVxzhlRD4MEQhOvmLi/+bRpZBKCHCAzJBGvkSwXSNv0NUCe5gtEDtviEdaGte3Kn6BLiB3RuxbdJwzJt30btN2scB1/BLrU6ro= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=P73uNUnN; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="P73uNUnN" Received: by smtp.kernel.org (Postfix) with ESMTPS id 41142C4CEF7; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544528; bh=o5/yQb2bx9F4jYBwj0NlqpQdPtP0c9FDJlMawMBLCJU=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=P73uNUnNnqdZK6PtkKON1gQbC8woiJYlIrHCDBHaRPw7fKvDTBnfkv0LCwkl253Zl ZR19/O64c0CD61Td5oVNAJOsnRJwjTqEyACSraTPc/3vmwI5wlj+uwNvVmtB2y0dV0 7dDjfikhnsui9/b5/7WAAg1s+WKliUBlxsWwwy7hGJFen9ZdRYsfHNTnzLks9li2YX 
r0vaN11/FoCGlWEbnIiVX3SXjRrnmxTMhJ9P8oXRBeNtdKD5OtpIEVl1siEEHxnWnD w5zBpbO9soIGtxw5A85v/rUIY03p38UKIDtzrBjtvdcHyLbsZw+tlBM8cTXNqsjCLG X/EqSLldo13SA== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 38A5CC531E3; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:14 +0800 Subject: [PATCH RFC 13/15] mm: ghost swapfile support for zswap Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-13-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=8943; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=ZmZgtJsn2gR54J687nHVHnXVV/ak1x2+yjhOddj8Wcg=; b=+pDVNWFeDBYtwUhtCIlBKrjwyp+Hgt8lbQaWIkI+yYC0tSduQHgFYrWs+EWMDZrAaio7Sx4J4 r0uPcr3fuRyA3iOVaI3ZiAkumw8RoaZuW1gzuWdKuH6iy+YyoAAo28I X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Chris Li The current zswap requires a backing swapfile. The swap slot used by zswap is not able to be used by the swapfile. 
That wastes swapfile space. The ghost swapfile is a swapfile that only contains the swapfile header for zswap. The swapfile header indicates the size of the swapfile. There is no swap data section in the ghost swapfile, therefore, no waste of swapfile space. As such, any write to a ghost swapfile will fail. To prevent accidental reads or writes of a ghost swapfile, bdev of swap_info_struct is set to NULL. Ghost swapfile will also set the SSD flag because there is no rotational disk access when using zswap. Zswap writeback is disabled if all swapfiles in the system are ghost swapfiles. Signed-off-by: Chris Li Signed-off-by: Kairui Song --- include/linux/swap.h | 2 ++ mm/page_io.c | 18 +++++++++++++++--- mm/swap.h | 2 +- mm/swapfile.c | 42 +++++++++++++++++++++++++++++++++++++----- mm/zswap.c | 12 +++++++++--- 5 files changed, 64 insertions(+), 12 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index bc871d8a1e99..3b2efd319f44 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -215,6 +215,7 @@ enum { SWP_PAGE_DISCARD =3D (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES =3D (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO =3D (1 << 12), /* synchronous IO is efficient */ + SWP_GHOST =3D (1 << 13), /* not backed by anything */ /* add others here before... 
*/ }; =20 @@ -419,6 +420,7 @@ void free_folio_and_swap_cache(struct folio *folio); void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; +extern atomic_t nr_real_swapfiles; extern long total_swap_pages; extern atomic_t nr_rotate_swap; =20 diff --git a/mm/page_io.c b/mm/page_io.c index 5a0b5034489b..f4a5fc0863f5 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -291,8 +291,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb= **swap_plug) return AOP_WRITEPAGE_ACTIVATE; } =20 - __swap_writepage(folio, swap_plug); - return 0; + return __swap_writepage(folio, swap_plug); out_unlock: folio_unlock(folio); return ret; @@ -454,11 +453,18 @@ static void swap_writepage_bdev_async(struct folio *f= olio, submit_bio(bio); } =20 -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_info_struct *sis =3D __swap_entry_to_info(folio->swap); =20 VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); + + if (sis->flags & SWP_GHOST) { + /* Prevent the page from getting reclaimed. */ + folio_set_dirty(folio); + return AOP_WRITEPAGE_ACTIVATE; + } + /* * ->flags can be updated non-atomically (scan_swap_map_slots), * but that will never affect SWP_FS_OPS, so the data_race @@ -475,6 +481,7 @@ void __swap_writepage(struct folio *folio, struct swap_= iocb **swap_plug) swap_writepage_bdev_sync(folio, sis); else swap_writepage_bdev_async(folio, sis); + return 0; } =20 void swap_write_unplug(struct swap_iocb *sio) @@ -649,6 +656,11 @@ void swap_read_folio(struct folio *folio, struct swap_= iocb **plug) if (zswap_load(folio) !=3D -ENOENT) goto finish; =20 + if (unlikely(sis->flags & SWP_GHOST)) { + folio_unlock(folio); + goto finish; + } + /* We have to read from slower devices. Increase zswap protection. 
*/ zswap_folio_swapin(folio); =20 diff --git a/mm/swap.h b/mm/swap.h index cb1ab20d83d5..55aa6d904afd 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -226,7 +226,7 @@ static inline void swap_read_unplug(struct swap_iocb *p= lug) } void swap_write_unplug(struct swap_iocb *sio); int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug); -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); +int __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); =20 /* linux/mm/swap_state.c */ extern struct address_space swap_space __read_mostly; diff --git a/mm/swapfile.c b/mm/swapfile.c index 4018e8694b72..65666c43cbd5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -67,6 +67,7 @@ static void move_cluster(struct swap_info_struct *si, static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; +atomic_t nr_real_swapfiles; /* * Some modules use swappable objects and may try to swap them out under * memory pressure (via the shrinker). 
Before doing so, they may wish to @@ -1211,6 +1212,8 @@ static void del_from_avail_list(struct swap_info_stru= ct *si, bool swapoff) goto skip; } =20 + if (!(si->flags & SWP_GHOST)) + atomic_sub(1, &nr_real_swapfiles); plist_del(&si->avail_list, &swap_avail_head); =20 skip: @@ -1253,6 +1256,8 @@ static void add_to_avail_list(struct swap_info_struct= *si, bool swapon) } =20 plist_add(&si->avail_list, &swap_avail_head); + if (!(si->flags & SWP_GHOST)) + atomic_add(1, &nr_real_swapfiles); =20 skip: spin_unlock(&swap_avail_lock); @@ -2793,6 +2798,11 @@ static int setup_swap_extents(struct swap_info_struc= t *sis, struct inode *inode =3D mapping->host; int ret; =20 + if (sis->flags & SWP_GHOST) { + *span =3D 0; + return 0; + } + if (S_ISBLK(inode->i_mode)) { ret =3D add_swap_extent(sis, 0, sis->max, 0); *span =3D sis->pages; @@ -2992,7 +3002,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, special= file) =20 destroy_swap_extents(p, p->swap_file); =20 - if (!(p->flags & SWP_SOLIDSTATE)) + if (!(p->flags & SWP_GHOST) && + !(p->flags & SWP_SOLIDSTATE)) atomic_dec(&nr_rotate_swap); =20 mutex_lock(&swapon_mutex); @@ -3102,6 +3113,19 @@ static void swap_stop(struct seq_file *swap, void *v) mutex_unlock(&swapon_mutex); } =20 +static const char *swap_type_str(struct swap_info_struct *si) +{ + struct file *file =3D si->swap_file; + + if (si->flags & SWP_GHOST) + return "ghost\t"; + + if (S_ISBLK(file_inode(file)->i_mode)) + return "partition"; + + return "file\t"; +} + static int swap_show(struct seq_file *swap, void *v) { struct swap_info_struct *si =3D v; @@ -3121,8 +3145,7 @@ static int swap_show(struct seq_file *swap, void *v) len =3D seq_file_path(swap, file, " \t\n\\"); seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", len < 40 ? 40 - len : 1, " ", - S_ISBLK(file_inode(file)->i_mode) ? - "partition" : "file\t", + swap_type_str(si), bytes, bytes < 10000000 ? "\t" : "", inuse, inuse < 10000000 ? 
"\t" : "", si->prio); @@ -3254,7 +3277,6 @@ static int claim_swapfile(struct swap_info_struct *si= , struct inode *inode) return 0; } =20 - /* * Find out how many pages are allowed for a single swap device. There * are two limiting factors: @@ -3300,6 +3322,7 @@ static unsigned long read_swap_header(struct swap_inf= o_struct *si, unsigned long maxpages; unsigned long swapfilepages; unsigned long last_page; + loff_t size; =20 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { pr_err("Unable to find swap-space signature\n"); @@ -3342,7 +3365,16 @@ static unsigned long read_swap_header(struct swap_in= fo_struct *si, =20 if (!maxpages) return 0; - swapfilepages =3D i_size_read(inode) >> PAGE_SHIFT; + + size =3D i_size_read(inode); + if (size =3D=3D PAGE_SIZE) { + /* Ghost swapfile */ + si->bdev =3D NULL; + si->flags |=3D SWP_GHOST | SWP_SOLIDSTATE; + return maxpages; + } + + swapfilepages =3D size >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { pr_warn("Swap area shorter than signature indicates\n"); return 0; diff --git a/mm/zswap.c b/mm/zswap.c index 5d83539a8bba..e470f697e770 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -995,11 +995,16 @@ static int zswap_writeback_entry(struct zswap_entry *= entry, struct swap_info_struct *si; int ret =3D 0; =20 - /* try to allocate swap cache folio */ si =3D get_swap_device(swpentry); if (!si) return -EEXIST; =20 + if (si->flags & SWP_GHOST) { + put_swap_device(si); + return -EINVAL; + } + + /* try to allocate swap cache folio */ mpol =3D get_task_policy(current); folio =3D swap_cache_alloc_folio(swpentry, GFP_KERNEL, 0, NULL, mpol, NO_INTERLEAVE_INDEX); @@ -1052,7 +1057,8 @@ static int zswap_writeback_entry(struct zswap_entry *= entry, folio_set_reclaim(folio); =20 /* start writeback */ - __swap_writepage(folio, NULL); + ret =3D __swap_writepage(folio, NULL); + WARN_ON_ONCE(ret); =20 out: if (ret) { @@ -1536,7 +1542,7 @@ bool zswap_store(struct folio *folio) zswap_pool_put(pool); put_objcg: 
obj_cgroup_put(objcg); - if (!ret && zswap_pool_reached_full) + if (!ret && zswap_pool_reached_full && atomic_read(&nr_real_swapfiles)) queue_work(shrink_wq, &zswap_shrink_work); check_old: /* --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6E50D331237; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; cv=none; b=YLYLVQ3PE5Et7OGYf76tG/87k3gzKi+g6iQZ6rnduaTcuceBMlobb4g/5MCI8lIph0aEHt2Crs+6yPtOH8SjjvWuiYoIJSW/az00QlX0rvGMGEurmha4nfbVrZCVnzHi+ZKf2TyLH+1P03g7vvU2P5znZNDvQ4X/WW8GqtnCwrI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; c=relaxed/simple; bh=yJRoofSZXDJaqGdGu9R8RavXCSOxdeInP5x/zkK9r8M=; h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=sFzkO2ysnAKdGnAtYmfqWYc97toWGjDRtic2/aQCxGs/AFyJtauYJPVKoJuKB+NVBxOKRkEl//anine1o6SDV3TEipXK2BaHz++B+VVklDjHUZ4fl9HN2nvCheFi1woZ/7dVzdZwHOTBi+Abq1wVwCT3izTLzlV4Ng7HclMjUSY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=DBuVZgXb; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="DBuVZgXb" Received: by smtp.kernel.org (Postfix) with ESMTPS id 5203BC19423; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544528; bh=yJRoofSZXDJaqGdGu9R8RavXCSOxdeInP5x/zkK9r8M=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; 
b=DBuVZgXb2JmAMtPPqJCDR4w7l0Pad0Y8BF2SKBdag28MYKZQTeqxA/FMzc9UGtCeF r4DQxwPdCxdTxQsjXdDL9yWstxBluFeLZgp05mEGXiJL4Hvrz3hfJAin1iQA4KT0MC HAJUkfiysvxCJDDpW1W2K3Oldr8+nA0m/zMGWYCSYIKSRTjOGalAoAS+BwDqygY4ZK M+5I6WSlsPvYMWXpn8pcdf0CWaeQFSIsup8RQ+WUBmSOU/ij+79N+gQUvQdtnUV/kE IQ+kcxyevprfYrBEEcVL8hTJ6miRSf/vXRdNYBeIgiIUPFOSW8L0p6TEPplW38XDov we/mb0gzQqwXw== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 49E7BC531EA; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:15 +0800 Subject: [PATCH RFC 14/15] mm, swap: add a special device for ghost swap setup Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-14-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=5352; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=O3BFoOFHolD8K6jdQ6r7SaL/46khBUmErm+aeFqPkUg=; b=eSqTz2oJlmVuUdj4GCRJ+RA8upcoyTgpzycyxnyD0Gvm4OVZAjVNFL2ffi01TJr2LW/3Zm4hW BR62/OWEVH2B9/MuTlXAxlzz7upTFjH/F4IHkz8+Sd28aGdViLoTF/3 X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with 
auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song Use /dev/ghostswap as a special device so userspace can setup ghost swap easily without any extra tools. Signed-off-by: Kairui Song --- drivers/char/mem.c | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/swap.h | 2 ++ mm/swapfile.c | 22 +++++++++++++++++++--- 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index cca4529431f8..8d0eb3f7d191 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -30,6 +30,7 @@ #include #include #include +#include =20 #define DEVMEM_MINOR 1 #define DEVPORT_MINOR 4 @@ -667,6 +668,41 @@ static const struct file_operations null_fops =3D { .uring_cmd =3D uring_cmd_null, }; =20 +#ifdef CONFIG_SWAP +static ssize_t read_ghostswap(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + union swap_header *hdr; + size_t to_copy; + + if (*ppos >=3D PAGE_SIZE) + return 0; + + hdr =3D kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!hdr) + return -ENOMEM; + + hdr->info.version =3D 1; + hdr->info.last_page =3D totalram_pages() - 1; + memcpy(hdr->magic.magic, "SWAPSPACE2", 10); + to_copy =3D min_t(size_t, count, PAGE_SIZE - *ppos); + if (copy_to_user(buf, (char *)hdr + *ppos, to_copy)) { + kfree(hdr); + return -EFAULT; + } + + kfree(hdr); + *ppos +=3D to_copy; + return to_copy; +} + +static const struct file_operations ghostswap_fops =3D { + .llseek =3D null_lseek, + .read =3D read_ghostswap, + .write =3D write_null, +}; +#endif + #ifdef CONFIG_DEVPORT static const struct file_operations port_fops =3D { .llseek =3D memory_lseek, @@ -718,6 +754,9 @@ static const struct memdev { #ifdef CONFIG_PRINTK [11] =3D { "kmsg", &kmsg_fops, 0, 0644 }, #endif +#ifdef CONFIG_SWAP + [DEVGHOST_MINOR] =3D { "ghostswap", &ghostswap_fops, 0, 0660 }, +#endif }; =20 static int memory_open(struct inode *inode, struct file *filp) diff --git a/include/linux/swap.h b/include/linux/swap.h index 
3b2efd319f44..b57a4a40f4fe 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -421,6 +421,8 @@ void free_pages_and_swap_cache(struct encoded_page **, = int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern atomic_t nr_real_swapfiles; + +#define DEVGHOST_MINOR 13 /* /dev/ghostswap char device minor */ extern long total_swap_pages; extern atomic_t nr_rotate_swap; =20 diff --git a/mm/swapfile.c b/mm/swapfile.c index 65666c43cbd5..d054f40ec75f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -42,6 +42,7 @@ #include #include #include +#include =20 #include #include @@ -1703,6 +1704,7 @@ int folio_alloc_swap(struct folio *folio) unsigned int size =3D 1 << order; struct swap_cluster_info *ci; =20 + VM_WARN_ON_FOLIO(folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); =20 @@ -3421,6 +3423,10 @@ static int setup_swap_clusters_info(struct swap_info= _struct *si, err =3D swap_cluster_setup_bad_slot(si, cluster_info, 0, false); if (err) goto err; + + if (!swap_header) + goto setup_cluster_info; + for (i =3D 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr =3D swap_header->info.badpages[i]; =20 @@ -3440,6 +3446,7 @@ static int setup_swap_clusters_info(struct swap_info_= struct *si, goto err; } =20 +setup_cluster_info: INIT_LIST_HEAD(&si->free_clusters); INIT_LIST_HEAD(&si->full_clusters); INIT_LIST_HEAD(&si->discard_clusters); @@ -3476,7 +3483,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialf= ile, int, swap_flags) struct dentry *dentry; int prio; int error; - union swap_header *swap_header; + union swap_header *swap_header =3D NULL; int nr_extents; sector_t span; unsigned long maxpages; @@ -3528,6 +3535,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, special= file, int, swap_flags) goto bad_swap_unlock_inode; } =20 + /* /dev/ghostswap: synthesize a ghost swap device. 
*/ + if (S_ISCHR(inode->i_mode) && + imajor(inode) =3D=3D MEM_MAJOR && iminor(inode) =3D=3D DEVGHOST_MINOR= ) { + maxpages =3D round_up(totalram_pages(), SWAPFILE_CLUSTER); + si->flags |=3D SWP_GHOST | SWP_SOLIDSTATE; + si->bdev =3D NULL; + goto setup; + } + /* * The swap subsystem needs a major overhaul to support this. * It doesn't work yet so just disable it for now. @@ -3550,13 +3566,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specia= lfile, int, swap_flags) goto bad_swap_unlock_inode; } swap_header =3D kmap_local_folio(folio, 0); - maxpages =3D read_swap_header(si, swap_header, inode); if (unlikely(!maxpages)) { error =3D -EINVAL; goto bad_swap_unlock_inode; } =20 +setup: si->max =3D maxpages; si->pages =3D maxpages - 1; nr_extents =3D setup_swap_extents(si, swap_file, &span); @@ -3585,7 +3601,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialf= ile, int, swap_flags) =20 if (si->bdev && bdev_nonrot(si->bdev)) { si->flags |=3D SWP_SOLIDSTATE; - } else { + } else if (!(si->flags & SWP_SOLIDSTATE)) { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap =3D true; } --=20 2.53.0 From nobody Fri Apr 3 09:51:29 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 81E633314B7; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; cv=none; b=k8EdeMQA96KmdBlhvYow9Fb4V5o9ukYF11xgtRygyDeLVVl9Hq9l+buK6a4PsnHTU1P8kGyCH2R5OAgf9/FGQ4frsYvCpWDnm7oZtRzdoxAhOMsExUPht5EzBu7ghNIE+GWnF+IdsEfokSpigafOvJL2YEwyvtWqvauq30l8A3Y= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1771544528; c=relaxed/simple; bh=/QtGbmL1TXC2oAX2Iuptbz2C/1byQSYgNfnZVKvjNtY=; 
h=From:Date:Subject:MIME-Version:Content-Type:Message-Id:References: In-Reply-To:To:Cc; b=JKWC9ZssD59ETpV69Tq2Cq5Imq9Eg+r7tnnpH8/8wDov+AmULGjJZ3VFggBS1kzYLCfHbCm31BdzoLbSSqjTIYJjcCkE0sIRYwjOkZuotPQdyoLzuAUKBXuOD9Q5hcqoIyxzcACvySCWBkAfeQDBjnGX0j+9HyBTo6jTuZU/9AY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=KQB2OPOB; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="KQB2OPOB" Received: by smtp.kernel.org (Postfix) with ESMTPS id 64D08C2BCAF; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1771544528; bh=/QtGbmL1TXC2oAX2Iuptbz2C/1byQSYgNfnZVKvjNtY=; h=From:Date:Subject:References:In-Reply-To:To:Cc:Reply-To:From; b=KQB2OPOBM/pjAAk9pZ4JITH0OF9/923f2NJF43QpK5OC1bRKo/MqYSGoguMpcWjRV ezfMmrnDJCa9avsk30iBcqH1zsgYReHQKpbnvkpYAXyJ7+itUnyA1jnhHQJlYnQT5o F5DxQhBt7G5ilOTuwuUIscOGTm/hEYCtTOGr9y5NCnMF/S+PMJXLw/3gbT81KyabTQ 7xScPg3ee4g3nseIs2H48OOg61uP5UmtVKErm/Pa0UpqW6YTF8/y3VN4f9k2Ycr4jo FPsn/RIU36S1xbMn9KV1KzUzyd4AYlRX827kHtD/gA2tk9PmYKzMnwZ8g3kEAkjOTA VL8tKd54XT78A== Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id 5C662C531EB; Thu, 19 Feb 2026 23:42:08 +0000 (UTC) From: Kairui Song via B4 Relay Date: Fri, 20 Feb 2026 07:42:16 +0800 Subject: [PATCH RFC 15/15] mm, swap: allocate cluster dynamically for ghost swapfile Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Message-Id: <20260220-swap-table-p4-v1-15-104795d19815@tencent.com> References: <20260220-swap-table-p4-v1-0-104795d19815@tencent.com> In-Reply-To: 
<20260220-swap-table-p4-v1-0-104795d19815@tencent.com> To: linux-mm@kvack.org Cc: Andrew Morton , David Hildenbrand , Lorenzo Stoakes , Zi Yan , Baolin Wang , Barry Song , Hugh Dickins , Chris Li , Kemeng Shi , Nhat Pham , Baoquan He , Johannes Weiner , Yosry Ahmed , Youngjun Park , Chengming Zhou , Roman Gushchin , Shakeel Butt , Muchun Song , Qi Zheng , linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, Kairui Song X-Mailer: b4 0.14.3 X-Developer-Signature: v=1; a=ed25519-sha256; t=1771544524; l=19753; i=kasong@tencent.com; s=kasong-sign-tencent; h=from:subject:message-id; bh=u9csOIKG9VriVNBt5F3fiF35yiPdu1SM5al5oGisUCo=; b=j9KplqMb0u3hORs3FRAbIrR2GIN8gWkTW7vVNI1catIXs2JB76IZ4taZdDfUTyG5R8KF2tiLx gBiqJB5+ZdgBPoP2BUFi73FFXXiMuj1fHoA9SOBNYeHkmYIsqWeZiuI X-Developer-Key: i=kasong@tencent.com; a=ed25519; pk=kCdoBuwrYph+KrkJnrr7Sm1pwwhGDdZKcKrqiK8Y1mI= X-Endpoint-Received: by B4 Relay for kasong@tencent.com/kasong-sign-tencent with auth_id=562 X-Original-From: Kairui Song Reply-To: kasong@tencent.com From: Kairui Song Now, the ghost swap file is completely dynamic. For easier testing, this commit makes /dev/ghostswap 8 times the size of total RAM by default. NOTE: This commit is still a minimal proof of concept, so many parts of the implementation can be improved. And we have a ci_dyn->virtual_table that is ready to be used (not used yet), for example, for storing zswap's metadata. In theory the folio lock can be used to stabilize its virtual table data. For example, swap entry writeback can also be done easily using a folio_realloc_swap: skip the folio->swap's device and use the underlying devices. It will be easier to do if we remove the global percpu cluster cache as suggested by [1], and it should just work with tiering and priority. Just put the folio->swap as a reverse entry in the lower layer's swap table, and collect the lower level's swap entry in the virtual_table, then it's all good. 
Right now all allocations are atomic (GFP_ATOMIC), which can also be improved: the swap table already has sleeping allocation support, it just needs to be adapted. The RCU lock protection convention can also be simplified. But even without all that, this works pretty well. We can have a "virtual swap" of any size with zero overhead, common stress tests show very good performance, ordinary swaps have zero overhead, and everything is runtime configurable. But don't be too surprised if some corner cases are not well covered yet, as most of the work is still focused on the infrastructure. Link: https://lore.kernel.org/linux-mm/20260126065242.1221862-5-youngjun.pa= rk@lge.com/ [1] Signed-off-by: Kairui Song --- include/linux/swap.h | 1 + mm/swap.h | 44 +++++++++++++--- mm/swap_state.c | 35 ++++++++----- mm/swap_table.h | 2 + mm/swapfile.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++= ---- 5 files changed, 199 insertions(+), 28 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index b57a4a40f4fe..41d7eae56d65 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -284,6 +284,7 @@ struct swap_info_struct { struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_list; /* entry in swap_avail_head */ + struct xarray cluster_info_pool; /* Xarray for ghost swap cluster info */ }; =20 static inline swp_entry_t page_swap_entry(struct page *page) diff --git a/mm/swap.h b/mm/swap.h index 55aa6d904afd..7a4d1d939842 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,6 +41,13 @@ struct swap_cluster_info { struct list_head list; }; =20 +struct swap_cluster_info_dynamic { + struct swap_cluster_info ci; /* Underlying cluster info */ + unsigned int index; /* for cluster_index() */ + struct rcu_head rcu; /* For kfree_rcu deferred free */ + /* unsigned long *virtual_table; And we can easily have a virtual table */ +}; + /* All on-list cluster must have a non-zero flag.
*/ enum swap_cluster_flags { CLUSTER_FLAG_NONE =3D 0, /* For temporary off-list cluster */ @@ -51,6 +58,7 @@ enum swap_cluster_flags { CLUSTER_FLAG_USABLE =3D CLUSTER_FLAG_FRAG, CLUSTER_FLAG_FULL, CLUSTER_FLAG_DISCARD, + CLUSTER_FLAG_DEAD, /* Ghost cluster pending kfree_rcu */ CLUSTER_FLAG_MAX, }; =20 @@ -84,9 +92,19 @@ static inline struct swap_info_struct *__swap_entry_to_i= nfo(swp_entry_t entry) static inline struct swap_cluster_info *__swap_offset_to_cluster( struct swap_info_struct *si, pgoff_t offset) { + unsigned int cluster_idx =3D offset / SWAPFILE_CLUSTER; + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ VM_WARN_ON_ONCE(offset >=3D roundup(si->max, SWAPFILE_CLUSTER)); - return &si->cluster_info[offset / SWAPFILE_CLUSTER]; + + if (si->flags & SWP_GHOST) { + struct swap_cluster_info_dynamic *ci_dyn; + + ci_dyn =3D xa_load(&si->cluster_info_pool, cluster_idx); + return ci_dyn ? &ci_dyn->ci : NULL; + } + + return &si->cluster_info[cluster_idx]; } =20 static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_= t entry) @@ -98,7 +116,7 @@ static inline struct swap_cluster_info *__swap_entry_to_= cluster(swp_entry_t entr static __always_inline struct swap_cluster_info *__swap_cluster_lock( struct swap_info_struct *si, unsigned long offset, bool irq) { - struct swap_cluster_info *ci =3D __swap_offset_to_cluster(si, offset); + struct swap_cluster_info *ci; =20 /* * Nothing modifies swap cache in an IRQ context. 
All access to @@ -111,10 +129,24 @@ static __always_inline struct swap_cluster_info *__sw= ap_cluster_lock( */ VM_WARN_ON_ONCE(!in_task()); VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ - if (irq) - spin_lock_irq(&ci->lock); - else - spin_lock(&ci->lock); + + rcu_read_lock(); + ci =3D __swap_offset_to_cluster(si, offset); + if (ci) { + if (irq) + spin_lock_irq(&ci->lock); + else + spin_lock(&ci->lock); + + if (ci->flags =3D=3D CLUSTER_FLAG_DEAD) { + if (irq) + spin_unlock_irq(&ci->lock); + else + spin_unlock(&ci->lock); + ci =3D NULL; + } + } + rcu_read_unlock(); return ci; } =20 diff --git a/mm/swap_state.c b/mm/swap_state.c index 419419e18a47..1c3600a93ecd 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -90,8 +90,10 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) struct folio *folio; =20 for (;;) { + rcu_read_lock(); swp_tb =3D swap_table_get(__swap_entry_to_cluster(entry), swp_cluster_offset(entry)); + rcu_read_unlock(); if (!swp_tb_is_folio(swp_tb)) return NULL; folio =3D swp_tb_to_folio(swp_tb); @@ -113,8 +115,10 @@ bool swap_cache_has_folio(swp_entry_t entry) { unsigned long swp_tb; =20 + rcu_read_lock(); swp_tb =3D swap_table_get(__swap_entry_to_cluster(entry), swp_cluster_offset(entry)); + rcu_read_unlock(); return swp_tb_is_folio(swp_tb); } =20 @@ -130,8 +134,10 @@ void *swap_cache_get_shadow(swp_entry_t entry) { unsigned long swp_tb; =20 + rcu_read_lock(); swp_tb =3D swap_table_get(__swap_entry_to_cluster(entry), swp_cluster_offset(entry)); + rcu_read_unlock(); if (swp_tb_is_shadow(swp_tb)) return swp_tb_to_shadow(swp_tb); return NULL; @@ -209,14 +215,14 @@ void __swap_cache_add_folio(struct swap_cluster_info = *ci, lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); } =20 -static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, - swp_entry_t targ_entry, gfp_t gfp, +static struct folio *__swap_cache_alloc(swp_entry_t targ_entry, gfp_t gfp, unsigned int order, struct vm_fault *vmf, struct mempolicy 
*mpol, pgoff_t ilx) { int err; swp_entry_t entry; struct folio *folio; + struct swap_cluster_info *ci; void *shadow =3D NULL, *shadow_check =3D NULL; unsigned long address, nr_pages =3D 1 << order; unsigned int ci_off, ci_targ =3D swp_cluster_offset(targ_entry); @@ -225,9 +231,12 @@ static struct folio *__swap_cache_alloc(struct swap_cl= uster_info *ci, ci_off =3D round_down(ci_targ, nr_pages); =20 /* First check if the range is available */ - spin_lock(&ci->lock); - err =3D __swap_cache_check_batch(ci, entry, ci_off, ci_targ, nr_pages, &s= hadow); - spin_unlock(&ci->lock); + err =3D -ENOENT; + ci =3D swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + if (ci) { + err =3D __swap_cache_check_batch(ci, entry, ci_off, ci_targ, nr_pages, &= shadow); + swap_cluster_unlock(ci); + } if (unlikely(err)) return ERR_PTR(err); =20 @@ -243,10 +252,13 @@ static struct folio *__swap_cache_alloc(struct swap_c= luster_info *ci, return ERR_PTR(-ENOMEM); =20 /* Double check the range is still not in conflict */ - spin_lock(&ci->lock); - err =3D __swap_cache_check_batch(ci, entry, ci_off, ci_targ, nr_pages, &s= hadow_check); + err =3D -ENOENT; + ci =3D swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + if (ci) + err =3D __swap_cache_check_batch(ci, entry, ci_off, ci_targ, nr_pages, &= shadow_check); if (unlikely(err) || shadow_check !=3D shadow) { - spin_unlock(&ci->lock); + if (ci) + swap_cluster_unlock(ci); folio_put(folio); =20 /* If shadow changed, just try again */ @@ -256,13 +268,14 @@ static struct folio *__swap_cache_alloc(struct swap_c= luster_info *ci, __folio_set_locked(folio); __folio_set_swapbacked(folio); __swap_cache_add_folio(ci, folio, entry); - spin_unlock(&ci->lock); + swap_cluster_unlock(ci); =20 /* With swap table, we must have a shadow, for memcg tracking */ WARN_ON(!shadow); =20 if (mem_cgroup_swapin_charge_folio(folio, vmf ? 
vmf->vma->vm_mm : NULL, gfp, shadow_to_memcgid(shadow))) { + /* The folio pins the cluster */ spin_lock(&ci->lock); __swap_cache_del_folio(ci, folio, shadow, false, false); spin_unlock(&ci->lock); @@ -305,13 +318,11 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ= _entry, gfp_t gfp_mask, { int order, err; struct folio *folio; - struct swap_cluster_info *ci; =20 /* Always allow order 0 so swap won't fail under pressure. */ order =3D orders ? highest_order(orders |=3D BIT(0)) : 0; - ci =3D __swap_entry_to_cluster(targ_entry); for (;;) { - folio =3D __swap_cache_alloc(ci, targ_entry, gfp_mask, order, + folio =3D __swap_cache_alloc(targ_entry, gfp_mask, order, vmf, mpol, ilx); if (!IS_ERR(folio)) return folio; diff --git a/mm/swap_table.h b/mm/swap_table.h index 6d3d773e1908..867bcfff0e3c 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -260,6 +260,8 @@ static inline unsigned long swap_table_get(struct swap_= cluster_info *ci, unsigned long swp_tb; =20 VM_WARN_ON_ONCE(off >=3D SWAPFILE_CLUSTER); + if (!ci) + return SWP_TB_NULL; =20 rcu_read_lock(); table =3D rcu_dereference(ci->table); diff --git a/mm/swapfile.c b/mm/swapfile.c index d054f40ec75f..f0682c8c8f53 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -404,6 +404,8 @@ static inline bool cluster_is_usable(struct swap_cluste= r_info *ci, int order) static inline unsigned int cluster_index(struct swap_info_struct *si, struct swap_cluster_info *ci) { + if (si->flags & SWP_GHOST) + return container_of(ci, struct swap_cluster_info_dynamic, ci)->index; return ci - si->cluster_info; } =20 @@ -708,6 +710,22 @@ static void free_cluster(struct swap_info_struct *si, = struct swap_cluster_info * return; } =20 + if (si->flags & SWP_GHOST) { + struct swap_cluster_info_dynamic *ci_dyn; + + ci_dyn =3D container_of(ci, struct swap_cluster_info_dynamic, ci); + if (ci->flags !=3D CLUSTER_FLAG_NONE) { + spin_lock(&si->lock); + list_del(&ci->list); + spin_unlock(&si->lock); + } + swap_cluster_free_table(ci); + 
xa_erase(&si->cluster_info_pool, ci_dyn->index); + ci->flags =3D CLUSTER_FLAG_DEAD; + kfree_rcu(ci_dyn, rcu); + return; + } + __free_cluster(si, ci); } =20 @@ -814,15 +832,17 @@ static int swap_cluster_setup_bad_slot(struct swap_in= fo_struct *si, * stolen by a lower order). @usable will be set to false if that happens. */ static bool cluster_reclaim_range(struct swap_info_struct *si, - struct swap_cluster_info *ci, + struct swap_cluster_info **pcip, unsigned long start, unsigned int order, bool *usable) { + struct swap_cluster_info *ci =3D *pcip; unsigned int nr_pages =3D 1 << order; unsigned long offset =3D start, end =3D start + nr_pages; unsigned long swp_tb; =20 spin_unlock(&ci->lock); + rcu_read_lock(); do { swp_tb =3D swap_table_get(ci, offset % SWAPFILE_CLUSTER); if (swp_tb_get_count(swp_tb)) @@ -831,7 +851,15 @@ static bool cluster_reclaim_range(struct swap_info_str= uct *si, if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) break; } while (++offset < end); - spin_lock(&ci->lock); + rcu_read_unlock(); + + /* Re-lookup: ghost cluster may have been freed while lock was dropped */ + ci =3D swap_cluster_lock(si, start); + *pcip =3D ci; + if (!ci) { + *usable =3D false; + return false; + } =20 /* * We just dropped ci->lock so cluster could be used by another @@ -979,7 +1007,8 @@ static unsigned int alloc_swap_scan_cluster(struct swa= p_info_struct *si, if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) continue; if (need_reclaim) { - ret =3D cluster_reclaim_range(si, ci, offset, order, &usable); + ret =3D cluster_reclaim_range(si, &ci, offset, order, + &usable); if (!usable) goto out; if (cluster_is_empty(ci)) @@ -1005,8 +1034,10 @@ static unsigned int alloc_swap_scan_cluster(struct s= wap_info_struct *si, * should use a new cluster, and move the failed cluster to where it * should be. 
*/ - relocate_cluster(si, ci); - swap_cluster_unlock(ci); + if (ci) { + relocate_cluster(si, ci); + swap_cluster_unlock(ci); + } if (si->flags & SWP_SOLIDSTATE) { this_cpu_write(percpu_swap_cluster.offset[order], next); this_cpu_write(percpu_swap_cluster.si[order], si); @@ -1038,6 +1069,44 @@ static unsigned int alloc_swap_scan_list(struct swap= _info_struct *si, return found; } =20 +static unsigned int alloc_swap_scan_dynamic(struct swap_info_struct *si, + struct folio *folio) +{ + struct swap_cluster_info_dynamic *ci_dyn; + struct swap_cluster_info *ci; + struct swap_table *table; + unsigned long offset; + + WARN_ON(!(si->flags & SWP_GHOST)); + + ci_dyn =3D kzalloc(sizeof(*ci_dyn), GFP_ATOMIC); + if (!ci_dyn) + return SWAP_ENTRY_INVALID; + + table =3D swap_table_alloc(GFP_ATOMIC); + if (!table) { + kfree(ci_dyn); + return SWAP_ENTRY_INVALID; + } + + spin_lock_init(&ci_dyn->ci.lock); + INIT_LIST_HEAD(&ci_dyn->ci.list); + rcu_assign_pointer(ci_dyn->ci.table, table); + + if (xa_alloc(&si->cluster_info_pool, &ci_dyn->index, ci_dyn, + XA_LIMIT(1, DIV_ROUND_UP(si->max, SWAPFILE_CLUSTER) - 1), + GFP_ATOMIC)) { + swap_table_free(table); + kfree(ci_dyn); + return SWAP_ENTRY_INVALID; + } + + ci =3D &ci_dyn->ci; + spin_lock(&ci->lock); + offset =3D cluster_offset(si, ci); + return alloc_swap_scan_cluster(si, ci, folio, offset); +} + static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool f= orce) { long to_scan =3D 1; @@ -1060,7 +1129,9 @@ static void swap_reclaim_full_clusters(struct swap_in= fo_struct *si, bool force) spin_unlock(&ci->lock); nr_reclaim =3D __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); - spin_lock(&ci->lock); + ci =3D swap_cluster_lock(si, offset); + if (!ci) + goto next; if (nr_reclaim) { offset +=3D abs(nr_reclaim); continue; @@ -1074,6 +1145,7 @@ static void swap_reclaim_full_clusters(struct swap_in= fo_struct *si, bool force) relocate_cluster(si, ci); =20 swap_cluster_unlock(ci); +next: if (to_scan <=3D 0) break; } @@ -1136,6 
+1208,12 @@ static unsigned long cluster_alloc_swap_entry(struct= swap_info_struct *si, goto done; } =20 + if (si->flags & SWP_GHOST) { + found =3D alloc_swap_scan_dynamic(si, folio); + if (found) + goto done; + } + if (!(si->flags & SWP_PAGE_DISCARD)) { found =3D alloc_swap_scan_list(si, &si->free_clusters, folio, false); if (found) @@ -1375,7 +1453,8 @@ static bool swap_alloc_fast(struct folio *folio) return false; =20 ci =3D swap_cluster_lock(si, offset); - alloc_swap_scan_cluster(si, ci, folio, offset); + if (ci) + alloc_swap_scan_cluster(si, ci, folio, offset); put_swap_device(si); return folio_test_swapcache(folio); } @@ -1476,6 +1555,7 @@ int swap_retry_table_alloc(swp_entry_t entry, gfp_t g= fp) if (!si) return 0; =20 + /* Entry is in use (being faulted in), so its cluster is alive. */ ci =3D __swap_offset_to_cluster(si, offset); ret =3D swap_extend_table_alloc(si, ci, gfp); =20 @@ -1996,6 +2076,7 @@ bool folio_maybe_swapped(struct folio *folio) VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); =20 + /* Folio is locked and in swap cache, so ci->count > 0: cluster is alive.= */ ci =3D __swap_entry_to_cluster(entry); ci_off =3D swp_cluster_offset(entry); ci_end =3D ci_off + folio_nr_pages(folio); @@ -2124,7 +2205,8 @@ swp_entry_t swap_alloc_hibernation_slot(int type) pcp_offset =3D this_cpu_read(percpu_swap_cluster.offset[0]); if (pcp_si =3D=3D si && pcp_offset) { ci =3D swap_cluster_lock(si, pcp_offset); - offset =3D alloc_swap_scan_cluster(si, ci, NULL, pcp_offset); + if (ci) + offset =3D alloc_swap_scan_cluster(si, ci, NULL, pcp_offset); } if (offset =3D=3D SWAP_ENTRY_INVALID) offset =3D cluster_alloc_swap_entry(si, NULL); @@ -2413,8 +2495,10 @@ static int unuse_pte_range(struct vm_area_struct *vm= a, pmd_t *pmd, &vmf); } if (!folio) { + rcu_read_lock(); swp_tb =3D swap_table_get(__swap_entry_to_cluster(entry), swp_cluster_offset(entry)); + rcu_read_unlock(); if (swp_tb_get_count(swp_tb) 
<=3D 0) continue; return -ENOMEM; @@ -2560,8 +2644,10 @@ static unsigned int find_next_to_unuse(struct swap_i= nfo_struct *si, * allocations from this area (while holding swap_lock). */ for (i =3D prev + 1; i < si->max; i++) { + rcu_read_lock(); swp_tb =3D swap_table_get(__swap_offset_to_cluster(si, i), i % SWAPFILE_CLUSTER); + rcu_read_unlock(); if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb)) break; if ((i % LATENCY_LIMIT) =3D=3D 0) @@ -2874,6 +2960,8 @@ static void wait_for_allocation(struct swap_info_stru= ct *si) struct swap_cluster_info *ci; =20 BUG_ON(si->flags & SWP_WRITEOK); + if (si->flags & SWP_GHOST) + return; =20 for (offset =3D 0; offset < end; offset +=3D SWAPFILE_CLUSTER) { ci =3D swap_cluster_lock(si, offset); @@ -3394,10 +3482,47 @@ static int setup_swap_clusters_info(struct swap_inf= o_struct *si, unsigned long maxpages) { unsigned long nr_clusters =3D DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - struct swap_cluster_info *cluster_info; + struct swap_cluster_info *cluster_info =3D NULL; + struct swap_cluster_info_dynamic *ci_dyn; int err =3D -ENOMEM; unsigned long i; =20 + /* For SWP_GHOST files, initialize Xarray pool instead of static array */ + if (si->flags & SWP_GHOST) { + /* + * Pre-allocate cluster 0 and mark slot 0 (header page) + * as bad so the allocator never hands out page offset 0. 
+ */ + ci_dyn =3D kzalloc(sizeof(*ci_dyn), GFP_KERNEL); + if (!ci_dyn) + goto err; + spin_lock_init(&ci_dyn->ci.lock); + INIT_LIST_HEAD(&ci_dyn->ci.list); + + nr_clusters =3D 0; + xa_init_flags(&si->cluster_info_pool, XA_FLAGS_ALLOC); + err =3D xa_insert(&si->cluster_info_pool, 0, ci_dyn, GFP_KERNEL); + if (err) { + kfree(ci_dyn); + goto err; + } + + err =3D swap_cluster_setup_bad_slot(si, &ci_dyn->ci, 0, false); + if (err) { + struct swap_table *table; + + xa_erase(&si->cluster_info_pool, 0); + table =3D (void *)rcu_dereference_protected(ci_dyn->ci.table, true); + if (table) + swap_table_free(table); + kfree(ci_dyn); + xa_destroy(&si->cluster_info_pool); + goto err; + } + + goto setup_cluster_info; + } + cluster_info =3D kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) goto err; @@ -3538,7 +3663,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialf= ile, int, swap_flags) /* /dev/ghostswap: synthesize a ghost swap device. */ if (S_ISCHR(inode->i_mode) && imajor(inode) =3D=3D MEM_MAJOR && iminor(inode) =3D=3D DEVGHOST_MINOR= ) { - maxpages =3D round_up(totalram_pages(), SWAPFILE_CLUSTER); + maxpages =3D round_up(totalram_pages(), SWAPFILE_CLUSTER) * 8; si->flags |=3D SWP_GHOST | SWP_SOLIDSTATE; si->bdev =3D NULL; goto setup; --=20 2.53.0