From nobody Tue Dec 16 07:10:52 2025
From: Pankaj Raghav
To: Suren Baghdasaryan ,
Ryan Roberts ,
Vlastimil Babka ,
Baolin Wang ,
Borislav Petkov ,
Ingo Molnar ,
"H . Peter Anvin" ,
Zi Yan ,
Mike Rapoport ,
Dave Hansen ,
Michal Hocko ,
David Hildenbrand ,
Lorenzo Stoakes ,
Andrew Morton ,
Thomas Gleixner ,
Nico Pache ,
Dev Jain ,
"Liam R . Howlett" ,
Jens Axboe
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
linux-block@vger.kernel.org,
willy@infradead.org,
x86@kernel.org,
linux-fsdevel@vger.kernel.org,
"Darrick J . Wong" ,
mcgrof@kernel.org,
gost.dev@samsung.com,
kernel@pankajraghav.com,
hch@lst.de,
Pankaj Raghav
Subject: [RFC 1/3] mm: move huge_zero_folio from huge_memory.c to memory.c
Date: Tue, 27 May 2025 07:04:50 +0200
Message-ID: <20250527050452.817674-2-p.raghav@samsung.com>
In-Reply-To: <20250527050452.817674-1-p.raghav@samsung.com>
References: <20250527050452.817674-1-p.raghav@samsung.com>
Content-Type: text/plain; charset="utf-8"
The huge_zero_folio was initially placed in huge_memory.c as most of its
users were in that file. But it does not depend on THP, so it can just as
well live in memory.c.

As huge_zero_folio is going to be exposed to more users outside of mm,
move it to memory.c.

This is a prep patch for adding CONFIG_STATIC_PMD_ZERO_PAGE. No functional
changes.
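As an illustration of what this enables (not part of the patch; the helper
below is a made-up sketch), a user outside of THP code can now reach the
zero-folio helpers through linux/mm.h alone:

	#include <linux/mm.h>	/* is_huge_zero_folio() lives here after this patch */

	/* hypothetical caller, for illustration only */
	static bool backed_by_huge_zero(const struct folio *folio)
	{
		return is_huge_zero_folio(folio);
	}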
Suggested-by: David Hildenbrand
Signed-off-by: Pankaj Raghav
---
include/linux/huge_mm.h | 16 ------
include/linux/mm.h | 16 ++++++
mm/huge_memory.c | 105 +---------------------------------------
mm/memory.c | 99 +++++++++++++++++++++++++++++++++++++
4 files changed, 117 insertions(+), 119 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2f190c90192d..d48973a6bd0f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -478,22 +478,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
-extern struct folio *huge_zero_folio;
-extern unsigned long huge_zero_pfn;
-
-static inline bool is_huge_zero_folio(const struct folio *folio)
-{
- return READ_ONCE(huge_zero_folio) == folio;
-}
-
-static inline bool is_huge_zero_pmd(pmd_t pmd)
-{
- return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
-}
-
-struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
-void mm_put_huge_zero_folio(struct mm_struct *mm);
-
static inline bool thp_migration_supported(void)
{
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cd2e513189d6..58d150dfc2da 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -69,6 +69,22 @@ static inline void totalram_pages_add(long count)
 
extern void * high_memory;
 
+extern struct folio *huge_zero_folio;
+extern unsigned long huge_zero_pfn;
+
+static inline bool is_huge_zero_folio(const struct folio *folio)
+{
+ return READ_ONCE(huge_zero_folio) == folio;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
+}
+
+struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
+void mm_put_huge_zero_folio(struct mm_struct *mm);
+
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d3e66136e41a..c6e203abb2de 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -75,9 +75,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc);
static bool split_underused_thp = true;
 
-static atomic_t huge_zero_refcount;
-struct folio *huge_zero_folio __read_mostly;
-unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
@@ -208,88 +205,6 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
return orders;
}
 
-static bool get_huge_zero_page(void)
-{
- struct folio *zero_folio;
-retry:
- if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
- return true;
-
- zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
- HPAGE_PMD_ORDER);
- if (!zero_folio) {
- count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
- return false;
- }
- /* Ensure zero folio won't have large_rmappable flag set. */
- folio_clear_large_rmappable(zero_folio);
- preempt_disable();
- if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
- preempt_enable();
- folio_put(zero_folio);
- goto retry;
- }
- WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
-
- /* We take additional reference here. It will be put back by shrinker */
- atomic_set(&huge_zero_refcount, 2);
- preempt_enable();
- count_vm_event(THP_ZERO_PAGE_ALLOC);
- return true;
-}
-
-static void put_huge_zero_page(void)
-{
- /*
- * Counter should never go to zero here. Only shrinker can put
- * last reference.
- */
- BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
-}
-
-struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
-{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- return READ_ONCE(huge_zero_folio);
-
- if (!get_huge_zero_page())
- return NULL;
-
- if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
-
- return READ_ONCE(huge_zero_folio);
-}
-
-void mm_put_huge_zero_folio(struct mm_struct *mm)
-{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
-}
-
-static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- /* we can free zero page only if last reference remains */
- return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
-}
-
-static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
- struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
- BUG_ON(zero_folio == NULL);
- WRITE_ONCE(huge_zero_pfn, ~0UL);
- folio_put(zero_folio);
- return HPAGE_PMD_NR;
- }
-
- return 0;
-}
-
-static struct shrinker *huge_zero_page_shrinker;
-
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -850,22 +765,12 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 
static int __init thp_shrinker_init(void)
{
- huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
- if (!huge_zero_page_shrinker)
- return -ENOMEM;
-
deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
SHRINKER_MEMCG_AWARE |
SHRINKER_NONSLAB,
"thp-deferred_split");
- if (!deferred_split_shrinker) {
- shrinker_free(huge_zero_page_shrinker);
+ if (!deferred_split_shrinker)
return -ENOMEM;
- }
-
- huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
- huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
- shrinker_register(huge_zero_page_shrinker);
 
deferred_split_shrinker->count_objects = deferred_split_count;
deferred_split_shrinker->scan_objects = deferred_split_scan;
@@ -874,12 +779,6 @@ static int __init thp_shrinker_init(void)
return 0;
}
 
-static void __init thp_shrinker_exit(void)
-{
- shrinker_free(huge_zero_page_shrinker);
- shrinker_free(deferred_split_shrinker);
-}
-
static int __init hugepage_init(void)
{
int err;
@@ -923,7 +822,7 @@ static int __init hugepage_init(void)
 
return 0;
err_khugepaged:
- thp_shrinker_exit();
+ shrinker_free(deferred_split_shrinker);
err_shrinker:
khugepaged_destroy();
err_slab:
diff --git a/mm/memory.c b/mm/memory.c
index 5cb48f262ab0..11edc4d66e74 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -159,6 +159,105 @@ static int __init init_zero_pfn(void)
}
early_initcall(init_zero_pfn);
 
+static atomic_t huge_zero_refcount;
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+static struct shrinker *huge_zero_page_shrinker;
+
+static bool get_huge_zero_page(void)
+{
+ struct folio *zero_folio;
+retry:
+ if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+ return true;
+
+ zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ HPAGE_PMD_ORDER);
+ if (!zero_folio) {
+ count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+ return false;
+ }
+ /* Ensure zero folio won't have large_rmappable flag set. */
+ folio_clear_large_rmappable(zero_folio);
+ preempt_disable();
+ if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
+ preempt_enable();
+ folio_put(zero_folio);
+ goto retry;
+ }
+ WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
+
+ /* We take additional reference here. It will be put back by shrinker */
+ atomic_set(&huge_zero_refcount, 2);
+ preempt_enable();
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
+ return true;
+}
+
+static void put_huge_zero_page(void)
+{
+ /*
+ * Counter should never go to zero here. Only shrinker can put
+ * last reference.
+ */
+ BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
+}
+
+struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
+{
+ if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ return READ_ONCE(huge_zero_folio);
+
+ if (!get_huge_zero_page())
+ return NULL;
+
+ if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ put_huge_zero_page();
+
+ return READ_ONCE(huge_zero_folio);
+}
+
+void mm_put_huge_zero_folio(struct mm_struct *mm)
+{
+ if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ put_huge_zero_page();
+}
+
+static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+}
+
+static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+ struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
+ BUG_ON(zero_folio == NULL);
+ WRITE_ONCE(huge_zero_pfn, ~0UL);
+ folio_put(zero_folio);
+ return HPAGE_PMD_NR;
+ }
+
+ return 0;
+}
+
+static int __init init_huge_zero_page(void)
+{
+ huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_page_shrinker)
+ return -ENOMEM;
+
+ huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
+ huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
+ shrinker_register(huge_zero_page_shrinker);
+
+ return 0;
+}
+early_initcall(init_huge_zero_page);
+
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
trace_rss_stat(mm, member);
-- 
2.47.2
From nobody Tue Dec 16 07:10:52 2025
From: Pankaj Raghav
To: Suren Baghdasaryan ,
Ryan Roberts ,
Vlastimil Babka ,
Baolin Wang ,
Borislav Petkov ,
Ingo Molnar ,
"H . Peter Anvin" ,
Zi Yan ,
Mike Rapoport ,
Dave Hansen ,
Michal Hocko ,
David Hildenbrand ,
Lorenzo Stoakes ,
Andrew Morton ,
Thomas Gleixner ,
Nico Pache ,
Dev Jain ,
"Liam R . Howlett" ,
Jens Axboe
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
linux-block@vger.kernel.org,
willy@infradead.org,
x86@kernel.org,
linux-fsdevel@vger.kernel.org,
"Darrick J . Wong" ,
mcgrof@kernel.org,
gost.dev@samsung.com,
kernel@pankajraghav.com,
hch@lst.de,
Pankaj Raghav
Subject: [RFC 2/3] mm: add STATIC_PMD_ZERO_PAGE config option
Date: Tue, 27 May 2025 07:04:51 +0200
Message-ID: <20250527050452.817674-3-p.raghav@samsung.com>
In-Reply-To: <20250527050452.817674-1-p.raghav@samsung.com>
References: <20250527050452.817674-1-p.raghav@samsung.com>
Content-Type: text/plain; charset="utf-8"
There are many places in the kernel where we need to zero out larger
chunks, but the maximum segment we can zero out at a time with ZERO_PAGE
is limited to PAGE_SIZE.

This is especially annoying in block devices and filesystems where we
attach multiple ZERO_PAGEs to the bio in different bvecs. With multipage
bvec support in the block layer, it is much more efficient to send out
larger zero pages as part of a single bvec.

This concern was raised during the review of adding LBS support to
XFS[1][2].

Usually the huge_zero_folio is allocated on demand, and it is deallocated
by the shrinker once there are no users of it left.

Add a config option STATIC_PMD_ZERO_PAGE that always allocates the
huge_zero_folio and never frees it. This makes it possible to use the
huge_zero_folio without passing any mm struct and without having to put
the folio when done.

Enable it by default on x86_64, where the PMD size is 2M. This is a good
compromise between memory usage and efficiency. As a THP zero page might
be wasteful on architectures with bigger page sizes, do not enable it for
them.
[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
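As a rough usage sketch (assuming this series is applied; the caller and
the fallback are illustrative, mirroring the block layer user in the next
patch):

	struct folio *zero_folio;

	/* with CONFIG_STATIC_PMD_ZERO_PAGE, passing a NULL mm is fine */
	zero_folio = mm_get_huge_zero_folio(NULL);
	if (!zero_folio)
		zero_folio = page_folio(ZERO_PAGE(0));	/* fall back to the PAGE_SIZE zero page */

	/* use up to folio_size(zero_folio) bytes of zeroes; no mm_put_huge_zero_folio() needed */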
Suggested-by: David Hildenbrand
Signed-off-by: Pankaj Raghav
---
arch/x86/Kconfig | 1 +
mm/Kconfig | 12 ++++++++++++
mm/memory.c | 30 ++++++++++++++++++++++++++----
3 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 055204dc211d..96f99b4f96ea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
select ARCH_WANTS_THP_SWAP if X86_64
+ select ARCH_WANTS_STATIC_PMD_ZERO_PAGE if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select BUILDTIME_TABLE_SORT
select CLKEVT_I8253
diff --git a/mm/Kconfig b/mm/Kconfig
index bd08e151fa1b..8f50f5c3f7a7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -826,6 +826,18 @@ config ARCH_WANTS_THP_SWAP
config MM_ID
def_bool n
 
+config ARCH_WANTS_STATIC_PMD_ZERO_PAGE
+ bool
+
+config STATIC_PMD_ZERO_PAGE
+ def_bool y
+ depends on ARCH_WANTS_STATIC_PMD_ZERO_PAGE
+ help
+ Typically huge_zero_folio, which is a PMD page of zeroes, is allocated
+ on demand and deallocated when not in use. This option will always
+ allocate huge_zero_folio for zeroing and it is never deallocated.
+ Not suitable for memory constrained systems.
+
menuconfig TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
diff --git a/mm/memory.c b/mm/memory.c
index 11edc4d66e74..ab8c16d04307 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -203,9 +203,17 @@ static void put_huge_zero_page(void)
BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
 
+/*
+ * If STATIC_PMD_ZERO_PAGE is enabled, @mm can be NULL, i.e., the huge_zero_folio
+ * is not associated with any mm_struct.
+*/
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ if (!IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE) && !mm)
+ return NULL;
+
+ if (IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE) ||
+ test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
return READ_ONCE(huge_zero_folio);
 
if (!get_huge_zero_page())
@@ -219,6 +227,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
 
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
+ if (IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE))
+ return;
+
if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
put_huge_zero_page();
}
@@ -246,15 +257,26 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
 
static int __init init_huge_zero_page(void)
{
+ int ret = 0;
+
+ if (IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE)) {
+ if (!get_huge_zero_page())
+ ret = -ENOMEM;
+ goto out;
+ }
+
huge_zero_page_shrinker =3D shrinker_alloc(0, "thp-zero");
- if (!huge_zero_page_shrinker)
- return -ENOMEM;
+ if (!huge_zero_page_shrinker) {
+ ret = -ENOMEM;
+ goto out;
+ }
 
huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
shrinker_register(huge_zero_page_shrinker);
 
- return 0;
+out:
+ return ret;
}
early_initcall(init_huge_zero_page);
 
-- 
2.47.2
From nobody Tue Dec 16 07:10:52 2025
From: Pankaj Raghav
To: Suren Baghdasaryan ,
Ryan Roberts ,
Vlastimil Babka ,
Baolin Wang ,
Borislav Petkov ,
Ingo Molnar ,
"H . Peter Anvin" ,
Zi Yan ,
Mike Rapoport ,
Dave Hansen ,
Michal Hocko ,
David Hildenbrand ,
Lorenzo Stoakes ,
Andrew Morton ,
Thomas Gleixner ,
Nico Pache ,
Dev Jain ,
"Liam R . Howlett" ,
Jens Axboe
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
linux-block@vger.kernel.org,
willy@infradead.org,
x86@kernel.org,
linux-fsdevel@vger.kernel.org,
"Darrick J . Wong" ,
mcgrof@kernel.org,
gost.dev@samsung.com,
kernel@pankajraghav.com,
hch@lst.de,
Pankaj Raghav
Subject: [RFC 3/3] block: use huge_zero_folio in
__blkdev_issue_zero_pages()
Date: Tue, 27 May 2025 07:04:52 +0200
Message-ID: <20250527050452.817674-4-p.raghav@samsung.com>
In-Reply-To: <20250527050452.817674-1-p.raghav@samsung.com>
References: <20250527050452.817674-1-p.raghav@samsung.com>
Content-Type: text/plain; charset="utf-8"
Use the huge_zero_folio via mm_get_huge_zero_folio() in
__blkdev_issue_zero_pages(). Fall back to ZERO_PAGE if the huge_zero_folio
is not available.

On systems that allocate the huge_zero_folio, we will end up sending
larger bvecs instead of multiple small ones.

I noticed a 4% increase in performance on a commercial NVMe SSD that does
not support REQ_OP_WRITE_ZEROES. The device's MDTS was 128K. The
performance gains might be bigger if the device supports a bigger MDTS.
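As a rough back-of-the-envelope example (assuming 4K pages and the 2M PMD
zero folio on x86_64): zeroing a 128K range previously took 32 separate 4K
ZERO_PAGE bvecs, whereas a single 128K slice of the huge_zero_folio fits in
one bvec.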
Signed-off-by: Pankaj Raghav
---
block/blk-lib.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 4c9f20a689f7..0fd55e028170 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -4,6 +4,7 @@
*/
#include
#include
+#include
#include
#include
#include
@@ -196,6 +197,12 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop, unsigned int flags)
{
+ struct folio *zero_folio;
+
+ zero_folio = mm_get_huge_zero_folio(NULL);
+ if (!zero_folio)
+ zero_folio =3D page_folio(ZERO_PAGE(0));
+
while (nr_sects) {
unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects);
struct bio *bio;
@@ -208,11 +215,12 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
break;
 
do {
- unsigned int len, added;
+ unsigned int len, added = 0;
 
- len = min_t(sector_t,
- PAGE_SIZE, nr_sects << SECTOR_SHIFT);
- added = bio_add_page(bio, ZERO_PAGE(0), len, 0);
+ len = min_t(sector_t, folio_size(zero_folio),
+ nr_sects << SECTOR_SHIFT);
+ if (bio_add_folio(bio, zero_folio, len, 0))
+ added = len;
if (added < len)
break;
nr_sects -= added >> SECTOR_SHIFT;
-- 
2.47.2