From nobody Fri Oct 10 21:10:43 2025
From: Pankaj Raghav
To: Suren Baghdasaryan ,
Ryan Roberts ,
Mike Rapoport ,
Michal Hocko ,
Thomas Gleixner ,
Nico Pache ,
Dev Jain ,
Baolin Wang ,
Borislav Petkov ,
Ingo Molnar ,
"H . Peter Anvin" ,
Vlastimil Babka ,
Zi Yan ,
Dave Hansen ,
David Hildenbrand ,
Lorenzo Stoakes ,
Andrew Morton ,
"Liam R . Howlett" ,
Jens Axboe
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
willy@infradead.org,
x86@kernel.org,
linux-block@vger.kernel.org,
linux-fsdevel@vger.kernel.org,
"Darrick J . Wong" ,
mcgrof@kernel.org,
gost.dev@samsung.com,
kernel@pankajraghav.com,
hch@lst.de,
Pankaj Raghav
Subject: [PATCH 1/5] mm: move huge_zero_page declaration from huge_mm.h to
mm.h
Date: Thu, 12 Jun 2025 12:50:56 +0200
Message-ID: <20250612105100.59144-2-p.raghav@samsung.com>
In-Reply-To: <20250612105100.59144-1-p.raghav@samsung.com>
References: <20250612105100.59144-1-p.raghav@samsung.com>
Move the declarations associated with huge_zero_page from huge_mm.h to
mm.h. This is in preparation for adding a static PMD zero page.
No functional changes.
Signed-off-by: Pankaj Raghav
---
include/linux/huge_mm.h | 31 -------------------------------
include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++
2 files changed, 34 insertions(+), 31 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2f190c90192d..3e887374892c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -478,22 +478,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,

vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);

-extern struct folio *huge_zero_folio;
-extern unsigned long huge_zero_pfn;
-
-static inline bool is_huge_zero_folio(const struct folio *folio)
-{
- return READ_ONCE(huge_zero_folio) == folio;
-}
-
-static inline bool is_huge_zero_pmd(pmd_t pmd)
-{
- return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
-}
-
-struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
-void mm_put_huge_zero_folio(struct mm_struct *mm);
-
static inline bool thp_migration_supported(void)
{
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
@@ -631,21 +615,6 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
return 0;
}

-static inline bool is_huge_zero_folio(const struct folio *folio)
-{
- return false;
-}
-
-static inline bool is_huge_zero_pmd(pmd_t pmd)
-{
- return false;
-}
-
-static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
-{
- return;
-}
-
static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ef2ba0c667a..c8fbeaacf896 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4018,6 +4018,40 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma)

#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern struct folio *huge_zero_folio;
+extern unsigned long huge_zero_pfn;
+
+static inline bool is_huge_zero_folio(const struct folio *folio)
+{
+ return READ_ONCE(huge_zero_folio) == folio;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
+}
+
+struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
+void mm_put_huge_zero_folio(struct mm_struct *mm);
+
+#else
+static inline bool is_huge_zero_folio(const struct folio *folio)
+{
+ return false;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return false;
+}
+
+static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
+{
+ return;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
--
2.49.0
From nobody Fri Oct 10 21:10:43 2025
From: Pankaj Raghav
To: Suren Baghdasaryan ,
Ryan Roberts ,
Mike Rapoport ,
Michal Hocko ,
Thomas Gleixner ,
Nico Pache ,
Dev Jain ,
Baolin Wang ,
Borislav Petkov ,
Ingo Molnar ,
"H . Peter Anvin" ,
Vlastimil Babka ,
Zi Yan ,
Dave Hansen ,
David Hildenbrand ,
Lorenzo Stoakes ,
Andrew Morton ,
"Liam R . Howlett" ,
Jens Axboe
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
willy@infradead.org,
x86@kernel.org,
linux-block@vger.kernel.org,
linux-fsdevel@vger.kernel.org,
"Darrick J . Wong" ,
mcgrof@kernel.org,
gost.dev@samsung.com,
kernel@pankajraghav.com,
hch@lst.de,
Pankaj Raghav
Subject: [PATCH 2/5] huge_memory: add huge_zero_page_shrinker_(init|exit)
functions
Date: Thu, 12 Jun 2025 12:50:57 +0200
Message-ID: <20250612105100.59144-3-p.raghav@samsung.com>
In-Reply-To: <20250612105100.59144-1-p.raghav@samsung.com>
References: <20250612105100.59144-1-p.raghav@samsung.com>
Add huge_zero_page_shrinker_init() and huge_zero_page_shrinker_exit().
As the shrinker will not be needed when the static PMD zero page is
enabled, these two functions can become no-ops.
This is a preparation patch for the static PMD zero page. No functional
changes.
Signed-off-by: Pankaj Raghav
---
mm/huge_memory.c | 38 +++++++++++++++++++++++++++-----------
1 file changed, 27 insertions(+), 11 deletions(-)
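As background for reviewers, here is a minimal, self-contained sketch of
the shrinker API that the new wrappers encapsulate; the example_* names
are placeholders for illustration and are not part of this patch:

#include <linux/shrinker.h>

static struct shrinker *example_shrinker;

static unsigned long example_count(struct shrinker *s, struct shrink_control *sc)
{
	return 0;		/* nothing reclaimable in this sketch */
}

static unsigned long example_scan(struct shrinker *s, struct shrink_control *sc)
{
	return SHRINK_STOP;	/* nothing was freed */
}

static int example_shrinker_init(void)
{
	example_shrinker = shrinker_alloc(0, "example");
	if (!example_shrinker)
		return -ENOMEM;

	example_shrinker->count_objects = example_count;
	example_shrinker->scan_objects = example_scan;
	shrinker_register(example_shrinker);
	return 0;
}

static void example_shrinker_exit(void)
{
	/* shrinker_free() also unregisters a registered shrinker */
	shrinker_free(example_shrinker);
}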
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d3e66136e41a..101b67ab2eb6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -289,6 +289,24 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
}

static struct shrinker *huge_zero_page_shrinker;
+static int huge_zero_page_shrinker_init(void)
+{
+ huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_page_shrinker)
+ return -ENOMEM;
+
+ huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
+ huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
+ shrinker_register(huge_zero_page_shrinker);
+ return 0;
+}
+
+static void huge_zero_page_shrinker_exit(void)
+{
+ shrinker_free(huge_zero_page_shrinker);
+ return;
+}
+

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -850,33 +868,31 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)

static int __init thp_shrinker_init(void)
{
- huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
- if (!huge_zero_page_shrinker)
- return -ENOMEM;
+ int ret = 0;

deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
SHRINKER_MEMCG_AWARE |
SHRINKER_NONSLAB,
"thp-deferred_split");
- if (!deferred_split_shrinker) {
- shrinker_free(huge_zero_page_shrinker);
+ if (!deferred_split_shrinker)
return -ENOMEM;
- }
-
- huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
- huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
- shrinker_register(huge_zero_page_shrinker);

deferred_split_shrinker->count_objects = deferred_split_count;
deferred_split_shrinker->scan_objects = deferred_split_scan;
shrinker_register(deferred_split_shrinker);

+ ret = huge_zero_page_shrinker_init();
+ if (ret) {
+ shrinker_free(deferred_split_shrinker);
+ return ret;
+ }
+
return 0;
}

static void __init thp_shrinker_exit(void)
{
- shrinker_free(huge_zero_page_shrinker);
+ huge_zero_page_shrinker_exit();
shrinker_free(deferred_split_shrinker);
}

--
2.49.0
From nobody Fri Oct 10 21:10:43 2025
From: Pankaj Raghav
To: Suren Baghdasaryan ,
Ryan Roberts ,
Mike Rapoport ,
Michal Hocko ,
Thomas Gleixner ,
Nico Pache ,
Dev Jain ,
Baolin Wang ,
Borislav Petkov ,
Ingo Molnar ,
"H . Peter Anvin" ,
Vlastimil Babka ,
Zi Yan ,
Dave Hansen ,
David Hildenbrand ,
Lorenzo Stoakes ,
Andrew Morton ,
"Liam R . Howlett" ,
Jens Axboe
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
willy@infradead.org,
x86@kernel.org,
linux-block@vger.kernel.org,
linux-fsdevel@vger.kernel.org,
"Darrick J . Wong" ,
mcgrof@kernel.org,
gost.dev@samsung.com,
kernel@pankajraghav.com,
hch@lst.de,
Pankaj Raghav
Subject: [PATCH 3/5] mm: add static PMD zero page
Date: Thu, 12 Jun 2025 12:50:58 +0200
Message-ID: <20250612105100.59144-4-p.raghav@samsung.com>
In-Reply-To: <20250612105100.59144-1-p.raghav@samsung.com>
References: <20250612105100.59144-1-p.raghav@samsung.com>
There are many places in the kernel where we need to zero out larger
chunks, but the maximum segment we can zero out at a time with
ZERO_PAGE is limited to PAGE_SIZE.
This is especially annoying in block devices and filesystems where we
attach multiple ZERO_PAGEs to the bio in different bvecs. With multipage
bvec support in the block layer, it is much more efficient to send out
larger zero pages as part of a single bvec.
This concern was raised during the review of adding LBS support to
XFS[1][2].
Usually huge_zero_folio is allocated on demand, and it is deallocated by
the shrinker once there are no users of it left.
Add a config option STATIC_PMD_ZERO_PAGE that always allocates the
huge_zero_folio in .bss, where it is never freed. This makes it possible
to use the huge_zero_folio without having to pass any mm struct and
without having to call a put in the destructor.
As STATIC_PMD_ZERO_PAGE does not depend on THP, declare huge_zero_folio
and huge_zero_pfn outside of the THP ifdef.
It can only be enabled on x86_64, and it is an optional config. We
could expand it to more architectures in the future.
[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
Suggested-by: David Hildenbrand
Signed-off-by: Pankaj Raghav
---
Questions:
- Can we call __split_huge_zero_page_pmd() on the static PMD zero page?
arch/x86/Kconfig | 1 +
arch/x86/include/asm/pgtable.h | 8 ++++++++
arch/x86/kernel/head_64.S | 8 ++++++++
include/linux/mm.h | 16 +++++++++++++++-
mm/Kconfig | 13 +++++++++++++
mm/huge_memory.c | 24 ++++++++++++++++++++----
mm/memory.c | 19 +++++++++++++++++++
7 files changed, 84 insertions(+), 5 deletions(-)
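To make the lifetime change concrete, here is an illustrative sketch;
the caller is hypothetical and not part of this patch, and the static
branch simply shows what the new config buys us:

#include <linux/mm.h>

/*
 * Hypothetical caller: with the dynamic huge_zero_folio an mm is needed
 * to take a reference (dropped again on mm teardown); with
 * CONFIG_STATIC_PMD_ZERO_PAGE the folio lives in .bss and can simply be
 * read, with no reference counting at all.
 */
static struct folio *grab_zero_folio(struct mm_struct *mm)
{
	if (!IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE))
		return mm_get_huge_zero_folio(mm);	/* may return NULL */

	return READ_ONCE(huge_zero_folio);	/* always present, never freed */
}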
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 340e5468980e..c3a9d136ec0a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
select ARCH_WANTS_THP_SWAP if X86_64
+ select ARCH_HAS_STATIC_PMD_ZERO_PAGE if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
select BUILDTIME_TABLE_SORT
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 774430c3abff..7013a7d26da5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -47,6 +47,14 @@ void ptdump_walk_user_pgd_level_checkwx(void);
#define debug_checkwx_user() do { } while (0)
#endif

+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+/*
+ * PMD_ZERO_PAGE is a global shared PMD page that is always zero.
+ */
+extern unsigned long empty_pmd_zero_page[(PMD_SIZE) / sizeof(unsigned long)]
+ __visible;
+#endif
+
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3e9b3a3bd039..86aaa53fd619 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -714,6 +714,14 @@ EXPORT_SYMBOL(phys_base)
#include "../xen/xen-head.S"
=20
__PAGE_ALIGNED_BSS
+
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+SYM_DATA_START_PAGE_ALIGNED(empty_pmd_zero_page)
+ .skip PMD_SIZE
+SYM_DATA_END(empty_pmd_zero_page)
+EXPORT_SYMBOL(empty_pmd_zero_page)
+#endif
+
SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
.skip PAGE_SIZE
SYM_DATA_END(empty_zero_page)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c8fbeaacf896..b20d60d68b3c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4018,10 +4018,10 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma)

#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct folio *huge_zero_folio;
extern unsigned long huge_zero_pfn;

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool is_huge_zero_folio(const struct folio *folio)
{
return READ_ONCE(huge_zero_folio) == folio;
@@ -4032,9 +4032,23 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
}

+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+static inline struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
+{
+ return READ_ONCE(huge_zero_folio);
+}
+
+static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
+{
+ return;
+}
+
+#else
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);

+#endif /* CONFIG_STATIC_PMD_ZERO_PAGE */
+
#else
static inline bool is_huge_zero_folio(const struct folio *folio)
{
diff --git a/mm/Kconfig b/mm/Kconfig
index 781be3240e21..fd1c51995029 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -826,6 +826,19 @@ config ARCH_WANTS_THP_SWAP
config MM_ID
def_bool n

+config ARCH_HAS_STATIC_PMD_ZERO_PAGE
+ def_bool n
+
+config STATIC_PMD_ZERO_PAGE
+ bool "Allocate a PMD page for zeroing"
+ depends on ARCH_HAS_STATIC_PMD_ZERO_PAGE
+ help
+ Typically huge_zero_folio, which is a PMD page of zeroes, is allocated
+ on demand and deallocated when not in use. This option will
+ allocate a PMD sized zero page in .bss and huge_zero_folio will
+ use it instead of allocating it dynamically.
+ Not suitable for memory constrained systems.
+
menuconfig TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 101b67ab2eb6..c12ca7134e88 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -75,9 +75,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc);
static bool split_underused_thp = true;

-static atomic_t huge_zero_refcount;
-struct folio *huge_zero_folio __read_mostly;
-unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
@@ -208,6 +205,23 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
return orders;
}

+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+static int huge_zero_page_shrinker_init(void)
+{
+ return 0;
+}
+
+static void huge_zero_page_shrinker_exit(void)
+{
+ return;
+}
+#else
+
+static struct shrinker *huge_zero_page_shrinker;
+static atomic_t huge_zero_refcount;
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
static bool get_huge_zero_page(void)
{
struct folio *zero_folio;
@@ -288,7 +302,6 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}

-static struct shrinker *huge_zero_page_shrinker;
static int huge_zero_page_shrinker_init(void)
{
huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
@@ -307,6 +320,7 @@ static void huge_zero_page_shrinker_exit(void)
return;
}

+#endif

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -2843,6 +2857,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pte_t *pte;
int i;

+ // FIXME: can this be called with static zero page?
+ VM_BUG_ON(IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE));
/*
* Leave pmd empty until pte is filled note that it is fine to delay
* notification until mmu_notifier_invalidate_range_end() as we are
diff --git a/mm/memory.c b/mm/memory.c
index 8eba595056fe..77721f5ae043 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -159,6 +159,25 @@ static int __init init_zero_pfn(void)
}
early_initcall(init_zero_pfn);

+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
+static int __init init_pmd_zero_pfn(void)
+{
+ huge_zero_folio = virt_to_folio(empty_pmd_zero_page);
+ huge_zero_pfn = page_to_pfn(virt_to_page(empty_pmd_zero_page));
+
+ __folio_set_head(huge_zero_folio);
+ prep_compound_head((struct page *)huge_zero_folio, PMD_ORDER);
+ /* Ensure zero folio won't have large_rmappable flag set. */
+ folio_clear_large_rmappable(huge_zero_folio);
+
+ return 0;
+}
+early_initcall(init_pmd_zero_pfn);
+#endif
+
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
trace_rss_stat(mm, member);
--
2.49.0
From nobody Fri Oct 10 21:10:43 2025
From: Pankaj Raghav
To: Suren Baghdasaryan ,
Ryan Roberts ,
Mike Rapoport ,
Michal Hocko ,
Thomas Gleixner ,
Nico Pache ,
Dev Jain ,
Baolin Wang ,
Borislav Petkov ,
Ingo Molnar ,
"H . Peter Anvin" ,
Vlastimil Babka ,
Zi Yan ,
Dave Hansen ,
David Hildenbrand ,
Lorenzo Stoakes ,
Andrew Morton ,
"Liam R . Howlett" ,
Jens Axboe
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
willy@infradead.org,
x86@kernel.org,
linux-block@vger.kernel.org,
linux-fsdevel@vger.kernel.org,
"Darrick J . Wong" ,
mcgrof@kernel.org,
gost.dev@samsung.com,
kernel@pankajraghav.com,
hch@lst.de,
Pankaj Raghav
Subject: [PATCH 4/5] mm: add mm_get_static_huge_zero_folio() routine
Date: Thu, 12 Jun 2025 12:50:59 +0200
Message-ID: <20250612105100.59144-5-p.raghav@samsung.com>
In-Reply-To: <20250612105100.59144-1-p.raghav@samsung.com>
References: <20250612105100.59144-1-p.raghav@samsung.com>
Add a mm_get_static_huge_zero_folio() routine so that huge_zero_folio
can be used without the need to pass any mm struct. It returns the
ZERO_PAGE folio if CONFIG_STATIC_PMD_ZERO_PAGE is disabled.
This routine can also be called even if THP is disabled.
Signed-off-by: Pankaj Raghav
---
include/linux/mm.h | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
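An illustrative usage sketch follows; the caller is hypothetical and
only meant to show that no mm struct is needed and that callers should
size their I/O with folio_size():

#include <linux/mm.h>
#include <linux/printk.h>

static void report_zero_folio(void)
{
	struct folio *zf = mm_get_static_huge_zero_folio();

	/* folio_size() is PMD_SIZE with CONFIG_STATIC_PMD_ZERO_PAGE=y,
	 * PAGE_SIZE otherwise.
	 */
	pr_info("zero folio: %zu bytes starting at pfn %lu\n",
		folio_size(zf), folio_pfn(zf));
}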
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b20d60d68b3c..c8805480ff21 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4021,6 +4021,22 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
extern struct folio *huge_zero_folio;
extern unsigned long huge_zero_pfn;

+/*
+ * mm_get_static_huge_zero_folio - Get a PMD sized zero folio
+ *
+ * This function will return a PMD sized zero folio if CONFIG_STATIC_PMD_ZERO_PAGE
+ * is enabled. Otherwise, a ZERO_PAGE folio is returned.
+ *
+ * Callers should deduce the size of the folio with folio_size() instead
+ * of assuming the folio size.
+ */
+static inline struct folio *mm_get_static_huge_zero_folio(void)
+{
+ if (IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE))
+ return READ_ONCE(huge_zero_folio);
+ return page_folio(ZERO_PAGE(0));
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool is_huge_zero_folio(const struct folio *folio)
{
--
2.49.0
From nobody Fri Oct 10 21:10:43 2025
From: Pankaj Raghav
To: Suren Baghdasaryan ,
Ryan Roberts ,
Mike Rapoport ,
Michal Hocko ,
Thomas Gleixner ,
Nico Pache ,
Dev Jain ,
Baolin Wang ,
Borislav Petkov ,
Ingo Molnar ,
"H . Peter Anvin" ,
Vlastimil Babka ,
Zi Yan ,
Dave Hansen ,
David Hildenbrand ,
Lorenzo Stoakes ,
Andrew Morton ,
"Liam R . Howlett" ,
Jens Axboe
Cc: linux-kernel@vger.kernel.org,
linux-mm@kvack.org,
willy@infradead.org,
x86@kernel.org,
linux-block@vger.kernel.org,
linux-fsdevel@vger.kernel.org,
"Darrick J . Wong" ,
mcgrof@kernel.org,
gost.dev@samsung.com,
kernel@pankajraghav.com,
hch@lst.de,
Pankaj Raghav
Subject: [PATCH 5/5] block: use mm_get_static_huge_zero_folio() in
__blkdev_issue_zero_pages()
Date: Thu, 12 Jun 2025 12:51:00 +0200
Message-ID: <20250612105100.59144-6-p.raghav@samsung.com>
In-Reply-To: <20250612105100.59144-1-p.raghav@samsung.com>
References: <20250612105100.59144-1-p.raghav@samsung.com>
Use mm_get_static_huge_zero_folio() in __blkdev_issue_zero_pages().
On systems with CONFIG_STATIC_PMD_ZERO_PAGE enabled, we will end up
sending larger bvecs instead of multiple small ones.
A 4% increase in performance was noticed on a commercial NVMe SSD which
does not support OP_WRITE_ZEROES. The device's MDTS was 128K. The
performance gains might be bigger if the device supports a bigger MDTS.
Signed-off-by: Pankaj Raghav
---
block/blk-lib.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
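For context, __blkdev_issue_zero_pages() is the fallback used by
blkdev_issue_zeroout() when the device cannot do REQ_OP_WRITE_ZEROES and
the caller did not pass BLKDEV_ZERO_NOFALLBACK. A hypothetical caller
that ends up on this path, and therefore benefits from the larger bvecs
on a CONFIG_STATIC_PMD_ZERO_PAGE=y kernel, could look like this:

#include <linux/blkdev.h>

static int zero_range_example(struct block_device *bdev, sector_t sector,
			      sector_t nr_sects)
{
	/* BLKDEV_ZERO_NOUNMAP: write real zeroes rather than discarding */
	return blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_KERNEL,
				    BLKDEV_ZERO_NOUNMAP);
}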
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 4c9f20a689f7..4ee219637a3f 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -196,6 +196,10 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop, unsigned int flags)
{
+ struct folio *zero_folio;
+
+ zero_folio = mm_get_static_huge_zero_folio();
+
while (nr_sects) {
unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects);
struct bio *bio;
@@ -208,15 +212,14 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
break;

do {
- unsigned int len, added;
+ unsigned int len;

- len = min_t(sector_t,
- PAGE_SIZE, nr_sects << SECTOR_SHIFT);
- added = bio_add_page(bio, ZERO_PAGE(0), len, 0);
- if (added < len)
+ len = min_t(sector_t, folio_size(zero_folio),
+ nr_sects << SECTOR_SHIFT);
+ if (!bio_add_folio(bio, zero_folio, len, 0))
break;
- nr_sects -= added >> SECTOR_SHIFT;
- sector += added >> SECTOR_SHIFT;
+ nr_sects -= len >> SECTOR_SHIFT;
+ sector += len >> SECTOR_SHIFT;
} while (nr_sects);

*biop = bio_chain_and_submit(*biop, bio);
--
2.49.0