From: Pankaj Raghav <p.raghav@samsung.com>
There are many places in the kernel where we need to zero out larger
chunks, but the maximum segment we can zero out at a time with ZERO_PAGE
is limited to PAGE_SIZE.
This is especially annoying in block devices and filesystems, where we
attach multiple ZERO_PAGEs to a bio in different bvecs. With multipage
bvec support in the block layer, it is much more efficient to send out
a larger zero page as part of a single bvec.
This concern was raised during the review of adding LBS support to
XFS[1][2].
Usually the huge_zero_folio is allocated on demand, and it is deallocated
by the shrinker once no users of it are left. At the moment, the
huge_zero_folio refcount is tied to the lifetime of the process that
created it. This does not work for the bio layer, as completions can be
asynchronous and the process that created the huge_zero_folio might no
longer be alive. One of the main points raised during the discussion is to
have something bigger than ZERO_PAGE as a drop-in replacement.
Add a config option STATIC_HUGE_ZERO_FOLIO that allocates the huge zero
folio on first request, if not already allocated, and makes it static so
that it can never be freed. This makes it possible to use the
huge_zero_folio without passing any mm struct and does not tie the
lifetime of the zero folio to anything, making it a drop-in replacement
for ZERO_PAGE.
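
(Editorial aside, not part of the patch: a minimal sketch of the kind of
block-layer caller this enables, using the get_static_huge_zero_folio()
helper added below. The function name and the fallback errno are
illustrative only.)

	static int zeroout_range_sketch(struct block_device *bdev,
					sector_t sector, unsigned int nr_bytes)
	{
		struct folio *zero_folio = get_static_huge_zero_folio();
		struct bio *bio;
		int ret;

		/* A single bvec can cover at most the folio size. */
		if (!zero_folio || nr_bytes > folio_size(zero_folio))
			return -EOPNOTSUPP; /* fall back to ZERO_PAGE */

		bio = bio_alloc(bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
		if (!bio)
			return -ENOMEM;
		bio->bi_iter.bi_sector = sector;
		/* One large bvec instead of one ZERO_PAGE bvec per PAGE_SIZE. */
		bio_add_folio_nofail(bio, zero_folio, nr_bytes, 0);
		ret = submit_bio_wait(bio);
		bio_put(bio);
		return ret;
	}

Because the folio can never be freed, no reference has to be held across
an asynchronous completion, which is exactly the property the bio layer
needs.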
If the STATIC_HUGE_ZERO_FOLIO config option is enabled,
mm_get_huge_zero_folio() will simply return this folio instead of
dynamically allocating a new PMD-sized page.
This option can waste memory on small systems or on systems with a 64k
base page size, so make it an opt-in, and also add a per-architecture
option so that the feature is not enabled on systems with a larger base
page size. Only x86 is enabled as part of this series; other architectures
can be enabled as a follow-up.
[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
arch/x86/Kconfig | 1 +
include/linux/huge_mm.h | 18 ++++++++++++++++
mm/Kconfig | 21 +++++++++++++++++++
mm/huge_memory.c | 46 ++++++++++++++++++++++++++++++++++++++++-
4 files changed, 85 insertions(+), 1 deletion(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0ce86e14ab5e..8e2aa1887309 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
 	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
 	select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
 	select ARCH_WANTS_THP_SWAP if X86_64
+	select ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO if X86_64
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select BUILDTIME_TABLE_SORT
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7748489fde1b..78ebceb61d0e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -476,6 +476,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
 extern struct folio *huge_zero_folio;
 extern unsigned long huge_zero_pfn;
+extern atomic_t huge_zero_folio_is_static;
 
 static inline bool is_huge_zero_folio(const struct folio *folio)
 {
@@ -494,6 +495,18 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
 void mm_put_huge_zero_folio(struct mm_struct *mm);
+struct folio *__get_static_huge_zero_folio(void);
+
+static inline struct folio *get_static_huge_zero_folio(void)
+{
+	if (!IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO))
+		return NULL;
+
+	if (likely(atomic_read(&huge_zero_folio_is_static)))
+		return huge_zero_folio;
+
+	return __get_static_huge_zero_folio();
+}
 
 static inline bool thp_migration_supported(void)
 {
@@ -685,6 +698,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb,
 {
 	return 0;
 }
+
+static inline struct folio *get_static_huge_zero_folio(void)
+{
+	return NULL;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline int split_folio_to_list_to_order(struct folio *folio,
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf..366a6d2d771e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -823,6 +823,27 @@ config ARCH_WANT_GENERAL_HUGETLB
 config ARCH_WANTS_THP_SWAP
 	def_bool n
 
+config ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO
+	def_bool n
+
+config STATIC_HUGE_ZERO_FOLIO
+	bool "Allocate a PMD sized folio for zeroing"
+	depends on ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO && TRANSPARENT_HUGEPAGE
+	help
+	  Without this config enabled, the huge zero folio is allocated on
+	  demand and freed under memory pressure once no longer in use.
+	  To detect remaining users reliably, references to the huge zero folio
+	  must be tracked precisely, so it is commonly only available for mapping
+	  it into user page tables.
+
+	  With this config enabled, the huge zero folio can also be used
+	  for other purposes that do not implement precise reference counting:
+	  it is still allocated on demand, but never freed, allowing for more
+	  wide-spread use, for example, when performing I/O similar to the
+	  traditional shared zeropage.
+
+	  Not suitable for memory constrained systems.
+
 config MM_ID
 	def_bool n
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ff06dee213eb..e117b280b38d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -75,6 +75,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 static bool split_underused_thp = true;
 
 static atomic_t huge_zero_refcount;
+atomic_t huge_zero_folio_is_static __read_mostly;
 struct folio *huge_zero_folio __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 unsigned long huge_anon_orders_always __read_mostly;
@@ -266,6 +267,45 @@ void mm_put_huge_zero_folio(struct mm_struct *mm)
 		put_huge_zero_folio();
 }
 
+#ifdef CONFIG_STATIC_HUGE_ZERO_FOLIO
+
+struct folio *__get_static_huge_zero_folio(void)
+{
+	static unsigned long fail_count_clear_timer;
+	static atomic_t huge_zero_static_fail_count __read_mostly;
+
+	if (unlikely(!slab_is_available()))
+		return NULL;
+
+	/*
+	 * If we failed to allocate a huge zero folio, just refrain from
+	 * trying for one minute before retrying to get a reference again.
+	 */
+	if (atomic_read(&huge_zero_static_fail_count) > 1) {
+		if (time_before(jiffies, fail_count_clear_timer))
+			return NULL;
+		atomic_set(&huge_zero_static_fail_count, 0);
+	}
+	/*
+	 * Our raised reference will prevent the shrinker from ever having
+	 * success.
+	 */
+	if (!get_huge_zero_folio()) {
+		int count = atomic_inc_return(&huge_zero_static_fail_count);
+
+		if (count > 1)
+			fail_count_clear_timer = get_jiffies_64() + 60 * HZ;
+
+		return NULL;
+	}
+
+	if (atomic_cmpxchg(&huge_zero_folio_is_static, 0, 1) != 0)
+		put_huge_zero_folio();
+
+	return huge_zero_folio;
+}
+#endif /* CONFIG_STATIC_HUGE_ZERO_FOLIO */
+
 static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
 					struct shrink_control *sc)
 {
@@ -277,7 +317,11 @@ static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
 					struct shrink_control *sc)
 {
 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
-		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
+		struct folio *zero_folio;
+
+		if (WARN_ON_ONCE(atomic_read(&huge_zero_folio_is_static)))
+			return 0;
+		zero_folio = xchg(&huge_zero_folio, NULL);
 		BUG_ON(zero_folio == NULL);
 		WRITE_ONCE(huge_zero_pfn, ~0UL);
 		folio_put(zero_folio);
--
2.49.0
On Mon, Aug 04, 2025 at 02:13:54PM +0200, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
>
<snip>
>
>  extern struct folio *huge_zero_folio;
>  extern unsigned long huge_zero_pfn;
> +extern atomic_t huge_zero_folio_is_static;

Really don't love having globals like this, please can we have a helper
function that tells you this and not extern it?

Also we're not checking CONFIG_STATIC_HUGE_ZERO_FOLIO but still exposing
this value which a helper function would avoid also.

>  struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
>  void mm_put_huge_zero_folio(struct mm_struct *mm);
> +struct folio *__get_static_huge_zero_folio(void);

Why are we declaring a static inline function prototype that we then
implement immediately below?

<snip>

> +#ifdef CONFIG_STATIC_HUGE_ZERO_FOLIO
> +

Extremely tiny silly nit - there's a blank line below this, but not under
the #endif, let's remove this line.

> +struct folio *__get_static_huge_zero_folio(void)
> +{
> +	static unsigned long fail_count_clear_timer;
> +	static atomic_t huge_zero_static_fail_count __read_mostly;
> +
> +	if (unlikely(!slab_is_available()))
> +		return NULL;
> +
> +	/*
> +	 * If we failed to allocate a huge zero folio, just refrain from
> +	 * trying for one minute before retrying to get a reference again.
> +	 */
> +	if (atomic_read(&huge_zero_static_fail_count) > 1) {
> +		if (time_before(jiffies, fail_count_clear_timer))
> +			return NULL;
> +		atomic_set(&huge_zero_static_fail_count, 0);
> +	}

Yeah I really don't like this. This seems overly complicated and too
fiddly. Also if I want a static PMD, do I want to wait a minute for next
attempt?

Also doing things this way we might end up:

0. Enabling CONFIG_STATIC_HUGE_ZERO_FOLIO
1. Not doing anything that needs a static PMD for a while + get
   fragmentation.
2. Do something that needs it - oops can't get order-9 page, and waiting
   60 seconds between attempts
3. This is silent so you think you have it switched on but are actually
   getting bad performance.

I appreciate wanting to reuse this code, but we need to find a way to do
this really really early, and get rid of this arbitrary time out. It's
very arbitrary and we have no easy way of tracing how this might behave
under workload.

Also we end up pinning an order-9 page either way, so no harm in getting
it first thing?

<snip>

> @@ -277,7 +317,11 @@ static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
>  		struct shrink_control *sc)
>  {
>  	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
> -		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
> +		struct folio *zero_folio;
> +
> +		if (WARN_ON_ONCE(atomic_read(&huge_zero_folio_is_static)))
> +			return 0;

I don't hugely love these various sets of static variables, I wonder if we
could put them into a helper struct or something, and put these checks
into helper functions?

static bool have_static_huge_zero_folio(void)
{
	return ...;
}

etc. ?

> +		zero_folio = xchg(&huge_zero_folio, NULL);
>  		BUG_ON(zero_folio == NULL);
>  		WRITE_ONCE(huge_zero_pfn, ~0UL);
>  		folio_put(zero_folio);
> --
> 2.49.0
>

Cheers, Lorenzo
On 04.08.25 18:46, Lorenzo Stoakes wrote:
> On Mon, Aug 04, 2025 at 02:13:54PM +0200, Pankaj Raghav (Samsung) wrote:
<snip>
> Yeah I really don't like this. This seems overly complicated and too
> fiddly. Also if I want a static PMD, do I want to wait a minute for next
> attempt?
>
> Also doing things this way we might end up:
>
> 0. Enabling CONFIG_STATIC_HUGE_ZERO_FOLIO
> 1. Not doing anything that needs a static PMD for a while + get
>    fragmentation.
> 2. Do something that needs it - oops can't get order-9 page, and waiting
>    60 seconds between attempts
> 3. This is silent so you think you have it switched on but are actually
>    getting bad performance.
>
> I appreciate wanting to reuse this code, but we need to find a way to do
> this really really early, and get rid of this arbitrary time out. It's
> very arbitrary and we have no easy way of tracing how this might behave
> under workload.
>
> Also we end up pinning an order-9 page either way, so no harm in getting
> it first thing?

What we could do, to avoid messing with memblock and two ways of
initializing a huge zero folio early, is to just disable the shrinker.

Downside is that the page is really static (not just when actually used at
least once).

I like it:

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0ce86e14ab5e1..8e2aa18873098 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
 	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
 	select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
 	select ARCH_WANTS_THP_SWAP if X86_64
+	select ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO if X86_64
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select BUILDTIME_TABLE_SORT
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7748489fde1b7..ccfa5c95f14b1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -495,6 +495,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
 void mm_put_huge_zero_folio(struct mm_struct *mm);
 
+static inline struct folio *get_static_huge_zero_folio(void)
+{
+	if (!IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO))
+		return NULL;
+
+	if (unlikely(!huge_zero_folio))
+		return NULL;
+
+	return huge_zero_folio;
+}
+
 static inline bool thp_migration_supported(void)
 {
 	return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
@@ -685,6 +696,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb,
 {
 	return 0;
 }
+
+static inline struct folio *get_static_huge_zero_folio(void)
+{
+	return NULL;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline int split_folio_to_list_to_order(struct folio *folio,
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf2..366a6d2d771e3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -823,6 +823,27 @@ config ARCH_WANT_GENERAL_HUGETLB
 config ARCH_WANTS_THP_SWAP
 	def_bool n
 
+config ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO
+	def_bool n
+
+config STATIC_HUGE_ZERO_FOLIO
+	bool "Allocate a PMD sized folio for zeroing"
+	depends on ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO && TRANSPARENT_HUGEPAGE
+	help
+	  Without this config enabled, the huge zero folio is allocated on
+	  demand and freed under memory pressure once no longer in use.
+	  To detect remaining users reliably, references to the huge zero folio
+	  must be tracked precisely, so it is commonly only available for mapping
+	  it into user page tables.
+
+	  With this config enabled, the huge zero folio can also be used
+	  for other purposes that do not implement precise reference counting:
+	  it is allocated statically and never freed, allowing for more
+	  wide-spread use, for example, when performing I/O similar to the
+	  traditional shared zeropage.
+
+	  Not suitable for memory constrained systems.
+
 config MM_ID
 	def_bool n
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ff06dee213eb2..f65ba3e6f0824 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -866,9 +866,14 @@ static int __init thp_shrinker_init(void)
 	huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
 	shrinker_register(huge_zero_folio_shrinker);
 
-	deferred_split_shrinker->count_objects = deferred_split_count;
-	deferred_split_shrinker->scan_objects = deferred_split_scan;
-	shrinker_register(deferred_split_shrinker);
+	if (IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO)) {
+		if (!get_huge_zero_folio())
+			pr_warn("Allocating static huge zero folio failed\n");
+	} else {
+		deferred_split_shrinker->count_objects = deferred_split_count;
+		deferred_split_shrinker->scan_objects = deferred_split_scan;
+		shrinker_register(deferred_split_shrinker);
+	}
 
 	return 0;
 }
-- 
2.50.1


Now, one thing I do not like is that we have
"ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO" but then have a user-selectable
option.

Should we just get rid of ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO?

-- 
Cheers,

David / dhildenb
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index ff06dee213eb2..f65ba3e6f0824 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -866,9 +866,14 @@ static int __init thp_shrinker_init(void)
>  	huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
>  	shrinker_register(huge_zero_folio_shrinker);
>  
> -	deferred_split_shrinker->count_objects = deferred_split_count;
> -	deferred_split_shrinker->scan_objects = deferred_split_scan;
> -	shrinker_register(deferred_split_shrinker);
> +	if (IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO)) {
> +		if (!get_huge_zero_folio())
> +			pr_warn("Allocating static huge zero folio failed\n");
> +	} else {
> +		deferred_split_shrinker->count_objects = deferred_split_count;
> +		deferred_split_shrinker->scan_objects = deferred_split_scan;
> +		shrinker_register(deferred_split_shrinker);
> +	}

^ disabled the wrong shrinker, but you should get the idea

-- 
Cheers,

David / dhildenb
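
(Editorial note: a sketch of what the corrected hunk presumably looks
like: the deferred-split shrinker stays registered unconditionally, and it
is the huge-zero-folio shrinker that is skipped when the folio is pinned
statically. Illustrative only, not posted in the thread.)

	if (IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO)) {
		/* Pin the folio for good; the shrinker could never succeed. */
		if (!get_huge_zero_folio())
			pr_warn("Allocating static huge zero folio failed\n");
	} else {
		huge_zero_folio_shrinker->count_objects =
			shrink_huge_zero_folio_count;
		huge_zero_folio_shrinker->scan_objects =
			shrink_huge_zero_folio_scan;
		shrinker_register(huge_zero_folio_shrinker);
	}

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);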
On Mon, Aug 04, 2025 at 07:07:06PM +0200, David Hildenbrand wrote:
> > Yeah I really don't like this. This seems overly complicated and too
> > fiddly. Also if I want a static PMD, do I want to wait a minute for
> > next attempt?
<snip>
>
> What we could do, to avoid messing with memblock and two ways of
> initializing a huge zero folio early, is to just disable the shrinker.

Nice, I like this approach!

> Downside is that the page is really static (not just when actually used
> at least once).

Well I'm not sure this is a downside :P

User is explicitly enabling an option that says 'I'm cool to lose an
order-9 page for this'.

> I like it:
>
<snip>
>
> Now, one thing I do not like is that we have
> "ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO" but then have a user-selectable
> option.
>
> Should we just get rid of ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO?

Yeah, though I guess we probably need to make it need CONFIG_MMU if so?
Probably don't want to provide it if it might somehow break things?

I guess we could keep it as long as CONFIG_STATIC_HUGE_ZERO_FOLIO depends
on something sensible like CONFIG_MMU, maybe 64-bit too?

Anyway this approach looks generally good!

> --
> Cheers,
>
> David / dhildenb

Cheers, Lorenzo
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
<snip>
>>
>> Now, one thing I do not like is that we have
>> "ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO" but then have a user-selectable
>> option.
>>
>> Should we just get rid of ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO?
>
> Yeah, though I guess we probably need to make it need CONFIG_MMU if so?
> Probably don't want to provide it if it might somehow break things?

It would still depend on THP, and THP already depends on MMU. So that
should just work.

We could go one step further and special case in mm_get_huge_zero_folio()
+ mm_put_huge_zero_folio() on CONFIG_STATIC_HUGE_ZERO_FOLIO.

Something like

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c38a95e9f091..9b87884e5f299 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -248,6 +248,9 @@ static void put_huge_zero_page(void)
 
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
 {
+	if (IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO))
+		return huge_zero_folio;
+
 	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 		return READ_ONCE(huge_zero_folio);
 
@@ -262,6 +265,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
 
 void mm_put_huge_zero_folio(struct mm_struct *mm)
 {
+	if (IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO))
+		return;
+
 	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 		put_huge_zero_page();
 }

-- 
Cheers,

David / dhildenb
Thanks a lot Lorenzo and David for the feedback and quick iteration on the
patchset. I really like that the number of lines of code has been steadily
reducing since the first version :)

I will fold the changes in the next series.

<snip>

> > > Now, one thing I do not like is that we have
> > > "ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO" but then have a user-selectable
> > > option.
> > >
> > > Should we just get rid of ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO?

One of the early pieces of feedback from Lorenzo was that architectures
with a PMD size > 2M might enable this by mistake. So
ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO was introduced as an extra precaution on
top of the user-selectable CONFIG_STATIC_HUGE_ZERO_FOLIO.

Isn't it better to have an extra per-arch knob to be on the safer side, or
do you think it is too excessive?

> > Yeah, though I guess we probably need to make it need CONFIG_MMU if so?
> > Probably don't want to provide it if it might somehow break things?
>
> It would still depend on THP, and THP already depends on MMU. So that
> should just work.
>
> We could go one step further and special case in mm_get_huge_zero_folio()
> + mm_put_huge_zero_folio() on CONFIG_STATIC_HUGE_ZERO_FOLIO.
>
> Something like
>
<snip>

-- Pankaj
On 05.08.25 13:40, Pankaj Raghav (Samsung) wrote:
> Thanks a lot Lorenzo and David for the feedback and quick iteration on
> the patchset. I really like that the number of lines of code has been
> steadily reducing since the first version :)
>
> I will fold the changes in the next series.
>
<snip>
>
> One of the early pieces of feedback from Lorenzo was that architectures
> with a PMD size > 2M might enable this by mistake. So
> ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO was introduced as an extra precaution
> on top of the user-selectable CONFIG_STATIC_HUGE_ZERO_FOLIO.

People will find creative ways to mis-configure their system no matter
what you try :)

So I think the best we can do is document it carefully.

-- 
Cheers,

David / dhildenb
On Tue, Aug 05, 2025 at 02:10:39PM +0200, David Hildenbrand wrote:
> On 05.08.25 13:40, Pankaj Raghav (Samsung) wrote:
<snip>
> > One of the early pieces of feedback from Lorenzo was that architectures
> > with a PMD size > 2M might enable this by mistake. So
> > ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO was introduced as an extra precaution
> > on top of the user-selectable CONFIG_STATIC_HUGE_ZERO_FOLIO.

Oh yeah so I did :P forgot I said that.

> People will find creative ways to mis-configure their system no matter
> what you try :)

I think with an explicit config flag, this won't be _broken_ so much as
wasteful, so therefore not really an issue.

> So I think the best we can do is document it carefully.

Yup, well always :)

> --
> Cheers,
>
> David / dhildenb

Cheers, Lorenzo
> We could go one step further and special case in mm_get_huge_zero_folio()
> + mm_put_huge_zero_folio() on CONFIG_STATIC_HUGE_ZERO_FOLIO.
>

Hmm, but we could have also failed to allocate even though the option was
enabled. 2 options:

- we could do:

	if (IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO) && huge_zero_folio)
		return huge_zero_folio;

  or

- As Dave suggested, we could do a static key and enable it when the
  option is enabled and the allocation succeeded. The same static key can
  also be used in get_static_huge_zero_folio().

I think the latter option will look cleaner and slightly more optimal?

> Something like
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9c38a95e9f091..9b87884e5f299 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -248,6 +248,9 @@ static void put_huge_zero_page(void)
>
>  struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
>  {
> +	if (IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO))
> +		return huge_zero_folio;
> +
>  	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
>  		return READ_ONCE(huge_zero_folio);
>
> @@ -262,6 +265,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
>
>  void mm_put_huge_zero_folio(struct mm_struct *mm)
>  {
> +	if (IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO))
> +		return;
> +
>  	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
>  		put_huge_zero_page();
>  }

-- Pankaj
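
(Editorial aside: a sketch of the static-key variant being proposed here;
illustrative only, the key name is made up and was not posted in the
thread.)

	static DEFINE_STATIC_KEY_FALSE(huge_zero_static_key);

	static inline struct folio *get_static_huge_zero_folio(void)
	{
		if (!IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO))
			return NULL;

		/* Flipped on only once the early allocation succeeded. */
		if (!static_branch_likely(&huge_zero_static_key))
			return NULL;

		return huge_zero_folio;
	}

	/* In thp_shrinker_init(), after get_huge_zero_folio() succeeds: */
	/*	static_branch_enable(&huge_zero_static_key);		 */

With the key in place, callers never observe a half-initialized state, and
the fast path compiles down to a patched branch rather than an atomic
read.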
On 06.08.25 14:18, Pankaj Raghav (Samsung) wrote:
>> We could go one step further and special case in mm_get_huge_zero_folio()
>> + mm_put_huge_zero_folio() on CONFIG_STATIC_HUGE_ZERO_FOLIO.
>>
>
> Hmm, but we could have also failed to allocate even though the option
> was enabled.

Then we return huge_zero_folio, which is NULL?

Or what are you concerned about?

-- 
Cheers,

David / dhildenb
On Wed, Aug 06, 2025 at 02:24:28PM +0200, David Hildenbrand wrote:
> On 06.08.25 14:18, Pankaj Raghav (Samsung) wrote:
> > > We could go one step further and special case in
> > > mm_get_huge_zero_folio() + mm_put_huge_zero_folio() on
> > > CONFIG_STATIC_HUGE_ZERO_FOLIO.
> >
> > Hmm, but we could have also failed to allocate even though the option
> > was enabled.
>
> Then we return huge_zero_folio, which is NULL?
>
> Or what are you concerned about?

But don't we want to keep the "dynamic" allocation part present even
though we failed to allocate it statically in shrinker_init?

Mainly so that the existing users of mm_get_huge_zero_folio() are not
affected by these changes.

> --
> Cheers,
>
> David / dhildenb
>
On 06.08.25 14:28, Pankaj Raghav (Samsung) wrote:
> On Wed, Aug 06, 2025 at 02:24:28PM +0200, David Hildenbrand wrote:
>> On 06.08.25 14:18, Pankaj Raghav (Samsung) wrote:
<snip>
>
> But don't we want to keep the "dynamic" allocation part present even
> though we failed to allocate it statically in shrinker_init?
>
> Mainly so that the existing users of mm_get_huge_zero_folio() are not
> affected by these changes.

I would just keep it simple and say that if we fail the early allocation
(which will be extremely unlikely that early during boot!), just don't
ever try to reallocate, not even when we could through
mm_get_huge_zero_folio().

That sounds as simple as it gets. Again, failing to allocate that early
and then succeeding to allocate later is a fairly unlikely scenario.

-- 
Cheers,

David / dhildenb
On Wed, Aug 06, 2025 at 02:36:51PM +0200, David Hildenbrand wrote:
> On 06.08.25 14:28, Pankaj Raghav (Samsung) wrote:
<snip>
>
> I would just keep it simple and say that if we fail the early allocation
> (which will be extremely unlikely that early during boot!), just don't
> ever try to reallocate, not even when we could through
> mm_get_huge_zero_folio().
>
> That sounds as simple as it gets. Again, failing to allocate that early
> and then succeeding to allocate later is a fairly unlikely scenario.

Ok. I will also document this as a comment just so that people are aware
of this behaviour.

Thanks a lot David for the comments and feedback!

-- Pankaj Raghav
On 06.08.25 14:43, Pankaj Raghav (Samsung) wrote:
<snip>
> Ok. I will also document this as a comment just so that people are aware
> of this behaviour.
>
> Thanks a lot David for the comments and feedback!

Sure, as always, feel free to object if you think I am talking nonsense :)

-- 
Cheers,

David / dhildenb
On 8/4/25 05:13, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
>
> There are many places in the kernel where we need to zero out larger
> chunks, but the maximum segment we can zero out at a time with ZERO_PAGE
> is limited to PAGE_SIZE.
...

In x86-land, the rules are pretty clear about using imperative voice.
There are quite a few "we's" in the changelog and comments in this series.

I do think they're generally good to avoid and do lead to more clarity,
but I'm also not sure how important that is in mm-land these days.

> +static inline struct folio *get_static_huge_zero_folio(void)
> +{
> +	if (!IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO))
> +		return NULL;
> +
> +	if (likely(atomic_read(&huge_zero_folio_is_static)))
> +		return huge_zero_folio;
> +
> +	return __get_static_huge_zero_folio();
> +}

This seems like an ideal place to use 'struct static_key'.

<snip>

> +config STATIC_HUGE_ZERO_FOLIO
> +	bool "Allocate a PMD sized folio for zeroing"
> +	depends on ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO && TRANSPARENT_HUGEPAGE
> +	help
<snip>

IMNHO, this is written like a changelog, not documentation for end users
trying to make sense of Kconfig options. I'd suggest keeping it short and
sweet:

config PERSISTENT_HUGE_ZERO_FOLIO
	bool "Allocate a persistent PMD-sized folio for zeroing"
	...
	help
	  Enable this option to reduce the runtime refcounting overhead
	  of the huge zero folio and expand the places in the kernel
	  that can use huge zero folios.

	  With this option enabled, the huge zero folio is allocated
	  once and never freed. It potentially wastes one huge page
	  worth of memory.

	  Say Y if your system has lots of memory. Say N if you are
	  memory constrained.

> +	/*
> +	 * If we failed to allocate a huge zero folio, just refrain from
> +	 * trying for one minute before retrying to get a reference again.
> +	 */
> +	if (atomic_read(&huge_zero_static_fail_count) > 1) {
> +		if (time_before(jiffies, fail_count_clear_timer))
> +			return NULL;
> +		atomic_set(&huge_zero_static_fail_count, 0);
> +	}

Any reason that this is an open-coded ratelimit instead of using 'struct
ratelimit_state'?

I also find the 'huge_zero_static_fail_count' use pretty unintuitive. This
is fundamentally a slow path. Ideally, it's called once. In the
pathological case, it's called once a minute.

I'd probably just recommend putting a rate limit on this function, then
using a plain old mutex for the actual allocation to keep multiple threads
out. Then the function becomes something like this:

	if (!__ratelimit(&huge_zero_alloc_ratelimit))
		return NULL;

	guard(mutex)(&huge_zero_mutex);

	if (!get_huge_zero_folio())
		return NULL;

	static_key_enable(&huge_zero_noref_key);

	return huge_zero_folio;

No atomic, no cmpxchg, no races on allocating.

> @@ -277,7 +317,11 @@ static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
>  		struct shrink_control *sc)
>  {
>  	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
> -		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
> +		struct folio *zero_folio;
> +
> +		if (WARN_ON_ONCE(atomic_read(&huge_zero_folio_is_static)))
> +			return 0;
> +		zero_folio = xchg(&huge_zero_folio, NULL);
>  		BUG_ON(zero_folio == NULL);
>  		WRITE_ONCE(huge_zero_pfn, ~0UL);
>  		folio_put(zero_folio);

This seems like a hack to me. If you don't want the shrinker to run, then
deregister it. Keeping the refcount elevated is fine, but repeatedly
calling the shrinker to do atomic_cmpxchg() when you *know* it will do
nothing seems silly.

If you can't deregister the shrinker, at least use the static_key approach
and check the static key instead of doing futile cmpxchg's forever.
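
(Editorial aside: a self-contained version of the sketch above;
illustrative only. The ratelimit interval, mutex, and key names are made
up, and note that __ratelimit() returns nonzero when the action is
allowed; DEFINE_RATELIMIT_STATE(), guard(mutex) and the static-branch
helpers are the stock kernel primitives.)

	static DEFINE_RATELIMIT_STATE(huge_zero_alloc_ratelimit, 60 * HZ, 1);
	static DEFINE_MUTEX(huge_zero_mutex);
	static DEFINE_STATIC_KEY_FALSE(huge_zero_noref_key);

	struct folio *__get_static_huge_zero_folio(void)
	{
		/* At most one allocation attempt per interval. */
		if (!__ratelimit(&huge_zero_alloc_ratelimit))
			return NULL;

		/* Serialize allocators; the mutex is dropped on return. */
		guard(mutex)(&huge_zero_mutex);

		if (static_branch_likely(&huge_zero_noref_key))
			return huge_zero_folio;

		if (!get_huge_zero_folio())
			return NULL;

		static_branch_enable(&huge_zero_noref_key);
		return huge_zero_folio;
	}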
On Tue, Aug 05, 2025 at 09:33:10AM -0700, Dave Hansen wrote:
> On 8/4/25 05:13, Pankaj Raghav (Samsung) wrote:
> > From: Pankaj Raghav <p.raghav@samsung.com>
> >
> > There are many places in the kernel where we need to zeroout larger
> > chunks but the maximum segment we can zeroout at a time by ZERO_PAGE
> > is limited by PAGE_SIZE.
> ...
>
> In x86-land, the rules are pretty clear about using imperative voice.
> There are quite a few "we's" in the changelog and comments in this series.
>
> I do think they're generally good to avoid and do lead to more clarity,
> but I'm also not sure how important that is in mm-land these days.

Yeah, I will change it to imperative to stay consistent.

<snip>

> >  static inline int split_folio_to_list_to_order(struct folio *folio,
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index e443fe8cd6cf..366a6d2d771e 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -823,6 +823,27 @@ config ARCH_WANT_GENERAL_HUGETLB
> >  config ARCH_WANTS_THP_SWAP
> >  	def_bool n
> >
> > +config ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO
> > +	def_bool n
> > +
> > +config STATIC_HUGE_ZERO_FOLIO
> > +	bool "Allocate a PMD sized folio for zeroing"
> > +	depends on ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO && TRANSPARENT_HUGEPAGE
> > +	help
> > +	  Without this config enabled, the huge zero folio is allocated on
> > +	  demand and freed under memory pressure once no longer in use.
> > +	  To detect remaining users reliably, references to the huge zero folio
> > +	  must be tracked precisely, so it is commonly only available for mapping
> > +	  it into user page tables.
> > +
> > +	  With this config enabled, the huge zero folio can also be used
> > +	  for other purposes that do not implement precise reference counting:
> > +	  it is still allocated on demand, but never freed, allowing for more
> > +	  wide-spread use, for example, when performing I/O similar to the
> > +	  traditional shared zeropage.
> > +
> > +	  Not suitable for memory constrained systems.
>
> IMNHO, this is written like a changelog, not documentation for end users
> trying to make sense of Kconfig options. I'd suggest keeping it short
> and sweet:
>
> config PERSISTENT_HUGE_ZERO_FOLIO
> 	bool "Allocate a persistent PMD-sized folio for zeroing"
> 	...
> 	help
> 	  Enable this option to reduce the runtime refcounting overhead
> 	  of the huge zero folio and expand the places in the kernel
> 	  that can use huge zero folios.
>
> 	  With this option enabled, the huge zero folio is allocated
> 	  once and never freed. It potentially wastes one huge page
> 	  worth of memory.
>
> 	  Say Y if your system has lots of memory. Say N if you are
> 	  memory constrained.
>

This looks short and to the point. I can fold this in the next version.

Thanks.
> >  config MM_ID
> >  	def_bool n
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index ff06dee213eb..e117b280b38d 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -75,6 +75,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >  static bool split_underused_thp = true;
> >
> >  static atomic_t huge_zero_refcount;
> > +atomic_t huge_zero_folio_is_static __read_mostly;
> >  struct folio *huge_zero_folio __read_mostly;
> >  unsigned long huge_zero_pfn __read_mostly = ~0UL;
> >  unsigned long huge_anon_orders_always __read_mostly;
> > @@ -266,6 +267,45 @@ void mm_put_huge_zero_folio(struct mm_struct *mm)
> >  	put_huge_zero_folio();
> >  }
> >
> > +#ifdef CONFIG_STATIC_HUGE_ZERO_FOLIO
> > +
> > +struct folio *__get_static_huge_zero_folio(void)
> > +{
> > +	static unsigned long fail_count_clear_timer;
> > +	static atomic_t huge_zero_static_fail_count __read_mostly;
> > +
> > +	if (unlikely(!slab_is_available()))
> > +		return NULL;
> > +
> > +	/*
> > +	 * If we failed to allocate a huge zero folio, just refrain from
> > +	 * trying for one minute before retrying to get a reference again.
> > +	 */
> > +	if (atomic_read(&huge_zero_static_fail_count) > 1) {
> > +		if (time_before(jiffies, fail_count_clear_timer))
> > +			return NULL;
> > +		atomic_set(&huge_zero_static_fail_count, 0);
> > +	}
>
> Any reason that this is an open-coded ratelimit instead of using
> 'struct ratelimit_state'?
>
> I also find the 'huge_zero_static_fail_count' use pretty unintuitive.
> This is fundamentally a slow path. Ideally, it's called once. In the
> pathological case, it's called once a minute.
>
> I'd probably just recommend putting a rate limit on this function, then
> using a plain old mutex for the actual allocation to keep multiple
> threads out.
>
> Then the function becomes something like this:
>
> 	if (!__ratelimit(&huge_zero_alloc_ratelimit))
> 		return NULL;
>
> 	guard(mutex)(&huge_zero_mutex);
>
> 	if (!get_huge_zero_folio())
> 		return NULL;
>
> 	static_key_enable(&huge_zero_noref_key);
>
> 	return huge_zero_folio;
>
> No atomic, no cmpxchg, no races on allocating.

David already reworked this part based on Lorenzo's feedback (like you,
he did not like the ratelimiting part). The reworked diff is here[1].
No ratelimiting, etc.

>
> ...
>
> >  static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
> >  			struct shrink_control *sc)
> >  {
> > @@ -277,7 +317,11 @@ static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
> >  				       struct shrink_control *sc)
> >  {
> >  	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
> > -		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
> > +		struct folio *zero_folio;
> > +
> > +		if (WARN_ON_ONCE(atomic_read(&huge_zero_folio_is_static)))
> > +			return 0;
> > +		zero_folio = xchg(&huge_zero_folio, NULL);
> >  		BUG_ON(zero_folio == NULL);
> >  		WRITE_ONCE(huge_zero_pfn, ~0UL);
> >  		folio_put(zero_folio);
>
> This seems like a hack to me. If you don't want the shrinker to run,
> then deregister it. Keeping the refcount elevated is fine, but
> repeatedly calling the shrinker to do atomic_cmpxchg() when you *know*
> it will do nothing seems silly.

The new version[1] deregisters instead of having this condition. :)

> If you can't deregister the shrinker, at least use the static_key
> approach and check the static key instead of doing futile cmpxchg's forever.

--
Pankaj

[1] https://lore.kernel.org/linux-mm/70049abc-bf79-4d04-a0a8-dd3787195986@redhat.com/
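
As a footnote on the deregistration point: with the dynamic shrinker API in
current kernels, tearing the shrinker down once the folio goes static is
short. A sketch only, assuming the shrinker was created with
shrinker_alloc()/shrinker_register() and is reachable through a pointer (the
variable and function names here are hypothetical, not from the reworked
diff):

	/*
	 * Once the huge zero folio is permanent, the shrinker can never free
	 * it, so unregister it rather than leave it attempting futile
	 * cmpxchg() calls on every scan.
	 */
	static void huge_zero_folio_retire_shrinker(void)
	{
		/* Hold one reference forever so the refcount never drops to zero. */
		atomic_inc(&huge_zero_refcount);

		/* shrinker_free() unregisters and frees a shrinker_alloc()'d shrinker. */
		shrinker_free(huge_zero_folio_shrinker);
		huge_zero_folio_shrinker = NULL;
	}
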